In [1]:
import numpy as np
import pandas as pd
import sklearn

In [2]:
names = ['class','id','sequence']

data = pd.read_csv("promoters.csv", names=names)

In [3]:
print(data.iloc[0])

class                                                       +
id                                                        S10
sequence    \t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
Name: 0, dtype: object


In [4]:
# Get the classes
classes = data.loc[:,'class']
print(classes[:5])

0    +
1    +
2    +
3    +
4    +
Name: class, dtype: object


In [6]:
# Get the sequences
sequences = list(data.loc[:,'sequence'])
print(sequences[:2])

['\t\ttactagcaatacgcttgcgttcggtggttaagtatgtataatgcgcgggcttgtcgt', '\t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaatctaacgcatcgccaa']


In [8]:
dataset = {}

for i,seq in enumerate(sequences):
    
    #split into nucleotides and remove tab
    nucleotides = list(seq)
    nucleotides = [x for x in nucleotides if x != '\t']
    
    nucleotides.append(classes[i])
    
    dataset[i] = nucleotides

print(dataset[0])

['t', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 't', 'a', 'c', 'g', 'c', 't', 't', 'g', 'c', 'g', 't', 't', 'c', 'g', 'g', 't', 'g', 'g', 't', 't', 'a', 'a', 'g', 't', 'a', 't', 'g', 't', 'a', 't', 'a', 'a', 't', 'g', 'c', 'g', 'c', 'g', 'g', 'g', 'c', 't', 't', 'g', 't', 'c', 'g', 't', '+']


In [9]:
# Convert dataset to dataframe
dframe = pd.DataFrame(dataset)
print(dframe)

   0   1   2   3   4   5   6   7   8   9    ... 96  97  98  99  100 101 102  \
0    t   t   g   a   t   a   c   t   c   t  ...   c   c   t   a   g   c   g   
1    a   g   t   a   c   g   a   t   g   t  ...   c   g   a   g   a   c   t   
2    c   c   a   t   g   g   g   t   a   t  ...   g   c   t   a   g   t   a   
3    t   t   c   t   a   g   g   c   c   t  ...   a   t   g   g   a   c   t   
4    a   a   t   g   t   g   g   t   t   a  ...   g   a   a   g   g   a   t   
5    g   t   a   t   a   c   g   a   t   a  ...   t   g   c   g   c   a   c   
6    c   c   g   g   a   a   g   c   a   a  ...   a   g   c   t   a   t   t   
7    a   c   a   a   t   a   t   a   a   t  ...   g   a   g   g   t   g   c   
8    a   t   g   t   t   g   g   a   t   t  ...   a   c   a   t   g   g   a   
9    t   g   a   g   a   g   g   a   a   t  ...   c   t   a   a   t   c   a   
10   a   a   a   t   a   a   a   a   t   c  ...   c   t   c   c   c   c   c   
11   c   c   c   g   c   g   g   c   a   c  ...   c 

In [10]:
# Transpose Dataframe
dframe = dframe.transpose()
print(dframe)

    0  1  2  3  4  5  6  7  8  9   ... 48 49 50 51 52 53 54 55 56 57
0    t  a  c  t  a  g  c  a  a  t  ...  g  c  t  t  g  t  c  g  t  +
1    t  g  c  t  a  t  c  c  t  g  ...  c  a  t  c  g  c  c  a  a  +
2    g  t  a  c  t  a  g  a  g  a  ...  c  a  c  c  c  g  g  c  g  +
3    a  a  t  t  g  t  g  a  t  g  ...  a  a  c  a  a  a  c  t  c  +
4    t  c  g  a  t  a  a  t  t  a  ...  c  c  g  t  g  g  t  a  g  +
..  .. .. .. .. .. .. .. .. .. ..  ... .. .. .. .. .. .. .. .. .. ..
101  c  c  t  c  a  a  t  g  g  c  ...  g  a  a  c  t  a  t  a  t  -
102  g  t  a  t  t  c  t  c  a  a  ...  t  c  a  a  c  a  t  t  g  -
103  c  g  c  g  a  c  t  a  c  g  ...  a  a  g  g  c  t  t  c  c  -
104  c  t  c  g  t  c  c  t  c  a  ...  a  g  g  a  g  g  a  a  c  -
105  t  a  a  c  a  t  t  a  a  t  ...  t  c  a  a  g  a  a  c  t  -

[106 rows x 58 columns]


In [11]:
# Rename last column to class
dframe.rename(columns = {57 : 'class'}, inplace = True)

print(dframe[:2])

   0  1  2  3  4  5  6  7  8  9  ... 48 49 50 51 52 53 54 55 56 class
0  t  a  c  t  a  g  c  a  a  t  ...  g  c  t  t  g  t  c  g  t     +
1  t  g  c  t  a  t  c  c  t  g  ...  c  a  t  c  g  c  c  a  a     +

[2 rows x 58 columns]


In [12]:
# Check dataset
dframe.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,class
count,106,106,106,106,106,106,106,106,106,106,...,106,106,106,106,106,106,106,106,106,106
unique,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,2
top,t,a,a,c,a,a,a,a,a,a,...,c,c,c,t,t,c,c,c,t,+
freq,38,34,30,30,36,42,38,34,33,36,...,36,42,31,33,35,32,29,29,34,53


In [14]:
# Record value counts for each column

series = []

for name in dframe.columns:
    series.append(dframe[name].value_counts())

info = pd.DataFrame(series)
info = info.transpose()
print(info)

      0     1     2     3     4     5     6     7     8     9  ...    48  \
t  38.0  26.0  27.0  26.0  22.0  24.0  30.0  32.0  32.0  28.0  ...  21.0   
c  27.0  22.0  21.0  30.0  19.0  18.0  21.0  20.0  22.0  22.0  ...  36.0   
a  26.0  34.0  30.0  22.0  36.0  42.0  38.0  34.0  33.0  36.0  ...  23.0   
g  15.0  24.0  28.0  28.0  29.0  22.0  17.0  20.0  19.0  20.0  ...  26.0   
+   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   
-   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   

     49    50    51    52    53    54    55    56  class  
t  22.0  23.0  33.0  35.0  30.0  23.0  29.0  34.0    NaN  
c  42.0  31.0  32.0  21.0  32.0  29.0  29.0  17.0    NaN  
a  24.0  28.0  27.0  25.0  22.0  26.0  24.0  27.0    NaN  
g  18.0  24.0  14.0  25.0  22.0  28.0  24.0  28.0    NaN  
+   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   53.0  
-   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   53.0  

[6 rows x 58 columns]


In [15]:
# Switch to numerical data using pandas get dummies

num_data = pd.get_dummies(dframe)
print(num_data[:2])

   0_a  0_c  0_g  0_t  1_a  1_c  1_g  1_t  2_a  2_c  ...  55_a  55_c  55_g  \
0    0    0    0    1    1    0    0    0    0    1  ...     0     0     1   
1    0    0    0    1    0    0    1    0    0    1  ...     1     0     0   

   55_t  56_a  56_c  56_g  56_t  class_+  class_-  
0     0     0     0     0     1        1        0  
1     0     1     0     0     0        1        0  

[2 rows x 230 columns]


In [18]:
# Remove one of class columns and rename to class

df = num_data.drop(columns = ['class_-'])
df.rename(columns = {'class_+':'class'}, inplace=True)

print(df)

     0_a  0_c  0_g  0_t  1_a  1_c  1_g  1_t  2_a  2_c  ...  54_t  55_a  55_c  \
0      0    0    0    1    1    0    0    0    0    1  ...     0     0     0   
1      0    0    0    1    0    0    1    0    0    1  ...     0     1     0   
2      0    0    1    0    0    0    0    1    1    0  ...     0     0     1   
3      1    0    0    0    1    0    0    0    0    0  ...     0     0     0   
4      0    0    0    1    0    1    0    0    0    0  ...     1     1     0   
..   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   ...   ...   ...   
101    0    1    0    0    0    1    0    0    0    0  ...     1     1     0   
102    0    0    1    0    0    0    0    1    1    0  ...     1     0     0   
103    0    1    0    0    0    0    1    0    0    1  ...     1     0     1   
104    0    1    0    0    0    0    0    1    0    1  ...     0     1     0   
105    0    0    0    1    1    0    0    0    1    0  ...     0     0     1   

     55_g  55_t  56_a  56_c  56_g  56_t

In [20]:
# Import Algorithms

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

from sklearn.model_selection import train_test_split
X = np.array(df.drop(['class'],1))
Y = np.array(df['class'])

# Define seed for reproducability
seed = 1

# Split data

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = seed)

  X = np.array(df.drop(['class'],1))


In [33]:
# Define scoring method
scoring = 'accuracy'

# Define models
names = ['KNN', 'MLP', 'Gaussian Process', 'Decision Tree', 'Random Forest', 'Ada Boost', 'Naive Bayes', 'SVM Linear', 'SVM RBF', 'SVM Sigmoid']

classifiers = [
    KNeighborsClassifier(n_neighbors = 3),
    MLPClassifier(alpha = 1),
    GaussianProcessClassifier(1.0*RBF(1.0)),
    DecisionTreeClassifier(max_depth = 5),
    RandomForestClassifier(max_depth = 5, n_estimators = 10, max_features = 1),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel = 'linear'),
    SVC(kernel = 'rbf'),
    SVC(kernel = 'sigmoid')
]

models = zip(names,classifiers)

from sklearn.model_selection import KFold, cross_val_score
# Evaluate the results in each model

names = []
results = []

for name,model in models:
    kfold = KFold(n_splits = 10)
    cv_results = cross_val_score(model, X_train, Y_train, cv = kfold, scoring = scoring)
    results.append(cv_results)
    names.append(name)
    msg = "{0}: {1} ({2})".format(name,cv_results.mean(),cv_results.std())
    print(msg)

KNN: 0.8375 (0.12562344526401112)




MLP: 0.8875 (0.08750000000000001)
Gaussian Process: 0.8732142857142857 (0.05615780426255853)
Decision Tree: 0.7482142857142857 (0.17559120253635507)
Random Forest: 0.675 (0.225)
Ada Boost: 0.925 (0.11456439237389601)
Naive Bayes: 0.8375 (0.1375)
SVM Linear: 0.85 (0.10897247358851683)
SVM RBF: 0.8875 (0.0673145600891813)
SVM Sigmoid: 0.9 (0.09354143466934853)


In [42]:
# Test the algorithm on train dataset
i = 0

for model in classifiers:
    model.fit(X_train,Y_train)
    predictions = model.predict(X_test)
    print(names[i])
    print(accuracy_score(Y_test,predictions))
    print(classification_report(Y_test,predictions))
    i = i+1

KNN
0.7777777777777778
              precision    recall  f1-score   support

           0       1.00      0.65      0.79        17
           1       0.62      1.00      0.77        10

    accuracy                           0.78        27
   macro avg       0.81      0.82      0.78        27
weighted avg       0.86      0.78      0.78        27





MLP
0.8888888888888888
              precision    recall  f1-score   support

           0       1.00      0.82      0.90        17
           1       0.77      1.00      0.87        10

    accuracy                           0.89        27
   macro avg       0.88      0.91      0.89        27
weighted avg       0.91      0.89      0.89        27

Gaussian Process
0.8888888888888888
              precision    recall  f1-score   support

           0       1.00      0.82      0.90        17
           1       0.77      1.00      0.87        10

    accuracy                           0.89        27
   macro avg       0.88      0.91      0.89        27
weighted avg       0.91      0.89      0.89        27

Decision Tree
0.7037037037037037
              precision    recall  f1-score   support

           0       0.91      0.59      0.71        17
           1       0.56      0.90      0.69        10

    accuracy                           0.70        27
   macro avg       0.74      0.74   