In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import re
import nltk
import math
import pickle

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.linear_model import Perceptron

from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from Bio.SeqUtils.ProtParam import ProteinAnalysis
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score, cross_val_predict

%matplotlib inline

In [2]:
# read in data from csv file
df = pd.read_csv('proteins-5-functions.csv')

df.head()

Unnamed: 0,classification,name,sequence,structureId,type
0,TRANSFERASE,,MEIYEGKLTAEGLRFGIVASRFNHALVDRLVEGAIDCIVRHGGREE...,5MPP,4
1,HYDROLASE,,MKFTLTIAGLLAVGSTAAPTTEKRNPGGIDYVQNYNGDVADFQYNE...,3M4F,0
2,TRANSFERASE,,MRGSHHHHHHGSMKRAVITGLGIVSSIGNNQQEVLASLREGRSGIT...,2BYY,4
3,HYDROLASE,,STGSATTTPIDSLDDAYITPVQIGTPAQTLNLDFDTGSSDLWVFSS...,4YCY,0
4,TRANSFERASE,,GSGMMRYLHKIELELNRLTSRYPFFKKIAFDAEIIKLVDDLNVDEN...,3AQC,4


In [3]:
df.columns

Index(['classification', 'name', 'sequence', 'structureId', 'type'], dtype='object')

In [4]:
df.shape

(104956, 5)

## 5 Classes of Proteins

In [5]:
df.classification.value_counts()

TRANSFERASE      35880
LIGASE           33967
ISOMERASE        16988
HYDROLASE        12664
IMMUNE SYSTEM     5457
Name: classification, dtype: int64

In [6]:
df.type.value_counts()

4    35880
3    33967
2    16988
0    12664
1     5457
Name: type, dtype: int64

## Remove Duplicate Sequences

In [7]:
# remove the duplicate protein sequences
#df = df.drop_duplicates(subset='sequence', keep="first")

# remove nan from 'sequence' column
#df = df[df['sequence'].notnull()]
#df.head()

In [8]:
#df_3.shape

## Classifier of Protein Sequences: Predict 5 Classes

## CountVectorizer: Count peptide frequency, transform the data

In [9]:
# In this case, peptide frequency is used for analysis
peptide_size = 6
vect_ = CountVectorizer(min_df=1,token_pattern=r'\w{1}',ngram_range=(peptide_size,peptide_size))

## Split the data into training & test sets for classification model

In [10]:
X = vect_.fit_transform(df.sequence)
y = df.type

In [11]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state =42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(83964, 14277858) (83964,)
(20992, 14277858) (20992,)


In [12]:
y_test.value_counts()

4    7121
3    6732
2    3480
0    2590
1    1069
Name: type, dtype: int64

## Classification Models

## MNB Classifier Model

In [13]:
%%time
mnb = MultinomialNB(alpha=50)
mnb.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("MNB classifier Score: {:.2f}".format(mnb.score(X_test, y_test)))
print()

MNB classifier Score: 0.90

CPU times: user 5.75 s, sys: 2.4 s, total: 8.16 s
Wall time: 8.77 s


In [14]:
%%time
pac = PassiveAggressiveClassifier()
pac.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("Passive Aggressive classifier Score: {:.2f}".format(pac.score(X_test, y_test)))
print()

Passive Aggressive classifier Score: 0.91

CPU times: user 44.4 s, sys: 1.66 s, total: 46.1 s
Wall time: 45.8 s


In [15]:
%%time
pac2 = PassiveAggressiveClassifier(loss='squared_hinge')
pac2.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("Passive Aggressive classifier with squared hinge loss Score: {:.2f}".format(pac2.score(X_test, y_test)))
print()

Passive Aggressive classifier with squared hinge loss Score: 0.88

CPU times: user 39.9 s, sys: 1.46 s, total: 41.4 s
Wall time: 40.9 s


In [16]:
%%time
sgd = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
sgd.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("SGD classifier Score: {:.2f}".format(sgd.score(X_test, y_test)))
print()

SGD classifier Score: 0.93

CPU times: user 53.7 s, sys: 1.19 s, total: 54.9 s
Wall time: 53.9 s


In [17]:
%%time
sgd2 = linear_model.SGDClassifier(loss= 'log',max_iter=1000, tol=1e-3)
sgd2.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("SGD classifier Score: {:.2f}".format(sgd2.score(X_test, y_test)))
print()

SGD classifier Score: 0.90

CPU times: user 1min 2s, sys: 1.41 s, total: 1min 3s
Wall time: 1min 2s


In [18]:
%%time
per = Perceptron(tol=1e-3, random_state=0).fit(X_train,y_train)
# evaluate accuracy of our model on test data
print("Perceptron classifier Score: {:.2f}".format(per.score(X_test, y_test)))
print()

Perceptron classifier Score: 0.92

CPU times: user 27.6 s, sys: 1.48 s, total: 29.1 s
Wall time: 28.4 s


In [19]:
# Generate Confusion Matrix for Perceptron Model
actual = y_test
predictions = per.predict(X_test)
print('Confusion Matrix for Perceptron Model')
print()
cm = confusion_matrix(actual,predictions)
print(cm)

Confusion Matrix for Perceptron Model

[[2230   53   30   69  208]
 [ 116  892    4   11   46]
 [  45   18 3304   34   79]
 [ 106   44   38 6337  207]
 [ 223   90   49  167 6592]]


In [20]:
#%%time
#rdg = RidgeClassifier().fit(X_train, y_train)
# evaluate accuracy of our model on test data
#print("Ridge classifier Score: {:.2f}".format(rdg.score(X_test, y_test)))
#print()

In [21]:
%%time
clf1 = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf2 = PassiveAggressiveClassifier(C=0.1)
#clf3 = MultinomialNB(alpha=50)

eclf1 = VotingClassifier(estimators=[('sgd',clf1), ('pac',clf2)], voting='hard')
eclf1.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("Majority Voting Classifier Score: {:.2f}".format(eclf1.score(X_test, y_test)))
print()

Majority Voting Classifier Score: 0.92

CPU times: user 1min 37s, sys: 4.45 s, total: 1min 42s
Wall time: 1min 45s


In [22]:
# Generate Confusion Matrix for Voting Model
actual = y_test
predictions = eclf1.predict(X_test)
print('Confusion Matrix for Voting Model')
print()
cm = confusion_matrix(actual,predictions)
print(cm)

Confusion Matrix for Voting Model

[[2139  131    4   21  295]
 [  36  996    2    3   32]
 [  41   31 3253   18  137]
 [ 100   87    5 6269  271]
 [ 138  144   11   71 6757]]


In [23]:
%%time
clf1a = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf2a = PassiveAggressiveClassifier(C=0.1)
clf3a = MultinomialNB(alpha=50)

eclf2 = VotingClassifier(estimators=[('sgd', clf1a), ('pac', clf2a), ('mnb', clf3a)], voting='hard')
eclf2.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("Majority Voting Classifier Score: {:.2f}".format(eclf2.score(X_test, y_test)))
print()

Majority Voting Classifier Score: 0.92

CPU times: user 1min 35s, sys: 6.81 s, total: 1min 42s
Wall time: 1min 41s


In [24]:
# Generate Confusion Matrix for Voting Model
actual = y_test
predictions = eclf2.predict(X_test)
print('Confusion Matrix for Voting Model')
print()
cm = confusion_matrix(actual,predictions)
print(cm)

Confusion Matrix for Voting Model

[[2069   49    5   16  451]
 [  36  907    1    6  119]
 [  23   19 3246   15  177]
 [  99   69   19 6201  344]
 [  90   54   10   47 6920]]


In [25]:
%%time
clf1b = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf2b = PassiveAggressiveClassifier(C=0.1)
clf3b = linear_model.SGDClassifier(loss= 'log',max_iter=1000, tol=1e-3)

eclf3 = VotingClassifier(estimators=[('sgd', clf1b), ('pac', clf2b), ('sgd_lg', clf3b)], voting='hard')
eclf3.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("Majority Voting Classifier Score: {:.2f}".format(eclf3.score(X_test, y_test)))
print()

Majority Voting Classifier Score: 0.92

CPU times: user 2min 35s, sys: 3.4 s, total: 2min 38s
Wall time: 2min 36s


In [26]:
%%time
clf1c = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf2c = PassiveAggressiveClassifier(C=0.1)
clf3c = Perceptron(tol=1e-3, random_state=0)

eclf4 = VotingClassifier(estimators=[('sgd', clf1c), ('pac', clf2c), ('per', clf3c)], voting='hard')
eclf4.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("Majority Voting Classifier Score: {:.2f}".format(eclf4.score(X_test, y_test)))
print()

Majority Voting Classifier Score: 0.93

CPU times: user 2min 1s, sys: 4.46 s, total: 2min 5s
Wall time: 2min 6s


In [27]:
vote_prediction4 = eclf4.predict(X_test)
print(classification_report(vote_prediction4, y_test))

              precision    recall  f1-score   support

           0       0.85      0.84      0.84      2625
           1       0.84      0.89      0.86      1019
           2       0.94      0.99      0.96      3292
           3       0.93      0.98      0.96      6373
           4       0.96      0.89      0.93      7683

    accuracy                           0.93     20992
   macro avg       0.90      0.92      0.91     20992
weighted avg       0.93      0.93      0.93     20992



In [28]:
%%time
clf1d = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf2d = PassiveAggressiveClassifier(C=0.1)
clf3d = Perceptron(tol=1e-3, random_state=0)
clf4d = MultinomialNB(alpha=50)

eclf5 = VotingClassifier(estimators=[('sgd', clf1d), ('pac', clf2d), ('per', clf3d),('mnb',clf4d)], voting='hard')
eclf5.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("Majority Voting Classifier Score: {:.2f}".format(eclf4.score(X_test, y_test)))
print()

Majority Voting Classifier Score: 0.93

CPU times: user 2min 8s, sys: 12 s, total: 2min 20s
Wall time: 2min 26s


In [29]:
vote_prediction5 = eclf5.predict(X_test)
print(classification_report(vote_prediction5, y_test))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86      2445
           1       0.84      0.89      0.87      1014
           2       0.95      0.98      0.97      3352
           3       0.94      0.98      0.96      6464
           4       0.96      0.89      0.92      7717

    accuracy                           0.93     20992
   macro avg       0.91      0.92      0.91     20992
weighted avg       0.93      0.93      0.93     20992



## Cross Validation of Models

In [30]:
clf1e = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf2e = PassiveAggressiveClassifier(C=0.1)

eclf_cv = VotingClassifier(estimators=[('sgd',clf1e), ('pac',clf2e)], voting='hard')
scores = cross_val_score(eclf_cv,X,y, cv = 5)
print("Cross-validation scores for Voting Classifier: {}".format(scores))
print()
print("The average accuracy score for Voting Classifier is: ")
print(np.mean(scores))

Cross-validation scores for Voting Classifier: [ 0.77192397  0.91520983  0.94492878  0.94287756  0.91443137]

The average accuracy score for Voting Classifier is: 
0.897874303013


In [31]:
clf1f = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf2f = PassiveAggressiveClassifier(C=0.1)
clf3f = Perceptron(tol=1e-3, random_state=0)

eclf_cv2 = VotingClassifier(estimators=[('sgd',clf1f), ('pac',clf2f),('per', clf3f)], voting='hard')
scores = cross_val_score(eclf_cv2,X,y, cv = 5)

print("Cross-validation scores for Voting Classifier: {}".format(scores))
print()
print("The average accuracy score for Voting Classifier is: ")
print(np.mean(scores))

Cross-validation scores for Voting Classifier: [ 0.79640833  0.92068785  0.94712019  0.94459266  0.91538425]

The average accuracy score for Voting Classifier is: 
0.90483865627


In [32]:
clf1g = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf2g = PassiveAggressiveClassifier(C=0.1)
clf3g = Perceptron(tol=1e-3, random_state=0)
clf4g = MultinomialNB(alpha=50)

eclf_cv3 = VotingClassifier(estimators=[('sgd',clf1g), ('pac',clf2g),('per', clf3g),('mnb',clf4g)], voting='hard')
scores3 = cross_val_score(eclf_cv3,X,y, cv = 5)

print("Cross-validation scores for Voting Classifier: {}".format(scores3))
print()
print("The average accuracy score for Voting Classifier is: ")
print(np.mean(scores3))

Cross-validation scores for Voting Classifier: [ 0.80212452  0.92630877  0.94607213  0.94244879  0.91428844]

The average accuracy score for Voting Classifier is: 
0.906248527074
