In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import re
import nltk
import math
import pickle

import xgboost

from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.linear_model import Perceptron

from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from Bio.SeqUtils.ProtParam import ProteinAnalysis
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score, cross_val_predict

%matplotlib inline

In [2]:
# read in data from csv file
df = pd.read_csv('proteins-5-functions.csv')

df.head()

Unnamed: 0,classification,name,sequence,structureId,type
0,TRANSFERASE,,MEIYEGKLTAEGLRFGIVASRFNHALVDRLVEGAIDCIVRHGGREE...,5MPP,4
1,HYDROLASE,,MKFTLTIAGLLAVGSTAAPTTEKRNPGGIDYVQNYNGDVADFQYNE...,3M4F,0
2,TRANSFERASE,,MRGSHHHHHHGSMKRAVITGLGIVSSIGNNQQEVLASLREGRSGIT...,2BYY,4
3,HYDROLASE,,STGSATTTPIDSLDDAYITPVQIGTPAQTLNLDFDTGSSDLWVFSS...,4YCY,0
4,TRANSFERASE,,GSGMMRYLHKIELELNRLTSRYPFFKKIAFDAEIIKLVDDLNVDEN...,3AQC,4


In [3]:
# read in more immune system sequences data from csv file
df_2 = pd.read_csv('uniprot-human-immune.csv')

df_2.head()

Unnamed: 0,name,sequence,classification,type
0,tr|Q6FGW4|Q6FGW4_HUMAN,MHSSALLCCLVLLTGVRASPGQGTQSENSCTHFPGNLPNMLRDLRD...,IMMUNE SYSTEM,1
1,tr|A0A024R5Z3|A0A024R5Z3_HUMAN,MSSILPFTPPIVKRLLGWKKGEQNGQEEKWCEKAVKSLVKKLKKTG...,IMMUNE SYSTEM,1
2,tr|A0A0B4Q6D0|A0A0B4Q6D0_9PLVG,MRVKGIRKNYQHLWRGGTLLLGMLMICSAVEKLWVTVYYGVPVWKE...,IMMUNE SYSTEM,1
3,tr|A0A0B4Q7W6|A0A0B4Q7W6_9PLVG,MRVKGIRKNYQHLWRGGTLLLGMLMICSAVEKLWVTVYYGVPVWKE...,IMMUNE SYSTEM,1
4,tr|A0A0B4Q6H6|A0A0B4Q6H6_9PLVG,MRVKGIRKNYQHLWRGGTLLLGMLMICSAVEKLWVTVHYGVPVWKE...,IMMUNE SYSTEM,1


In [4]:
df_2.shape

(84798, 4)

In [5]:
df3 = df_2.sample(frac=0.33)
df3.shape

(27983, 4)

In [6]:
df.columns

Index(['classification', 'name', 'sequence', 'structureId', 'type'], dtype='object')

In [7]:
df.shape

(104956, 5)

In [8]:
# concat the dataframes
df_ = pd.concat([df,df3])

df_.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  from ipykernel import kernelapp as app


Unnamed: 0,classification,name,sequence,structureId,type
0,TRANSFERASE,,MEIYEGKLTAEGLRFGIVASRFNHALVDRLVEGAIDCIVRHGGREE...,5MPP,4
1,HYDROLASE,,MKFTLTIAGLLAVGSTAAPTTEKRNPGGIDYVQNYNGDVADFQYNE...,3M4F,0
2,TRANSFERASE,,MRGSHHHHHHGSMKRAVITGLGIVSSIGNNQQEVLASLREGRSGIT...,2BYY,4
3,HYDROLASE,,STGSATTTPIDSLDDAYITPVQIGTPAQTLNLDFDTGSSDLWVFSS...,4YCY,0
4,TRANSFERASE,,GSGMMRYLHKIELELNRLTSRYPFFKKIAFDAEIIKLVDDLNVDEN...,3AQC,4


In [9]:
df_.shape

(132939, 5)

## Remove duplicate protein sequences from data

In [10]:
# remove the duplicate protein sequences
df_ = df_.drop_duplicates(subset='sequence', keep="first")

df_.shape

(132937, 5)

## 5 Classes of Proteins

In [11]:
df_.classification.value_counts()

TRANSFERASE      35880
LIGASE           33967
IMMUNE SYSTEM    33438
ISOMERASE        16988
HYDROLASE        12664
Name: classification, dtype: int64

In [12]:
df_.type.value_counts()

4    35880
3    33967
1    33438
2    16988
0    12664
Name: type, dtype: int64

## Subset the data: use fewer rows of data for quicker comparison of XGBoost with other models

In [13]:
df_new= df_.sample(frac = 1.0)

df_new.shape

(132937, 5)

## Classifier of Protein Sequences: Predict 5 Classes

## CountVectorizer: Count peptide frequency, transform the data

In [14]:
# In this case, peptide frequency is used for analysis
peptide_size = 5
vect_ = CountVectorizer(min_df=1,token_pattern=r'\w{1}',ngram_range=(peptide_size,peptide_size))

## Split the data into training & test sets for classification model

In [15]:
X = vect_.fit_transform(df_new.sequence)
y = df_new.type

In [16]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state =42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(106349, 2707148) (106349,)
(26588, 2707148) (26588,)


In [17]:
y_test.value_counts()

4    7117
3    6835
1    6715
2    3420
0    2501
Name: type, dtype: int64

## Classification Models

## MNB Classifier Model

In [18]:
%%time
mnb = MultinomialNB(alpha=50)
mnb.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("MNB classifier Score: {:.2f}".format(mnb.score(X_test, y_test)))
print()

MNB classifier Score: 0.84

CPU times: user 2.77 s, sys: 597 ms, total: 3.37 s
Wall time: 3.79 s


In [19]:
%%time
pac = PassiveAggressiveClassifier()
pac.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("Passive Aggressive classifier Score: {:.2f}".format(pac.score(X_test, y_test)))
print()

Passive Aggressive classifier Score: 0.93

CPU times: user 39.9 s, sys: 753 ms, total: 40.7 s
Wall time: 40 s


In [20]:
%%time
pac2 = PassiveAggressiveClassifier(loss='squared_hinge')
pac2.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("Passive Aggressive classifier with squared hinge loss Score: {:.2f}".format(pac2.score(X_test, y_test)))
print()

Passive Aggressive classifier with squared hinge loss Score: 0.93

CPU times: user 37.7 s, sys: 438 ms, total: 38.2 s
Wall time: 37.1 s


In [21]:
%%time
sgd = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
sgd.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("SGD classifier Score: {:.2f}".format(sgd.score(X_test, y_test)))
print()

SGD classifier Score: 0.93

CPU times: user 31.5 s, sys: 413 ms, total: 31.9 s
Wall time: 30.8 s


In [22]:
%%time
sgd2 = linear_model.SGDClassifier(loss= 'log',max_iter=1000, tol=1e-3)
sgd2.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("SGD classifier Score: {:.2f}".format(sgd2.score(X_test, y_test)))
print()

SGD classifier Score: 0.93

CPU times: user 1min 9s, sys: 747 ms, total: 1min 10s
Wall time: 1min 9s


In [23]:
%%time
per = Perceptron(tol=1e-3, random_state=0).fit(X_train,y_train)
# evaluate accuracy of our model on test data
print("Perceptron classifier Score: {:.2f}".format(per.score(X_test, y_test)))
print()

Perceptron classifier Score: 0.93

CPU times: user 38.7 s, sys: 480 ms, total: 39.1 s
Wall time: 38 s


In [24]:
# Generate Confusion Matrix for Perceptron Model
actual = y_test
predictions = per.predict(X_test)
print('Confusion Matrix for Perceptron Model')
print()
cm = confusion_matrix(actual,predictions)
print(cm)

Confusion Matrix for Perceptron Model

[[2078   30   62   90  241]
 [  92 6524   14   28   57]
 [  48    8 3235   37   92]
 [  87   16   48 6449  235]
 [ 218   32   95  270 6502]]


## Voting Models

In [26]:
%%time
clf1 = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf2 = PassiveAggressiveClassifier(C=0.1)
#clf3 = MultinomialNB(alpha=50)

eclf1 = VotingClassifier(estimators=[('sgd',clf1), ('pac',clf2)], voting='hard')
eclf1.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("Majority Voting Classifier Score: {:.2f}".format(eclf1.score(X_test, y_test)))
print()

Majority Voting Classifier Score: 0.94

CPU times: user 1min 27s, sys: 1.16 s, total: 1min 28s
Wall time: 1min 26s


In [27]:
# Generate Confusion Matrix for Voting Model
actual = y_test
predictions = eclf1.predict(X_test)
print('Confusion Matrix for Voting Model')
print()
cm = confusion_matrix(actual,predictions)
print(cm)

Confusion Matrix for Voting Model

[[2137   30   11   39  284]
 [ 118 6529    2   16   50]
 [  94   20 3146   26  134]
 [ 171   48    3 6374  239]
 [ 232   43   16  151 6675]]


In [28]:
%%time
clf1a = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf2a = PassiveAggressiveClassifier(C=0.1)
clf3a = MultinomialNB(alpha=50)

eclf2 = VotingClassifier(estimators=[('sgd', clf1a), ('pac', clf2a), ('mnb', clf3a)], voting='hard')
eclf2.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("Majority Voting Classifier Score: {:.2f}".format(eclf2.score(X_test, y_test)))
print()

Majority Voting Classifier Score: 0.94

CPU times: user 1min 12s, sys: 1.29 s, total: 1min 13s
Wall time: 1min 11s


In [29]:
# Generate Confusion Matrix for Voting Model
actual = y_test
predictions = eclf2.predict(X_test)
print('Confusion Matrix for Voting Model')
print()
cm = confusion_matrix(actual,predictions)
print(cm)

Confusion Matrix for Voting Model

[[2020   24    9   46  402]
 [  83 6527    2   18   85]
 [  84   20 3119   28  169]
 [ 109   20    1 6431  274]
 [ 134   26   14  133 6810]]


In [30]:
%%time
clf1b = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf2b = PassiveAggressiveClassifier(C=0.1)
clf3b = linear_model.SGDClassifier(loss= 'log',max_iter=1000, tol=1e-3)

eclf3 = VotingClassifier(estimators=[('sgd', clf1b), ('pac', clf2b), ('sgd_lg', clf3b)], voting='hard')
eclf3.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("Majority Voting Classifier Score: {:.2f}".format(eclf3.score(X_test, y_test)))
print()

Majority Voting Classifier Score: 0.93

CPU times: user 2min 8s, sys: 2.04 s, total: 2min 10s
Wall time: 2min 7s


In [31]:
%%time
clf1c = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf2c = PassiveAggressiveClassifier(C=0.1)
clf3c = Perceptron(tol=1e-3, random_state=0)

eclf4 = VotingClassifier(estimators=[('sgd', clf1c), ('pac', clf2c), ('per', clf3c)], voting='hard')
eclf4.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("Majority Voting Classifier Score: {:.2f}".format(eclf4.score(X_test, y_test)))
print()

Majority Voting Classifier Score: 0.94

CPU times: user 2min 3s, sys: 1.99 s, total: 2min 5s
Wall time: 2min 2s


In [32]:
vote_prediction4 = eclf4.predict(X_test)
print(classification_report(vote_prediction4, y_test))

              precision    recall  f1-score   support

           0       0.84      0.83      0.84      2553
           1       0.98      0.98      0.98      6653
           2       0.92      0.98      0.95      3213
           3       0.93      0.98      0.95      6531
           4       0.95      0.89      0.92      7638

    accuracy                           0.94     26588
   macro avg       0.93      0.93      0.93     26588
weighted avg       0.94      0.94      0.94     26588



In [33]:
%%time
clf1d = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf2d = PassiveAggressiveClassifier(C=0.1)
clf3d = Perceptron(tol=1e-3, random_state=0)
clf4d = MultinomialNB(alpha=50)

eclf5 = VotingClassifier(estimators=[('sgd', clf1d), ('pac', clf2d), ('per', clf3d),('mnb',clf4d)], voting='hard')
eclf5.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("Majority Voting Classifier Score: {:.2f}".format(eclf5.score(X_test, y_test)))
print()

Majority Voting Classifier Score: 0.94

CPU times: user 2min 11s, sys: 3.31 s, total: 2min 14s
Wall time: 2min 12s


In [34]:
vote_prediction5 = eclf5.predict(X_test)
print(classification_report(vote_prediction5, y_test))

              precision    recall  f1-score   support

           0       0.83      0.85      0.84      2423
           1       0.97      0.99      0.98      6598
           2       0.92      0.99      0.96      3188
           3       0.95      0.94      0.95      6917
           4       0.94      0.90      0.92      7462

    accuracy                           0.94     26588
   macro avg       0.92      0.93      0.93     26588
weighted avg       0.94      0.94      0.94     26588



In [35]:
%%time
clf1e = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf2e = PassiveAggressiveClassifier(C=0.1)
clf3e = Perceptron(tol=1e-3, random_state=0)
clf4e = MultinomialNB(alpha=50)
clf5e = LogisticRegression(C=3.0)

eclf_6 = VotingClassifier(estimators=[('sgd', clf1e), ('pac', clf2e), \
                                      ('per', clf3e),('mnb',clf4e),('lr',clf5e)], voting='hard')
eclf_6.fit(X_train, y_train)
# evaluate accuracy of our model on test data
print("Majority Voting Classifier Score: {:.2f}".format(eclf_6.score(X_test, y_test)))
print()



Majority Voting Classifier Score: 0.94

CPU times: user 56min 58s, sys: 53.1 s, total: 57min 52s
Wall time: 51min 6s


In [36]:
vote_prediction_6 = eclf_6.predict(X_test)
print(classification_report(vote_prediction_6, y_test))

              precision    recall  f1-score   support

           0       0.82      0.85      0.84      2404
           1       0.97      0.99      0.98      6580
           2       0.92      0.99      0.96      3180
           3       0.94      0.96      0.95      6683
           4       0.96      0.88      0.92      7741

    accuracy                           0.94     26588
   macro avg       0.92      0.94      0.93     26588
weighted avg       0.94      0.94      0.94     26588



## Cross Validation of Models

In [37]:
clf1e = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf2e = PassiveAggressiveClassifier(C=0.1)

eclf_cv = VotingClassifier(estimators=[('sgd',clf1e), ('pac',clf2e)], voting='hard')
scores = cross_val_score(eclf_cv,X,y, cv = 5)
print("Cross-validation scores for Voting Classifier: {}".format(scores))
print()
print("The average accuracy score for Voting Classifier is: ")
print(np.mean(scores))

Cross-validation scores for Voting Classifier: [ 0.93576291  0.93068562  0.93230029  0.93278417  0.93334587]

The average accuracy score for Voting Classifier is: 
0.932975772209


In [38]:
clf1f = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf2f = PassiveAggressiveClassifier(C=0.1)
clf3f = Perceptron(tol=1e-3, random_state=0)

eclf_cv2 = VotingClassifier(estimators=[('sgd',clf1f), ('pac',clf2f),('per', clf3f)], voting='hard')
scores = cross_val_score(eclf_cv2,X,y, cv = 5)

print("Cross-validation scores for Voting Classifier: {}".format(scores))
print()
print("The average accuracy score for Voting Classifier is: ")
print(np.mean(scores))

Cross-validation scores for Voting Classifier: [ 0.93963669  0.93565008  0.94016097  0.94041977  0.93895054]

The average accuracy score for Voting Classifier is: 
0.938963610684


In [39]:
clf1g = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf2g = PassiveAggressiveClassifier(C=0.1)
clf3g = Perceptron(tol=1e-3, random_state=0)
clf4g = MultinomialNB(alpha=50)

eclf_cv3 = VotingClassifier(estimators=[('sgd',clf1g), ('pac',clf2g),('per', clf3g),('mnb',clf4g)], voting='hard')
scores3 = cross_val_score(eclf_cv3,X,y, cv = 5)

print("Cross-validation scores for Voting Classifier: {}".format(scores3))
print()
print("The average accuracy score for Voting Classifier is: ")
print(np.mean(scores3))

Cross-validation scores for Voting Classifier: [ 0.94065215  0.93497311  0.9402362   0.94060784  0.93722024]

The average accuracy score for Voting Classifier is: 
0.938737906207
