In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import re
import nltk
import math
import pickle

from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.metrics
from sklearn.metrics import confusion_matrix
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score, cross_val_predict

%matplotlib inline

In [2]:
# Data taken from https://www.kaggle.com/shahir/protein-data-set
# This is a protein data set retrieved from Research Collaboratory for Structural Bioinformatics (RCSB) Protein Data Bank (PDB).

In [3]:
# read in data from csv file
df = pd.read_csv('proteins-5-functions.csv')

df.head()

Unnamed: 0,classification,name,sequence,structureId,type
0,TRANSFERASE,,MEIYEGKLTAEGLRFGIVASRFNHALVDRLVEGAIDCIVRHGGREE...,5MPP,4
1,HYDROLASE,,MKFTLTIAGLLAVGSTAAPTTEKRNPGGIDYVQNYNGDVADFQYNE...,3M4F,0
2,TRANSFERASE,,MRGSHHHHHHGSMKRAVITGLGIVSSIGNNQQEVLASLREGRSGIT...,2BYY,4
3,HYDROLASE,,STGSATTTPIDSLDDAYITPVQIGTPAQTLNLDFDTGSSDLWVFSS...,4YCY,0
4,TRANSFERASE,,GSGMMRYLHKIELELNRLTSRYPFFKKIAFDAEIIKLVDDLNVDEN...,3AQC,4


## Classification of Proteins: Logistic Regression Model

## 5 Classes of Proteins

## CountVectorizer: Count peptide frequency, transform the data

In [4]:
# In this case, peptide frequency is used for analysis
peptide_size = 6
vect_ = CountVectorizer(min_df=1,token_pattern=r'\w{1}',ngram_range=(peptide_size,peptide_size))

## Use a smaller amount of data to find optimum C value

In [9]:
df_new = df.sample(frac=0.10)
df_new.shape

(10496, 5)

In [10]:
df_new.head()

Unnamed: 0,classification,name,sequence,structureId,type
102319,TRANSFERASE,sp|Q2RMC5|GLMU_MOOTA,MADTVAVILAAGQGKRMHSRRPKVLHRIAGRCLVEHVLAAVGEAGI...,,4
29913,TRANSFERASE,,NDPDTLEIYS,2A0T,4
44613,ISOMERASE,,MKHLKNNTKKFTALLFALLFSMSIAGCNMIEKTPEAIEKSPVAKVG...,sp|Q899I2|PRSA_CLOTE,2
32691,ISOMERASE,,MTTLEAIKFDRTNVTLQILDQLLIPYSTEYLNIEGVDDAYDAIKSM...,sp|A5DNT0|MTNA_PICGU,2
30207,HYDROLASE,,ATSTKKLHKEPATLIKAIDGDTVKLMYKGQPMTFRLLEVDTPEFNE...,5ISR,0


In [12]:
df_new.columns

Index(['classification', 'name', 'sequence', 'structureId', 'type'], dtype='object')

## Split the data into training & test sets for classification model

In [13]:
X = vect_.fit_transform(df_new.sequence)
y = df_new.type

In [14]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state =42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(8396, 3126417) (8396,)
(2100, 3126417) (2100,)


In [15]:
y_test.value_counts()

4    731
3    679
2    330
0    257
1    103
Name: type, dtype: int64

## Optimization of Logistic Regression Model

In [16]:
C_s = [0.001,0.01,0.1,1.0,10,100]
# Iterate over the alphas and print the corresponding score
for c in C_s:
    lr_op = LogisticRegression(C = c)
    lr_op.fit(X_train, y_train)
    print('C Value: ', c)
    print('Score: ',lr_op.score(X_test, y_test) )
    print()



C Value:  0.001
Score:  0.632857142857

C Value:  0.01
Score:  0.710952380952

C Value:  0.1
Score:  0.740952380952

C Value:  1.0
Score:  0.754285714286

C Value:  10
Score:  0.694285714286

C Value:  100
Score:  0.659047619048



In [17]:
C_s = np.arange(1,10.5,0.5)
# Iterate over the alphas and print the corresponding score
for c in C_s:
    lr_op = LogisticRegression(C = c)
    lr_op.fit(X_train, y_train)
    print('C Value: ', c)
    print('Score: ',lr_op.score(X_test, y_test) )
    print()



C Value:  1.0
Score:  0.754285714286

C Value:  1.5
Score:  0.758095238095

C Value:  2.0
Score:  0.762380952381

C Value:  2.5
Score:  0.76619047619

C Value:  3.0
Score:  0.769047619048

C Value:  3.5
Score:  0.767619047619

C Value:  4.0
Score:  0.760476190476

C Value:  4.5
Score:  0.752380952381

C Value:  5.0
Score:  0.743333333333

C Value:  5.5
Score:  0.737142857143

C Value:  6.0
Score:  0.73

C Value:  6.5
Score:  0.723333333333

C Value:  7.0
Score:  0.71619047619

C Value:  7.5
Score:  0.711904761905

C Value:  8.0
Score:  0.70619047619

C Value:  8.5
Score:  0.703333333333

C Value:  9.0
Score:  0.698571428571

C Value:  9.5
Score:  0.696666666667

C Value:  10.0
Score:  0.694285714286



## Train & test optimized logistic regression model with entire dataset

In [18]:
X = vect_.fit_transform(df.sequence)
y = df.type

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state =42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(83964, 14277858) (83964,)
(20992, 14277858) (20992,)


In [19]:
y_test.value_counts()

4    7121
3    6732
2    3480
0    2590
1    1069
Name: type, dtype: int64

In [20]:
# Logistic Regression with optimized C value
lr = LogisticRegression(C=3.0)
lr.fit(X_train, y_train)
lr.predict(X_test)
print("Logistic Regression Score: {:.2f}".format(lr.score(X_test, y_test)))



Logistic Regression Score: 0.92


In [21]:
import pickle

# Save the Modle to file in the current working directory

Pkl_Filename = "lr_function5_hexa.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(lr, file)

In [22]:
# Load the Model back from file

with open(Pkl_Filename, 'rb') as file:  
    Pickled_lr = pickle.load(file)

Pickled_lr

LogisticRegression(C=3.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)