In [53]:
import pandas as pd
import tensorflow as tf
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_val_score,ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score,ShuffleSplit
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


Dataset Creation

In [85]:
## Reading dataset
dataset = pd.read_csv("./data/virus.csv")

In [86]:
## Extracting Columns 
## FASTA_com = amino acid sequence of (CDR + epitope)
## IC50 = ic50 value 

df = dataset[['FASTA_com', 'IC50']].copy()
df.head()

Unnamed: 0,FASTA_com,IC50
0,ALALHFYPGVYDDYGPPIARGVNTLDSWK,50.0
1,ALALHFYPGVYDDYGPPIARGVNALDSWK,50.0
2,ALALHFYPGVYDDYGPPIARGVNALDSWK,50.0
3,ALALHFYPGVYDDYGPPIARGVNALDSWN,50.0
4,ALALHFYPGVYDDYGPPIARGVNALDSWK,50.0


In [87]:
## Binary classification
## When ic50 value <= 10 replace it with 1
## when ic50 value > 10 replace it with 0
df.loc[df['IC50'] <= 10, 'IC50'] = 1
df.loc[df['IC50'] > 10, 'IC50'] = 0

In [88]:
## Checking if there are any empty rows
df.isna().sum()

FASTA_com      0
IC50         102
dtype: int64

In [89]:
## Drop empty rows
df = df.dropna()

One Hot Encoding

In [90]:
codes = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
         'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

def one_hot_encode(seq):
    o = list(set(codes) - set(seq))
    s = pd.DataFrame(list(seq))    
    x = pd.DataFrame(np.zeros((len(seq),len(o)),dtype=int),columns=o)    
    a = s[0].str.get_dummies(sep=',')
    a = a.join(x)
    a = a.sort_index(axis=1)
    e = a.values.flatten()
    return e

In [91]:
X = df.FASTA_com.apply(lambda x: pd.Series(one_hot_encode(x)),1)
Y = df.IC50

In [92]:
X.isna().sum()

0         0
1         0
2         0
3         0
4         0
       ... 
975    1142
976    1142
977    1142
978    1142
979    1142
Length: 980, dtype: int64

In [93]:
X.fillna(0, inplace = True)

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state= 42)

In [95]:
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=0)

In [96]:
y_pred = classifier.predict(X_test)

In [97]:

cm = confusion_matrix(y_test, y_pred)
  
print ("Confusion Matrix : \n", cm)

Confusion Matrix : 
 [[ 85  45]
 [  5 232]]


In [98]:
print ("Accuracy : ",100* accuracy_score(y_test, y_pred))

Accuracy :  86.37602179836512


BLOSUM

In [99]:
import epitopepredict as ep
blosum = ep.blosum62

def blosum_encode(seq):
    #encode a peptide into blosum features
    s=list(seq)
    x = pd.DataFrame([blosum[i] for i in seq]).reset_index(drop=True)
    e = x.values.flatten()    
    return e
        

In [100]:
XB = df.FASTA_com.apply(lambda x: pd.Series(blosum_encode(x)),1)

In [101]:
XB.fillna(0, inplace = True)

In [102]:
XB.isna().sum()

0       0
1       0
2       0
3       0
4       0
       ..
1171    0
1172    0
1173    0
1174    0
1175    0
Length: 1176, dtype: int64

In [103]:
X_train, X_test, y_train, y_test = train_test_split(XB, Y, test_size=0.2, random_state= 42)

In [104]:
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=0)

In [105]:
y_pred = classifier.predict(X_test)

In [106]:
cm = confusion_matrix(y_test, y_pred)
  
print ("Confusion Matrix : \n", cm)

Confusion Matrix : 
 [[ 88  42]
 [  6 231]]


In [107]:
print ("Accuracy : ", 100 * accuracy_score(y_test, y_pred))

Accuracy :  86.92098092643052


NLF

In [108]:
#read the matrix a csv file on github
nlf = pd.read_csv('https://raw.githubusercontent.com/dmnfarrell/epitopepredict/master/epitopepredict/mhcdata/NLF.csv',index_col=0)

def nlf_encode(seq):    
    x = pd.DataFrame([nlf[i] for i in seq]).reset_index(drop=True)  
    # show_matrix(x)
    e = x.values.flatten()
    return e

In [109]:
XN = df.FASTA_com.apply(lambda x: pd.Series(nlf_encode(x)),1)

In [110]:
XN.fillna(0, inplace = True)

In [111]:
X_train, X_test, y_train, y_test = train_test_split(XN, Y, test_size=0.2, random_state= 42)

In [112]:
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=0)

In [113]:
y_pred = classifier.predict(X_test)

In [114]:
cm = confusion_matrix(y_test, y_pred)
  
print ("Confusion Matrix : \n", cm)

Confusion Matrix : 
 [[ 87  43]
 [  5 232]]


In [115]:
print ("Accuracy : ",100 * accuracy_score(y_test, y_pred))

Accuracy :  86.92098092643052
