In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score,ShuffleSplit
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,cross_val_score,ShuffleSplit
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


Dataset Creation

In [2]:
## Load the raw dataset from the local data directory
DATA_PATH = "./data/virus.csv"
dataset = pd.read_csv(DATA_PATH)

In [3]:
## Keep only the two columns used for modelling:
##   FASTA_com - amino acid sequence of (CDR + epitope)
##   IC50      - ic50 value
selected_columns = ['FASTA_com', 'IC50']
df = dataset.loc[:, selected_columns].copy()
df.head()

Unnamed: 0,FASTA_com,IC50
0,ALALHFYPGVYDDYGPPIARGVNTLDSWK,50.0
1,ALALHFYPGVYDDYGPPIARGVNALDSWK,50.0
2,ALALHFYPGVYDDYGPPIARGVNALDSWK,50.0
3,ALALHFYPGVYDDYGPPIARGVNALDSWN,50.0
4,ALALHFYPGVYDDYGPPIARGVNALDSWK,50.0


In [4]:
## Binary classification label derived from the raw IC50 measurements:
##   IC50 <= 10 -> 1
##   IC50 >  10 -> 0
##   missing IC50 stays NaN
## The label is computed from the untouched `dataset` column instead of
## overwriting `df['IC50']` in two passes, so re-running this cell yields the
## same result (a second in-place pass would only see 0/1 values, all of which
## are <= 10, and would relabel every row as 1).
ic50_raw = dataset['IC50']
# Comparison with NaN is False, so `.where` restores NaN for missing values;
# assignment aligns on the index, which `df` shares with `dataset`.
df['IC50'] = (ic50_raw <= 10).astype(float).where(ic50_raw.notna())

In [5]:
## Count missing values per column
df.isnull().sum()

FASTA_com      0
IC50         102
dtype: int64

In [6]:
## Remove every row that has at least one missing value
df = df.dropna(axis=0, how='any')

One-Hot Encoding

In [7]:
codes = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
         'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

def one_hot_encode(seq):
    """One-hot encode an amino-acid sequence into a flat integer vector.

    Each position of ``seq`` becomes a block of ``len(codes)`` values, with
    columns ordered by the sorted entries of ``codes``. The blocks are
    concatenated position by position (row-major).

    Parameters
    ----------
    seq : str or sequence of single-character residues

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``len(seq) * len(codes)``. An empty
        sequence yields an empty array.

    Notes
    -----
    A residue that is not listed in ``codes`` is encoded as an all-zero
    block, so every position always occupies exactly ``len(codes)`` values
    and positions stay aligned across sequences.
    """
    # Sorted to keep the column order produced by sorting the column labels
    column_of = {residue: col for col, residue in enumerate(sorted(codes))}
    encoded = np.zeros((len(seq), len(column_of)), dtype=int)
    for row, residue in enumerate(seq):
        col = column_of.get(residue)
        if col is not None:
            encoded[row, col] = 1
    return encoded.flatten()

In [8]:
# One row per sequence, one column per flattened one-hot value. Sequences of
# different length produce rows of different width, so shorter rows are padded
# with NaN by pandas.
# The former stray positional `1` bound to Series.apply's deprecated
# `convert_dtype` parameter; it had no effect and is dropped.
X = df.FASTA_com.apply(lambda x: pd.Series(one_hot_encode(x)))
# Target labels, sharing the index of X
Y = df.IC50

In [9]:
# Missing values per feature column (padding of shorter sequences)
X.isnull().sum()

0         0
1         0
2         0
3         0
4         0
       ... 
975    1142
976    1142
977    1142
978    1142
979    1142
Length: 980, dtype: int64

In [10]:
# Replace the NaN padding with zeros
X = X.fillna(0)

In [11]:
# 80/20 train/test split of the one-hot features with a fixed seed.
# NOTE(review): the df.head() output shows repeated sequences; a purely random
# split may place identical sequences in both train and test - confirm whether
# de-duplication or a grouped split is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Linear-kernel SVM: fit on the training split, report accuracy on the test split
clf = SVC(kernel='linear').fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8610354223433242


In [13]:
# Degree-2 polynomial-kernel SVM: fit on the training split, report test accuracy
clf = SVC(kernel='poly', degree=2).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8474114441416893


In [14]:
# Sigmoid-kernel SVM: fit on the training split, report test accuracy
clf = SVC(kernel='sigmoid').fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8419618528610354


In [15]:
# RBF-kernel SVM: fit on the training split, report test accuracy
clf = SVC(kernel='rbf').fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8419618528610354


BLOSUM

In [16]:
# `ep.blosum62` is presumably the BLOSUM62 substitution matrix shipped with
# epitopepredict (TODO confirm); it is kept as a module-level handle and
# indexed by residue letter in blosum_encode.
import epitopepredict as ep
blosum = ep.blosum62

def blosum_encode(seq):
    """Encode a peptide into a flat vector of BLOSUM features.

    Looks up ``blosum[residue]`` for every residue of ``seq``, stacks the
    results one row per residue, and returns the row-major flattening.

    Parameters
    ----------
    seq : str or iterable of residue letters
        Each residue must be a valid key of the module-level ``blosum``
        table; an unknown residue propagates the lookup error.

    Returns
    -------
    numpy.ndarray
        1-D array holding the per-residue BLOSUM rows back to back.
    """
    # The unused `s = list(seq)` local of the earlier version was removed
    x = pd.DataFrame([blosum[i] for i in seq]).reset_index(drop=True)
    e = x.values.flatten()
    return e
        

In [17]:
# One row per sequence of flattened BLOSUM features; shorter sequences are
# padded with NaN by pandas. The former stray positional `1` bound to
# Series.apply's deprecated `convert_dtype` parameter and is dropped.
XB = df.FASTA_com.apply(lambda x: pd.Series(blosum_encode(x)))

In [18]:
# Replace the NaN padding with zeros
XB = XB.fillna(0)

In [19]:
# Confirm that no missing values remain after filling
XB.isnull().sum()

0       0
1       0
2       0
3       0
4       0
       ..
1171    0
1172    0
1173    0
1174    0
1175    0
Length: 1176, dtype: int64

In [20]:
# 80/20 train/test split of the BLOSUM features with the same fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    XB,
    Y,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Linear-kernel SVM: fit on the training split, report accuracy on the test split
clf = SVC(kernel='linear').fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8719346049046321


In [22]:
# Degree-2 polynomial-kernel SVM: fit on the training split, report test accuracy
clf = SVC(kernel='poly', degree=2).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8283378746594006


In [23]:
# Sigmoid-kernel SVM: fit on the training split, report test accuracy
clf = SVC(kernel='sigmoid').fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8310626702997275


In [24]:
# RBF-kernel SVM: fit on the training split, report test accuracy
clf = SVC(kernel='rbf').fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8283378746594006


NLF

In [25]:
# Read the NLF matrix from a CSV file hosted on GitHub; the first CSV column
# becomes the index.
# NOTE(review): the URL points at the `master` branch and needs network access,
# so the content is not pinned - consider a commit-pinned URL or a local copy.
NLF_URL = 'https://raw.githubusercontent.com/dmnfarrell/epitopepredict/master/epitopepredict/mhcdata/NLF.csv'
nlf = pd.read_csv(NLF_URL, index_col=0)

def nlf_encode(seq):
    """Encode a sequence as a flat vector of NLF descriptors.

    Looks up ``nlf[residue]`` for every residue of ``seq``, stacks the
    results one row per residue, and returns the row-major flattening as a
    1-D numpy array.
    """
    per_residue = [nlf[residue] for residue in seq]
    stacked = pd.DataFrame(per_residue)
    # Only the values are returned, so the row labels need no resetting
    return stacked.values.flatten()

In [26]:
# One row per sequence of flattened NLF features; shorter sequences are padded
# with NaN by pandas. The former stray positional `1` bound to Series.apply's
# deprecated `convert_dtype` parameter and is dropped.
XN = df.FASTA_com.apply(lambda x: pd.Series(nlf_encode(x)))

In [27]:
# Replace the NaN padding with zeros
XN = XN.fillna(0)

In [28]:
# 80/20 train/test split of the NLF features with the same fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    XN,
    Y,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Linear-kernel SVM: fit on the training split, report accuracy on the test split
clf = SVC(kernel='linear').fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8610354223433242


In [30]:
# Degree-2 polynomial-kernel SVM: fit on the training split, report test accuracy
clf = SVC(kernel='poly', degree=2).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8310626702997275


In [31]:
# Sigmoid-kernel SVM: fit on the training split, report test accuracy
clf = SVC(kernel='sigmoid').fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8228882833787466


In [32]:
# RBF-kernel SVM: fit on the training split, report test accuracy
clf = SVC(kernel='rbf').fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8310626702997275
