<a href="https://colab.research.google.com/github/vvithurshan/Antibody_Efficiency_Prediction/blob/main/SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

References

https://towardsdatascience.com/protein-sequence-classification-99c80d0ad2df

https://dmnfarrell.github.io/bioinformatics/mhclearning

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score,ShuffleSplit
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,cross_val_score,ShuffleSplit
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


In [2]:
!pip install epitopepredict

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting epitopepredict
  Downloading epitopepredict-0.5.0.tar.gz (11.0 MB)
[K     |████████████████████████████████| 11.0 MB 2.1 MB/s 
Collecting biopython>=1.78
  Downloading biopython-1.80-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 82.9 MB/s 
Building wheels for collected packages: epitopepredict
  Building wheel for epitopepredict (setup.py) ... [?25l[?25hdone
  Created wheel for epitopepredict: filename=epitopepredict-0.5.0-py3-none-any.whl size=5973755 sha256=8fe6f26bb1238bb91fc268f80b2e008fd3accd7b16989e953ec4c18caf27a0b7
  Stored in directory: /root/.cache/pip/wheels/33/0d/a5/5b2802337ae05b248638603e88da786b68227295df42f3da31
Successfully built epitopepredict
Installing collected packages: biopython, epitopepredict
Successfully installed biopython-1.80 epitopepredict-0.5.0


In [3]:
# Fetch the project repository; the dataset is read later from
# ./Antibody_Efficiency_Prediction/data/virus.csv inside this clone.
!git clone https://github.com/vvithurshan/Antibody_Efficiency_Prediction.git

Cloning into 'Antibody_Efficiency_Prediction'...
remote: Enumerating objects: 17, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 17 (delta 6), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (17/17), done.


Dataset Creation

In [4]:
## Load the raw table from the cloned repository into a DataFrame
dataset = pd.read_csv(
    "./Antibody_Efficiency_Prediction/data/virus.csv"
)

In [5]:
## Keep only the two columns used for modelling:
##   FASTA_com - amino acid sequence of (CDR + epitope)
##   IC50      - ic50 value
## .copy() gives an independent frame, so later edits to `df` leave `dataset` untouched.
df = dataset.loc[:, ['FASTA_com', 'IC50']].copy()
df.head()

Unnamed: 0,FASTA_com,IC50
0,ALALHFYPGVYDDYGPPIARGVNTLDSWK,50.0
1,ALALHFYPGVYDDYGPPIARGVNALDSWK,50.0
2,ALALHFYPGVYDDYGPPIARGVNALDSWK,50.0
3,ALALHFYPGVYDDYGPPIARGVNALDSWN,50.0
4,ALALHFYPGVYDDYGPPIARGVNALDSWK,50.0


In [6]:
## Binary classification target
##   ic50 value <= 10  -> 1
##   ic50 value >  10  -> 0
##   missing ic50      -> stays NaN (those rows are dropped in a later cell)
##
## The labels are computed from the untouched `dataset` column instead of
## thresholding df['IC50'] in place. Thresholding in place is not re-runnable:
## on a second run the column only holds 0/1, all of which are <= 10, so every
## label would silently become 1. Assigning a Series also aligns on the index,
## so the cell stays correct even if `df` has already lost rows.
df['IC50'] = (dataset['IC50'] <= 10).astype(float).where(dataset['IC50'].notna())

In [7]:
## Count missing values per column to see whether any rows are incomplete
df.isna().sum(axis=0)

FASTA_com      0
IC50         102
dtype: int64

In [8]:
## Remove every row that has a missing value in any column
df = df.dropna(axis=0, how='any')

One Hot Encoding

In [9]:
codes = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
         'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

def one_hot_encode(seq):
    """One-hot encode an amino-acid sequence into a flat integer vector.

    Every position of ``seq`` becomes a block of ``len(codes)`` values, laid out
    in the alphabetical order of ``codes``; the block holds a single 1 in the
    column of the residue at that position.

    Parameters
    ----------
    seq : str
        Sequence of one-letter residue codes.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``len(seq) * len(codes)``.

    Notes
    -----
    * An empty sequence returns an empty array instead of raising.
    * A character that is not in ``codes`` is encoded as an all-zero block, so
      the block width is always ``len(codes)`` and positions stay aligned
      between sequences. (Building the columns from the characters actually
      present would add an extra column for each unknown character and shift
      every following feature.)
    """
    # Column order matches sorting the one-hot columns alphabetically.
    column_of = {residue: col for col, residue in enumerate(sorted(codes))}
    encoded = np.zeros((len(seq), len(column_of)), dtype=int)
    for position, residue in enumerate(seq):
        col = column_of.get(residue)
        if col is not None:
            encoded[position, col] = 1
    return encoded.flatten()

In [10]:
# Feature matrix: one row per sequence, holding that sequence's flattened one-hot vector.
# Sequences shorter than the longest one leave NaN in their trailing columns.
# Series.apply has no axis argument, so no extra positional argument is passed:
# a stray positional value would be taken as `convert_dtype` (deprecated) or,
# in newer pandas, as `args`.
X = df.FASTA_com.apply(lambda x: pd.Series(one_hot_encode(x)))
# Target: the binarised IC50 label.
Y = df.IC50

In [11]:
# Missing values per feature column (these come from sequences of different lengths)
X.isna().sum(axis=0)

0         0
1         0
2         0
3         0
4         0
       ... 
975    1142
976    1142
977    1142
978    1142
979    1142
Length: 980, dtype: int64

In [12]:
# Pad shorter sequences with zeros. For a one-hot encoding an all-zero block
# reads as "no residue at this position".
# Rebinding instead of inplace=True keeps the step free of in-place mutation.
X = X.fillna(0)

In [13]:
# Hold out 20 % of the samples for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): df.head() shows repeated sequences, so a purely random split can place
# identical sequences in both train and test — consider checking for duplicates.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Linear-kernel SVM on the current train/test split (one-hot features when the
# notebook is run top to bottom): fit, predict the held-out set, print accuracy.
clf = SVC(kernel='linear').fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8610354223433242


In [15]:
# Degree-2 polynomial-kernel SVM on the current train/test split (one-hot features
# when the notebook is run top to bottom): fit, predict the held-out set, print accuracy.
clf = SVC(kernel='poly', degree=2).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8474114441416893


In [16]:
# Sigmoid-kernel SVM on the current train/test split (one-hot features when the
# notebook is run top to bottom): fit, predict the held-out set, print accuracy.
clf = SVC(kernel='sigmoid').fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8419618528610354


In [17]:
# RBF-kernel SVM on the current train/test split (one-hot features when the
# notebook is run top to bottom): fit, predict the held-out set, print accuracy.
clf = SVC(kernel='rbf').fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8419618528610354


BLOSUM

In [18]:
import epitopepredict as ep
blosum = ep.blosum62

def blosum_encode(seq):
    """Encode a peptide into flattened BLOSUM62 features.

    For each residue of ``seq``, in order, the entry ``blosum[residue]`` is
    looked up; the lookups are stacked as rows of a table and the table is
    flattened row by row.

    Parameters
    ----------
    seq : str
        Sequence of residue codes. Each one is used as a key into ``blosum``;
        presumably a residue missing from the matrix raises ``KeyError`` —
        TODO confirm against the epitopepredict matrix.

    Returns
    -------
    numpy.ndarray
        1-D array with one block of matrix values per residue.
    """
    # `seq` is iterated exactly once. The earlier unused `list(seq)` copy was
    # dead code and would have exhausted a one-shot iterator before this loop.
    x = pd.DataFrame([blosum[i] for i in seq]).reset_index(drop=True)
    e = x.values.flatten()
    return e
        

wrote config file /root/.config/epitopepredict/default.conf


  matplotlib.use('agg', warn=False)


In [19]:
# Feature matrix: one row per sequence, holding that sequence's flattened BLOSUM vector.
# Series.apply has no axis argument, so no extra positional argument is passed:
# a stray positional value would be taken as `convert_dtype` (deprecated) or,
# in newer pandas, as `args`.
XB = df.FASTA_com.apply(lambda x: pd.Series(blosum_encode(x)))

In [20]:
# Pad shorter sequences with zeros so every row has the same number of features.
# NOTE(review): 0 may also be a genuine BLOSUM score, so padded positions are not
# distinguishable from real zero scores — confirm this is acceptable.
# Rebinding instead of inplace=True keeps the step free of in-place mutation.
XB = XB.fillna(0)

In [21]:
# Sanity check: count the missing values left in each feature column
XB.isna().sum(axis=0)

0       0
1       0
2       0
3       0
4       0
       ..
1171    0
1172    0
1173    0
1174    0
1175    0
Length: 1176, dtype: int64

In [22]:
# Same 80/20 split and seed as before, now on the BLOSUM features.
X_train, X_test, y_train, y_test = train_test_split(
    XB,
    Y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Linear-kernel SVM on the current train/test split (BLOSUM features when the
# notebook is run top to bottom): fit, predict the held-out set, print accuracy.
clf = SVC(kernel='linear').fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8719346049046321


In [24]:
# Degree-2 polynomial-kernel SVM on the current train/test split (BLOSUM features
# when the notebook is run top to bottom): fit, predict the held-out set, print accuracy.
clf = SVC(kernel='poly', degree=2).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8283378746594006


In [25]:
# Sigmoid-kernel SVM on the current train/test split (BLOSUM features when the
# notebook is run top to bottom): fit, predict the held-out set, print accuracy.
clf = SVC(kernel='sigmoid').fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8310626702997275


In [26]:
# RBF-kernel SVM on the current train/test split (BLOSUM features when the
# notebook is run top to bottom): fit, predict the held-out set, print accuracy.
clf = SVC(kernel='rbf').fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8283378746594006


NLF

In [27]:
# Read the NLF matrix from a CSV file hosted on GitHub; the first CSV column becomes the index.
nlf = pd.read_csv('https://raw.githubusercontent.com/dmnfarrell/epitopepredict/master/epitopepredict/mhcdata/NLF.csv',index_col=0)

def nlf_encode(seq):
    """Encode a peptide into flattened NLF features.

    Looks up ``nlf[residue]`` for every residue of ``seq`` in order, stacks the
    lookups as rows of a table, and returns the table flattened row by row as
    a 1-D array.
    """
    per_residue = [nlf[residue] for residue in seq]
    features = pd.DataFrame(per_residue).reset_index(drop=True)
    return features.values.flatten()

In [28]:
# Feature matrix: one row per sequence, holding that sequence's flattened NLF vector.
# Series.apply has no axis argument, so no extra positional argument is passed:
# a stray positional value would be taken as `convert_dtype` (deprecated) or,
# in newer pandas, as `args`.
XN = df.FASTA_com.apply(lambda x: pd.Series(nlf_encode(x)))

In [29]:
# Pad shorter sequences with zeros so every row has the same number of features.
# NOTE(review): 0 may also be a legitimate NLF feature value, so padding is not
# distinguishable from real data — confirm this is acceptable.
# Rebinding instead of inplace=True keeps the step free of in-place mutation.
XN = XN.fillna(0)

In [30]:
# Same 80/20 split and seed as before, now on the NLF features.
X_train, X_test, y_train, y_test = train_test_split(
    XN,
    Y,
    test_size=0.2,
    random_state=42,
)

In [31]:
# Linear-kernel SVM on the current train/test split (NLF features when the
# notebook is run top to bottom): fit, predict the held-out set, print accuracy.
clf = SVC(kernel='linear').fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8610354223433242


In [32]:
# Degree-2 polynomial-kernel SVM on the current train/test split (NLF features
# when the notebook is run top to bottom): fit, predict the held-out set, print accuracy.
clf = SVC(kernel='poly', degree=2).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8310626702997275


In [33]:
# Sigmoid-kernel SVM on the current train/test split (NLF features when the
# notebook is run top to bottom): fit, predict the held-out set, print accuracy.
clf = SVC(kernel='sigmoid').fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8228882833787466


In [34]:
# RBF-kernel SVM on the current train/test split (NLF features when the
# notebook is run top to bottom): fit, predict the held-out set, print accuracy.
clf = SVC(kernel='rbf').fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8310626702997275
