In [1]:
cd ../../../cd_hit_py/

/Users/in-divye.singh/Documents/Projects/cd_hit_py


In [2]:
from cd_hit import CD_HIT

In [3]:
cd ../MIC_predictor/src/

/Users/in-divye.singh/Documents/Projects/MIC_predictor/src


In [4]:
import biovec
import numpy as np
import pandas as pd
from itertools import chain, combinations
from collections import Counter

from utils import *

from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [5]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

In [6]:
def pearson_score(y_true, y_pred):
    pcc = pearsonr(y_true, y_pred)
    return pcc[0]

pcc_scorer = make_scorer(pearson_score)

In [7]:
avp_ic50 = pd.read_csv("../data/raw/AVP-IC50Pred_train.csv")
ha_avp = pd.read_csv("../data/raw/HA_AVP.csv")

In [8]:
df = pd.concat([avp_ic50[['Sequence','MIC']], ha_avp], axis=0).drop_duplicates(['Sequence']).reset_index(drop=True)
df = sequence_filtering(df)

In [9]:
df['pIC50'] = df['MIC'].apply(lambda x: -np.log(x*1e-6))

In [10]:
family = pd.read_csv("../data/raw/712pep_family.csv")

In [11]:
cov_seq = family[(family['Abbreviation'] == 'SARS-CoV')].reset_index(drop=True)

In [12]:
cov_seq

Unnamed: 0,Sequence,Abbreviation,Family
0,SNNTIAIPTNFSISITTEVM,SARS-CoV,Coronaviridae
1,FKLPLGINITNFRAILTAFS,SARS-CoV,Coronaviridae
2,VLYNSTFFSTFKCYGVSATK,SARS-CoV,Coronaviridae
3,GIGVTQNVLYENQKQIANQF,SARS-CoV,Coronaviridae
4,PALNCYWPLNDYGFYTTSGI,SARS-CoV,Coronaviridae
5,FGGASCCLYCRCHIDHPNPKGFCDLKGKY,SARS-CoV,Coronaviridae
6,GGASCCLYCRCH,SARS-CoV,Coronaviridae
7,IEEINKKVEEIQKKIEELNKKAEELNKKLEELQKK,SARS-CoV,Coronaviridae
8,YQDVNCTDVSTAIHADQLTP,SARS-CoV,Coronaviridae
9,PTTFMLKYDENGTITDAVDC,SARS-CoV,Coronaviridae


In [13]:
df_cov = df.merge(cov_seq,how='right',on='Sequence').reset_index(drop=True)

In [14]:
df_cov#.to_csv("../data/raw/hiv_cdhit_filtered.csv", index=False)

Unnamed: 0,Sequence,MIC,pIC50,Abbreviation,Family
0,SNNTIAIPTNFSISITTEVM,278.801,8.185012,SARS-CoV,Coronaviridae
1,FKLPLGINITNFRAILTAFS,266.434,8.230384,SARS-CoV,Coronaviridae
2,VLYNSTFFSTFKCYGVSATK,262.667,8.244623,SARS-CoV,Coronaviridae
3,GIGVTQNVLYENQKQIANQF,262.539,8.245111,SARS-CoV,Coronaviridae
4,PALNCYWPLNDYGFYTTSGI,258.33,8.261273,SARS-CoV,Coronaviridae
5,FGGASCCLYCRCHIDHPNPKGFCDLKGKY,160.0,8.740337,SARS-CoV,Coronaviridae
6,GGASCCLYCRCH,160.0,8.740337,SARS-CoV,Coronaviridae
7,IEEINKKVEEIQKKIEELNKKAEELNKKLEELQKK,100.0,9.21034,SARS-CoV,Coronaviridae
8,YQDVNCTDVSTAIHADQLTP,61.704,9.693162,SARS-CoV,Coronaviridae
9,PTTFMLKYDENGTITDAVDC,60.018,9.720866,SARS-CoV,Coronaviridae


In [15]:
def get_physicochemical_properties(df):
    params = ['aromaticity', 'helix', 'turn', 'sheet', 'gravy', 'net_charge_at_pH7point4']

    prop = []
    for seq in df.Sequence:
        X = ProteinAnalysis(seq)
        aromaticity = X.aromaticity()
        sec_struc = X.secondary_structure_fraction()
        helix = sec_struc[0]
        turn = sec_struc[1]
        sheet = sec_struc[2]
        gravy = X.gravy() # hydrophobicity related
        net_charge_at_pH7point4 = X.charge_at_pH(7.4)

        prop.append([aromaticity, helix, turn, sheet, gravy, net_charge_at_pH7point4])
    return pd.DataFrame(prop, columns=params)

In [16]:
aa_freq = reduce_by_kmer_frequency(df_cov)

In [17]:
uniprot_embedding = biovec.models.load_protvec("../data/embeddings/uniprot__kmer_3_contextWindow_10_vector_100_reduction_None")

avg_protvec = convert_sequences_to_avg_vectors(df_cov['Sequence'], uniprot_embedding, kmer=3)
avg_protvec = avg_protvec.reset_index(drop=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
Creating vectors: 100%|██████████| 27/27 [00:00<00:00, 847.85sequence/s]


In [18]:
physicochemical_prop = get_physicochemical_properties(df_cov)

In [19]:
X = pd.concat([aa_freq, avg_protvec, physicochemical_prop[['helix','turn','sheet']]], axis=1)

In [20]:
y = df_cov[['pIC50', 'MIC']]

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
y_train_pmic, y_train_mic = y_train['pIC50'], y_train['MIC']
y_test_pmic, y_test_mic = y_test['pIC50'], y_test['MIC']

In [23]:
from scipy.stats import pearsonr

In [24]:
from sklearn.model_selection import cross_val_score, GridSearchCV, LeaveOneOut

In [51]:
def multi_objective_score(y_true, y_pred):
    mape = mean_absolute_percentage_error(y_true, y_pred)
    pcc = pearson_score(y_true, y_pred)
    std_diff = abs(np.std(y_pred) - np.std(y_true))
    return mape + 10*std_diff
multi_objective_scorer = make_scorer(multi_objective_score, greater_is_better=False)

In [52]:
param_grid = {
    'C':[0.001,0.01,0.1,1,10,100,1000],
    'kernel':['rbf','poly','sigmoid','linear'],
    'degree':[1,2,3,4,5,6],
    'gamma': np.arange(0.1,1.0,0.1).round(1).tolist() + np.arange(1,11,1).round().tolist()
}
svr = SVR()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = svr, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring=multi_objective_scorer)

In [53]:
grid_search.fit(X_train, y_train_pmic)

Fitting 5 folds for each of 3192 candidates, totalling 15960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 360 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 3560 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 8088 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done 13928 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done 15960 out of 15960 | elapsed:   25.4s finished


GridSearchCV(cv=5, estimator=SVR(), n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'degree': [1, 2, 3, 4, 5, 6],
                         'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
                                   1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'kernel': ['rbf', 'poly', 'sigmoid', 'linear']},
             scoring=make_scorer(multi_objective_score, greater_is_better=False),
             verbose=2)

In [54]:
grid_search.best_params_

{'C': 0.01, 'degree': 6, 'gamma': 7, 'kernel': 'poly'}

In [55]:
best_grid = grid_search.best_estimator_

In [56]:
y_pred_pmic = best_grid.predict(X_test)

In [57]:
np.std(y_test_pmic), np.std(y_pred_pmic)

(2.4037940553898784, 1.8046209785406346)

In [58]:
mean_absolute_percentage_error(y_test_pmic, y_pred_pmic)

19.267728370660528

In [59]:
y_pred_mic = np.exp(-y_pred_pmic)/1e-6

In [60]:
ape_mic = 100*np.abs(y_test_mic-y_pred_mic)/y_test_mic

In [61]:
mean_absolute_percentage_error(y_test_mic, y_pred_mic)

7037.535999807114

In [62]:
pearson_score(y_test_mic, y_pred_mic)

-0.2995853822894446

In [63]:
list(zip(y_test_mic.round(4), y_pred_mic.round(4), ape_mic))

[(61.704, 37.2877, 39.5700747241679),
 (11.0, 3727.3695, 33785.17749709676),
 (60.018, 59.7536, 0.4405546380141495),
 (0.14, 11.5871, 8176.515085019573),
 (278.801, 42.7807, 84.65545573439569),
 (33.889, 80.9464, 138.85733162978485)]

In [65]:
loo = LeaveOneOut()

from tqdm import tqdm

result_df = pd.DataFrame(columns = list(df_cov.columns)+["y_pred_pmic", "y_pred_mic", "ape_pmic", "ape_mic"])
for train_index, test_index in tqdm(loo.split(X)):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    y_train_pmic, y_train_mic = y_train['pIC50'], y_train['MIC']
    y_test_pmic, y_test_mic = y_test['pIC50'], y_test['MIC']
    svr = SVR(**grid_search.best_params_)
    _ = svr.fit(X_train, y_train_pmic)
    y_pred_pmic = svr.predict(X_test)
    y_pred_mic = np.exp(-y_pred_pmic)/1e-6
    ape = 100*np.abs(y_test_pmic-y_pred_pmic)/y_test_pmic
    ape_mic = 100*np.abs(y_test_mic-y_pred_mic)/y_test_mic
    df_val = df_cov.iloc[test_index,:].values[0].tolist()
    res = np.append(df_val, [y_pred_pmic[0], y_pred_mic[0], ape.values[0], ape_mic.values[0]])
    res = pd.DataFrame([res], columns = list(df_cov.columns)+["y_pred_pmic", "y_pred_mic", "ape_pmic", "ape_mic"])
    result_df = result_df.append(res)
result_df = result_df[["Sequence", "pIC50", "y_pred_pmic", "ape_pmic", "MIC", "y_pred_mic", "ape_mic"]]

27it [00:00, 97.77it/s]


In [66]:
result_df#.to_csv("../results/SVM_HIV_CoV_pMIC_to_MIC_rbf_c_100_gamma_2.csv", index=False)

Unnamed: 0,Sequence,pIC50,y_pred_pmic,ape_pmic,MIC,y_pred_mic,ape_mic
0,SNNTIAIPTNFSISITTEVM,8.185012292283764,18.031200802853167,120.29534176572558,278.801,0.0147621287657289,99.99470513779876
0,FKLPLGINITNFRAILTAFS,8.230383999814125,14.049197044383916,70.69916840698082,266.434,0.7916099574676875,99.70288703488757
0,VLYNSTFFSTFKCYGVSATK,8.244623487739776,10.739607505259,30.26195218289115,262.66700000000003,21.669441726383717,91.750223009977
0,GIGVTQNVLYENQKQIANQF,8.245110915539666,16.13141353479719,95.64822959984818,262.539,0.0986770406793368,99.96241433056448
0,PALNCYWPLNDYGFYTTSGI,8.26127272058809,10.180435937398643,23.230842047228,258.33,37.90468126036944,85.32703082864187
0,FGGASCCLYCRCHIDHPNPKGFCDLKGKY,8.740336742730447,9.36997115614267,7.203777519623661,160.0,85.24584720349382,46.72134549781636
0,GGASCCLYCRCH,8.740336742730447,8.702004570026757,0.4385663142277883,160.0,166.25221253648093,3.9076328353005785
0,IEEINKKVEEIQKKIEELNKKAEELNKKLEELQKK,9.210340371976184,10.144376329278776,10.141166553893427,100.0,39.296451755732335,60.70354824426767
0,YQDVNCTDVSTAIHADQLTP,9.693161799332575,10.2207958633262,5.443363836451716,61.70399999999999,36.40531187537683,41.00007799271225
0,PTTFMLKYDENGTITDAVDC,9.720866040733176,9.644710143659855,0.7834270810255564,60.018,64.76727201691855,7.913079437699612


In [67]:
result_df['ape_pmic'].astype('float').mean()

24.690386488539648

In [68]:
result_df['ape_mic'].astype('float').mean()

4087.5629729079283

In [69]:
pearsonr(result_df['MIC'].astype('float'), result_df['y_pred_mic'].astype('float'))

(-0.16301796745335015, 0.4165390678181603)