In [1]:
cd ../../src

/Users/in-divye.singh/Documents/Projects/MIC_predictor/src


In [2]:
import biovec
import numpy as np
import pandas as pd
from itertools import chain, combinations
from collections import Counter

from utils import *

from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [3]:
import seaborn as sns

In [4]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

In [5]:
avp_ic50 = pd.read_csv("../data/raw/AVP-IC50Pred_train.csv")
ha_avp = pd.read_csv("../data/raw/HA_AVP.csv")

In [6]:
df = pd.concat([avp_ic50[['Sequence','MIC']], ha_avp], axis=0).drop_duplicates(['Sequence']).reset_index(drop=True)
df = sequence_filtering(df)

In [7]:
df['pIC50'] = df['MIC'].apply(lambda x: -np.log(x*1e-6))

In [8]:
family = pd.read_csv("../data/raw/712pep_family.csv")

In [9]:
family

Unnamed: 0,Sequence,Abbreviation,Family
0,KWKVFKKIEKMGRNIRNGIVKAGPAIAVLGEAKAL,JV,Arenaviridae
1,KVLTTGLPALISWIKRKRQQ,JV,Arenaviridae
2,HRILARIRQMMT,PRRSV,Arteriviridae
3,LMRIRQMMT,PRRSV,Arteriviridae
4,HRILMRARQMMT,PRRSV,Arteriviridae
...,...,...,...
702,YTSLIHSLIEEGQNQQEKNEQELLELDKWASLWNWF,HIV,Retroviridae
703,YTSLIQSLIEESQNQQEKNEQQLLELDKWASLWNWF,HIV,Retroviridae
704,KKKKFVKKVAKKVKKVAKKVAKVAVAV,FIV,Retroviridae
705,KRKRFAKKFLRFLRKVIRFLKRFIRRF,FIV,Retroviridae


In [10]:
hiv_cov_seq = family[(family['Abbreviation'] == "HIV")|(family['Abbreviation'] == "SARS-CoV")]

In [11]:
df_hiv_cov = df.merge(hiv_cov_seq,how='right',on='Sequence').reset_index(drop=True)

In [12]:
df_hiv_cov

Unnamed: 0,Sequence,MIC,pIC50,Abbreviation,Family
0,ACWAAGIKQEF,333.000,8.007368,HIV,Retroviridae
1,ACWGAGIKQEF,333.000,8.007368,HIV,Retroviridae
2,ACWWAGIKAEF,333.000,8.007368,HIV,Retroviridae
3,ACWWAGIKQAF,333.000,8.007368,HIV,Retroviridae
4,ACWWAGIRQEF,333.000,8.007368,HIV,Retroviridae
...,...,...,...,...,...
287,YTSLIHSLIEEGQNQQEKNEQELLELDKWASLWNWF,0.001,20.723266,HIV,Retroviridae
288,YTSLIQSLIEESQNQQEKNEQQLLELDKWASLWNWF,0.001,20.723266,HIV,Retroviridae
289,QKQIANQFNKAISQIQESLTTTSTALGKLQDVVNQNAQALNTLVKQ,3.970,12.436744,SARS-CoV,Coronaviridae
290,QNQSANQFQKEISQINEVLTTTNTSLGKLQDDVNQNNQSLNTLQKE,5.070,12.192170,SARS-CoV,Coronaviridae


In [13]:
def get_physicochemical_properties(df):
    params = ['aromaticity', 'helix', 'turn', 'sheet', 'gravy', 'net_charge_at_pH7point4']

    prop = []
    for seq in df.Sequence:
        X = ProteinAnalysis(seq)
        aromaticity = X.aromaticity()
        sec_struc = X.secondary_structure_fraction()
        helix = sec_struc[0]
        turn = sec_struc[1]
        sheet = sec_struc[2]
        gravy = X.gravy() # hydrophobicity related
        net_charge_at_pH7point4 = X.charge_at_pH(7.4)

        prop.append([aromaticity, helix, turn, sheet, gravy, net_charge_at_pH7point4])
    return pd.DataFrame(prop, columns=params)

In [18]:
dipep_freq = reduce_by_kmer_frequency(df_hiv_cov, kmer=2)

In [15]:
uniprot_embedding = biovec.models.load_protvec("../data/embeddings/uniprot__kmer_3_contextWindow_10_vector_100_reduction_None")

avg_protvec = convert_sequences_to_avg_vectors(df_hiv_cov['Sequence'], uniprot_embedding, kmer=3)
avg_protvec = avg_protvec.reset_index(drop=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
Creating vectors: 100%|██████████| 292/292 [00:00<00:00, 1141.79sequence/s]


In [16]:
physicochemical_prop = get_physicochemical_properties(df_hiv_cov)

In [19]:
X = pd.concat([dipep_freq, physicochemical_prop[['turn']]], axis=1)

In [20]:
y = df_hiv_cov[['pIC50', 'MIC']]

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
y_train_pmic, y_train_mic = y_train['pIC50'], y_train['MIC']
y_test_pmic, y_test_mic = y_test['pIC50'], y_test['MIC']

In [23]:
X_train

Unnamed: 0,AA,AC,AD,AE,AF,AG,AH,AI,AK,AL,...,YN,YP,YQ,YR,YS,YT,YV,YW,YY,turn
210,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.166667
229,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.037037,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.107143
226,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.028571,0.0,0.0,0.0,0.222222
73,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.157895
25,0.090909,0.0,0.0,0.0,0.090909,0.090909,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.083333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.034483,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.266667
71,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.066667
106,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.083333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.153846
270,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.028571,0.0,0.0,0.0,0.194444


In [24]:
from sklearn.model_selection import cross_val_score, GridSearchCV, LeaveOneOut

In [25]:
def multi_objective_score(y_true, y_pred):
    mape = mean_absolute_percentage_error(y_true, y_pred)
    std_diff = abs(np.std(y_pred) - np.std(y_true))
    return mape + 10*std_diff
multi_objective_scorer = make_scorer(multi_objective_score, greater_is_better=False)

In [26]:
param_grid = {
    'C':[0.001,0.01,0.1,1,10,100,1000],
    'kernel':['rbf','poly','sigmoid','linear'],
    'degree':[1,2,3,4,5,6],
    'gamma': np.arange(0.1,1.0,0.1).round(1).tolist() + np.arange(1,11,1).round().tolist()
}
svr = SVR()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = svr, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring=multi_objective_scorer)

In [41]:
grid_search.fit(X_train, y_train_pmic)

Fitting 5 folds for each of 3192 candidates, totalling 15960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 252 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 658 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 1224 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 1954 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 2844 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done 3898 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done 5112 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done 6490 tasks      | elapsed:   35.7s
[Parallel(n_jobs=-1)]: Done 8028 tasks      | elapsed:   44.3s
[Parallel(n_jobs=-1)]: Done 9730 tasks      | elapsed:   53.8s
[Parallel(n_jobs=-1)]: Done 11592 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 13618 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 15804 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 15960 out of 

GridSearchCV(cv=5, estimator=SVR(), n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'degree': [1, 2, 3, 4, 5, 6],
                         'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
                                   1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'kernel': ['rbf', 'poly', 'sigmoid', 'linear']},
             scoring=make_scorer(multi_objective_score, greater_is_better=False),
             verbose=2)

In [42]:
grid_search.best_params_

{'C': 100, 'degree': 1, 'gamma': 5, 'kernel': 'rbf'}

In [43]:
best_grid = grid_search.best_estimator_

In [44]:
y_pred_pmic = best_grid.predict(X_test)

In [45]:
y_pred_mic = np.exp(-y_pred_pmic)/1e-6

In [46]:
mean_absolute_percentage_error(y_test_pmic, y_pred_pmic)

12.318743193755372

In [47]:
np.std(y_test_pmic), np.std(y_pred_pmic)

(3.8969081681731246, 3.0770360209575327)

In [48]:
mean_absolute_percentage_error(y_test_mic, y_pred_mic)

269.9393082281367

In [49]:
mean_squared_error(y_test_mic,y_pred_mic)

6855.501407815547

In [50]:
svr = SVR(C=100, gamma=5)
cv_score = cross_val_score(svr, X_train, y_train_pmic, cv=5, scoring=mape_scorer)
cv_mean = cv_score.mean()
print(cv_score, cv_mean)

_ = svr.fit(X_train, y_train_pmic)
y_pred = svr.predict(X_test)

print(np.std(y_test_pmic), np.std(y_pred))

[-15.39267762 -14.23312201 -15.81252064 -14.65631394 -14.30950802] -14.880828446484319
3.8969081681731246 3.0770360209575327


In [51]:
ape = 100*np.abs(y_test_pmic-y_pred)/y_test_pmic

In [52]:
y_pred_mic = np.exp(-y_pred)/1e-6

In [53]:
ape_mic = 100*np.abs(y_test_mic-y_pred_mic)/y_test_mic

In [54]:
ape_mic.mean()

269.9393082281366

In [55]:
list(zip(y_test_pmic.round(4), y_pred.round(4), ape, y_test_mic.round(4), y_pred_mic.round(4), ape_mic))

[(10.2265, 12.3335, 20.603760216282204, 36.2, 4.4019, 87.84018511882351),
 (18.526, 16.5767, 10.521923230905161, 0.009, 0.0632, 602.3739995502144),
 (9.2103, 10.0633, 9.26039189367937, 100.0, 42.6171, 57.38285774009048),
 (13.6332, 12.7923, 6.16787902658886, 1.2, 2.7821, 131.84030428420044),
 (12.0577, 15.0391, 24.72632909576598, 5.8, 0.2942, 94.92789807327807),
 (11.4176, 10.052, 11.960641656704007, 11.0, 43.0997, 291.8151748949122),
 (16.8731, 15.7403, 6.713942155412845, 0.047, 0.1459, 210.4496045817558),
 (11.3306, 12.0506, 6.354727571426755, 12.0, 5.8409, 51.326186479171916),
 (8.0629, 9.2775, 15.063373210620535, 315.0, 93.5057, 70.31565639444001),
 (10.2862, 13.0396, 26.767435833070326, 34.1, 2.1726, 93.628628993524),
 (12.2061, 10.186, 16.549539653466372, 5.0, 37.6935, 653.8693058678326),
 (19.337, 17.9664, 7.087599481458592, 0.004, 0.0157, 293.7425526430536),
 (8.9029, 10.2451, 15.076362721356396, 136.0, 35.5318, 73.87367642174526),
 (20.0301, 18.6019, 7.130219332154487, 0.002, 

### Leave one out CV

In [56]:
loo = LeaveOneOut()

In [57]:
from tqdm import tqdm

In [58]:
result_df = pd.DataFrame(columns = list(df_hiv_cov.columns)+["y_pred_pmic", "y_pred_mic", "ape_pmic", "ape_mic"])
for train_index, test_index in tqdm(loo.split(X)):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    y_train_pmic, y_train_mic = y_train['pIC50'], y_train['MIC']
    y_test_pmic, y_test_mic = y_test['pIC50'], y_test['MIC']
    svr = SVR(C=100, gamma=2)
    _ = svr.fit(X_train, y_train_pmic)
    y_pred_pmic = svr.predict(X_test)
    y_pred_mic = np.exp(-y_pred_pmic)/1e-6
    ape = 100*np.abs(y_test_pmic-y_pred_pmic)/y_test_pmic
    ape_mic = 100*np.abs(y_test_mic-y_pred_mic)/y_test_mic
    df_val = df_hiv_cov.iloc[test_index,:].values[0].tolist()
    res = np.append(df_val, [y_pred_pmic[0], y_pred_mic[0], ape.values[0], ape_mic.values[0]])
    res = pd.DataFrame([res], columns = list(df_hiv_cov.columns)+["y_pred_pmic", "y_pred_mic", "ape_pmic", "ape_mic"])
    result_df = result_df.append(res)
result_df = result_df[["Sequence", "pIC50", "y_pred_pmic", "ape_pmic", "MIC", "y_pred_mic", "ape_mic"]]

292it [00:15, 19.16it/s]


In [64]:
result_df#.to_csv("../results/SVM_HIV_CoV_pMIC_to_MIC_rbf_c_100_gamma_5_dipep_turn.csv", index=False)

In [60]:
result_df['ape_pmic'].astype('float').mean()

14.6938786920488

In [61]:
result_df['ape_mic'].astype('float').mean()

23360.587338393747