In [1]:
cd ../../../cd_hit_py/

/Users/in-divye.singh/Documents/Projects/cd_hit_py


In [2]:
from cd_hit import CD_HIT

In [3]:
cd ../MIC_predictor/src/

/Users/in-divye.singh/Documents/Projects/MIC_predictor/src


In [4]:
import biovec
import numpy as np
import pandas as pd
from itertools import chain, combinations
from collections import Counter

from utils import *

from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [5]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

In [6]:
def pearson_score(y_true, y_pred):
    pcc = pearsonr(y_true, y_pred)
    return pcc[0]

pcc_scorer = make_scorer(pearson_score)

In [7]:
avp_ic50 = pd.read_csv("../data/raw/AVP-IC50Pred_train.csv")
ha_avp = pd.read_csv("../data/raw/HA_AVP.csv")

In [8]:
df = pd.concat([avp_ic50[['Sequence','MIC']], ha_avp], axis=0).drop_duplicates(['Sequence']).reset_index(drop=True)
df = sequence_filtering(df)

In [9]:
df['pIC50'] = df['MIC'].apply(lambda x: -np.log(x*1e-6))

In [10]:
family = pd.read_csv("../data/raw/712pep_family.csv")

In [11]:
hiv_cov_seq = family[(family['Abbreviation'] == "HIV")].reset_index(drop=True)

In [12]:
hiv_cov_seq

Unnamed: 0,Sequence,Abbreviation,Family
0,ACWAAGIKQEF,HIV,Retroviridae
1,ACWGAGIKQEF,HIV,Retroviridae
2,ACWWAGIKAEF,HIV,Retroviridae
3,ACWWAGIKQAF,HIV,Retroviridae
4,ACWWAGIRQEF,HIV,Retroviridae
...,...,...,...
260,NMTWMEWDREINNYTSLIHSLIEESQNQQEKNEQEL,HIV,Retroviridae
261,QIWNNMTWMEWDREINNYTSLIHSLIEESQNQQEKN,HIV,Retroviridae
262,SLIHSLIEESQNQQEKNEQELLELDKWASLWNWFNI,HIV,Retroviridae
263,YTSLIHSLIEEGQNQQEKNEQELLELDKWASLWNWF,HIV,Retroviridae


In [13]:
header = ["seq_"+str(i) for i in range(len(hiv_cov_seq['Sequence']))]

In [14]:
seq = hiv_cov_seq['Sequence'].to_list()

In [19]:
cdhit = CD_HIT()

In [21]:
header_filtered, seq_filterted = cdhit.from_list(seq_lst = seq, header_lst=header,threshold=0.9)

In [22]:
filtered_seq = pd.DataFrame(seq_filterted, columns=['Sequence'])

In [24]:
df_hiv_cdhit_filtered = df.merge(filtered_seq,how='right',on='Sequence').reset_index(drop=True)

In [135]:
df_hiv_cdhit_filtered#.to_csv("../data/raw/hiv_cdhit_filtered.csv", index=False)

Unnamed: 0,Sequence,MIC,pIC50
0,ACWAAGIKQEF,333.000,8.007368
1,ACWGAGIKQEF,333.000,8.007368
2,ACWWAGIKAEF,333.000,8.007368
3,ACWWAGIKQAF,333.000,8.007368
4,ACWWAGIRQEF,333.000,8.007368
...,...,...,...
148,VWGIKQLQARILAVERYLKDQQLLGIWG,0.020,17.727534
149,MTWMEWDREINNYTSLIHSLIEESQNQQEKNEQELLEL,0.008,18.643824
150,MTWEAWDRAIAEYAARIEALIRAAQEQQEKNEAALREL,0.007,18.777356
151,TTWEEWDREINEYTSRIESLIRESQEQQEKNEQELREL,0.005,19.113828


In [26]:
def get_physicochemical_properties(df):
    params = ['aromaticity', 'helix', 'turn', 'sheet', 'gravy', 'net_charge_at_pH7point4']

    prop = []
    for seq in df.Sequence:
        X = ProteinAnalysis(seq)
        aromaticity = X.aromaticity()
        sec_struc = X.secondary_structure_fraction()
        helix = sec_struc[0]
        turn = sec_struc[1]
        sheet = sec_struc[2]
        gravy = X.gravy() # hydrophobicity related
        net_charge_at_pH7point4 = X.charge_at_pH(7.4)

        prop.append([aromaticity, helix, turn, sheet, gravy, net_charge_at_pH7point4])
    return pd.DataFrame(prop, columns=params)

In [27]:
aa_freq = reduce_by_kmer_frequency(df_hiv_cdhit_filtered)

In [28]:
uniprot_embedding = biovec.models.load_protvec("../data/embeddings/uniprot__kmer_3_contextWindow_10_vector_100_reduction_None")

avg_protvec = convert_sequences_to_avg_vectors(df_hiv_cdhit_filtered['Sequence'], uniprot_embedding, kmer=3)
avg_protvec = avg_protvec.reset_index(drop=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
Creating vectors: 100%|██████████| 153/153 [00:00<00:00, 1102.45sequence/s]


In [29]:
physicochemical_prop = get_physicochemical_properties(df_hiv_cdhit_filtered)

In [30]:
X = pd.concat([aa_freq, avg_protvec, physicochemical_prop[['helix','turn','sheet']]], axis=1)

In [31]:
y = df_hiv_cdhit_filtered[['pIC50', 'MIC']]

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [76]:
y_train_pmic, y_train_mic = y_train['pIC50'], y_train['MIC']
y_test_pmic, y_test_mic = y_test['pIC50'], y_test['MIC']

In [34]:
from sklearn.model_selection import cross_val_score, GridSearchCV, LeaveOneOut

In [92]:
def multi_objective_score(y_true, y_pred):
    mape = mean_absolute_percentage_error(y_true, y_pred)
    pcc = pearson_score(y_true, y_pred)
    return mape - 10*pcc
multi_objective_scorer = make_scorer(multi_objective_score, greater_is_better=False)

In [117]:
max_n_neighbors = int(np.sqrt(X_train.shape[0]))
param_grid = {
    'n_neighbors': range(1, max_n_neighbors),
    'weights': ['uniform', 'distance'],
    'metric': ["euclidean", "manhattan", "chebyshev"]
}
knn = KNeighborsRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = knn, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring=pcc_scorer)

In [118]:
grid_search.fit(X_train, y_train_pmic)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    0.7s finished


GridSearchCV(cv=5, estimator=KNeighborsRegressor(), n_jobs=-1,
             param_grid={'metric': ['euclidean', 'manhattan', 'chebyshev'],
                         'n_neighbors': range(1, 11),
                         'weights': ['uniform', 'distance']},
             scoring=make_scorer(pearson_score), verbose=2)

In [119]:
grid_search.best_params_

{'metric': 'manhattan', 'n_neighbors': 4, 'weights': 'distance'}

In [120]:
best_grid = grid_search.best_estimator_

In [121]:
y_pred_pmic = best_grid.predict(X_test)

In [122]:
mean_absolute_percentage_error(y_test_pmic, y_pred_pmic)

17.88702629952539

In [123]:
y_pred_mic = np.exp(-y_pred_pmic)/1e-6

In [124]:
ape_mic = 100*np.abs(y_test_mic-y_pred_mic)/y_test_mic

In [125]:
mean_absolute_percentage_error(y_test_mic, y_pred_mic)

155.42735334294008

In [126]:
pearson_score(y_test_mic, y_pred_mic)

0.5172681969937348

In [127]:
list(zip(y_test_mic.round(4), y_pred_mic.round(4), ape_mic))

[(11.0, 27.6149, 151.04493632395685),
 (10.81, 13.8738, 28.34221340870418),
 (6.0, 11.1908, 86.51337633125804),
 (2.0, 4.0281, 101.40269465116609),
 (100.0, 30.3824, 69.61759438185618),
 (2.02, 9.9288, 391.5254192387505),
 (12.72, 37.3416, 193.5661266868698),
 (11.4, 27.0059, 136.8939338300924),
 (142.0, 0.3191, 99.7752913732036),
 (187.0, 7.2177, 96.14027879963967),
 (136.0, 23.0699, 83.03683915017656),
 (200.0, 30.6354, 84.68229610054931),
 (250.0, 72.9032, 70.83870036880778),
 (54.4, 9.291, 82.92101721605472),
 (40.0, 20.5593, 48.60173127665725),
 (23.46, 16.6051, 29.21936677519379),
 (3.05, 0.7162, 76.51772944432103),
 (0.253, 2.1382, 745.1384552268098),
 (29.5, 19.5278, 33.80413242093551),
 (6.9, 15.1982, 120.26440708748062),
 (16.0, 0.9295, 94.19064034750515),
 (1.0, 2.7851, 178.51043749981312),
 (100.0, 33.9372, 66.0627872902759),
 (28.6, 0.7654, 97.32365870733368),
 (35.2, 0.2041, 99.42004762121371),
 (100.0, 26.7225, 73.27754623579158),
 (110.0, 0.2111, 99.80806042213403),
 (0

In [128]:
loo = LeaveOneOut()

from tqdm import tqdm

result_df = pd.DataFrame(columns = list(df_hiv_cdhit_filtered.columns)+["y_pred_pmic", "y_pred_mic", "ape_pmic", "ape_mic"])
for train_index, test_index in tqdm(loo.split(X)):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    y_train_pmic, y_train_mic = y_train['pIC50'], y_train['MIC']
    y_test_pmic, y_test_mic = y_test['pIC50'], y_test['MIC']
    knn = KNeighborsRegressor(**grid_search.best_params_)
    _ = knn.fit(X_train, y_train_pmic)
    y_pred_pmic = knn.predict(X_test)
    y_pred_mic = np.exp(-y_pred_pmic)/1e-6
    ape = 100*np.abs(y_test_pmic-y_pred_pmic)/y_test_pmic
    ape_mic = 100*np.abs(y_test_mic-y_pred_mic)/y_test_mic
    df_val = df_hiv_cdhit_filtered.iloc[test_index,:].values[0].tolist()
    res = np.append(df_val, [y_pred_pmic[0], y_pred_mic[0], ape.values[0], ape_mic.values[0]])
    res = pd.DataFrame([res], columns = list(df_hiv_cdhit_filtered.columns)+["y_pred_pmic", "y_pred_mic", "ape_pmic", "ape_mic"])
    result_df = result_df.append(res)
result_df = result_df[["Sequence", "pIC50", "y_pred_pmic", "ape_pmic", "MIC", "y_pred_mic", "ape_mic"]]

153it [00:01, 110.82it/s]


In [129]:
result_df#.to_csv("../results/SVM_HIV_CoV_pMIC_to_MIC_rbf_c_100_gamma_2.csv", index=False)

Unnamed: 0,Sequence,pIC50,y_pred_pmic,ape_pmic,MIC,y_pred_mic,ape_mic
0,ACWAAGIKQEF,8.00736806798383,9.03066286788145,12.779415048861052,333.0,119.68313276752599,64.05911928903123
0,ACWGAGIKQEF,8.00736806798383,9.102282061228403,13.673831200821278,333.0,111.41127082375617,66.54316191478794
0,ACWWAGIKAEF,8.00736806798383,8.487746165213096,5.999200900355521,333.0,205.97697542291095,38.145052425552265
0,ACWWAGIKQAF,8.00736806798383,8.506803586328427,6.237199465596067,333.0,202.0887528873989,39.31268682060093
0,ACWWAGIRQEF,8.00736806798383,8.539554338888953,6.646207173028357,333.0,195.5774019699292,41.26804745647771
...,...,...,...,...,...,...,...
0,VWGIKQLQARILAVERYLKDQQLLGIWG,17.72753356339242,14.104157508915907,20.43925649058609,0.02,0.749276690134296,3646.3834506714797
0,MTWMEWDREINNYTSLIHSLIEESQNQQEKNEQELLEL,18.643824295266576,14.74484059651796,20.91300388267721,0.008,0.39481813471321686,4835.22668391521
0,MTWEAWDRAIAEYAARIEALIRAAQEQQEKNEAALREL,18.7773556878911,13.067580245358851,30.40777166624316,0.006999999999999999,2.1126230194343596,30080.328849062284
0,TTWEEWDREINEYTSRIESLIRESQEQQEKNEQELREL,19.11382792451231,15.236434774143634,20.285801283144117,0.005,0.24149072409314754,4729.814481862951


In [130]:
result_df['ape_pmic'].astype('float').mean()

16.33867497206926

In [131]:
result_df['ape_mic'].astype('float').mean()

1906.146553954179

In [132]:
from scipy.stats import pearsonr

In [133]:
pearsonr(result_df['MIC'].astype('float'), result_df['y_pred_mic'].astype('float'))

(0.5055121692719897, 2.654597001611252e-11)