In [1]:
cd ../../../cd_hit_py/

/Users/in-divye.singh/Documents/Projects/cd_hit_py


In [2]:
from cd_hit import CD_HIT

In [3]:
cd ../MIC_predictor/src/

/Users/in-divye.singh/Documents/Projects/MIC_predictor/src


In [4]:
import biovec
import numpy as np
import pandas as pd
from itertools import chain, combinations
from collections import Counter

from utils import *

from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [5]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

In [6]:
def pearson_score(y_true, y_pred):
    pcc = pearsonr(y_true, y_pred)
    return pcc[0]

pcc_scorer = make_scorer(pearson_score)

In [7]:
avp_ic50 = pd.read_csv("../data/raw/AVP-IC50Pred_train.csv")
ha_avp = pd.read_csv("../data/raw/HA_AVP.csv")

In [8]:
df = pd.concat([avp_ic50[['Sequence','MIC']], ha_avp], axis=0).drop_duplicates(['Sequence']).reset_index(drop=True)
df = sequence_filtering(df)

In [9]:
df['pIC50'] = df['MIC'].apply(lambda x: -np.log(x*1e-6))

In [10]:
family = pd.read_csv("../data/raw/712pep_family.csv")

In [11]:
family[family['Family']=='Orthomyxoviridae']['Abbreviation'].unique()

array(['INFV A', 'AIV', 'INFV B'], dtype=object)

In [43]:
infv_cov_seq = family[(family['Abbreviation'] == 'SARS-CoV') | (family['Family'] == 'Orthomyxoviridae')] #(family['Abbreviation'] == 'SARS-CoV') |
infv_cov_seq = infv_cov_seq.reset_index(drop=True)

In [44]:
infv_cov_seq

Unnamed: 0,Sequence,Abbreviation,Family
0,SNNTIAIPTNFSISITTEVM,SARS-CoV,Coronaviridae
1,FKLPLGINITNFRAILTAFS,SARS-CoV,Coronaviridae
2,VLYNSTFFSTFKCYGVSATK,SARS-CoV,Coronaviridae
3,GIGVTQNVLYENQKQIANQF,SARS-CoV,Coronaviridae
4,PALNCYWPLNDYGFYTTSGI,SARS-CoV,Coronaviridae
...,...,...,...
62,MDVNPWLLFLKVPAQ,INFV A,Orthomyxoviridae
63,WLVFFVIAYFAR,INFV A,Orthomyxoviridae
64,WLVFFVIFYFFRRRKK,INFV A,Orthomyxoviridae
65,RRKKWLVFFVIFYFFR,INFV A,Orthomyxoviridae


In [45]:
df_infv_cov = df.merge(infv_cov_seq,how='right',on='Sequence').reset_index(drop=True)

In [46]:
df_infv_cov#.to_csv("../data/raw/infv_cov_data.csv", index=False)

Unnamed: 0,Sequence,MIC,pIC50,Abbreviation,Family
0,SNNTIAIPTNFSISITTEVM,278.801000,8.185012,SARS-CoV,Coronaviridae
1,FKLPLGINITNFRAILTAFS,266.434000,8.230384,SARS-CoV,Coronaviridae
2,VLYNSTFFSTFKCYGVSATK,262.667000,8.244623,SARS-CoV,Coronaviridae
3,GIGVTQNVLYENQKQIANQF,262.539000,8.245111,SARS-CoV,Coronaviridae
4,PALNCYWPLNDYGFYTTSGI,258.330000,8.261273,SARS-CoV,Coronaviridae
...,...,...,...,...,...
62,RRKKWLVFFVIFYFFR,0.000040,23.942142,INFV A,Orthomyxoviridae
63,AGDDQGLDKCVPNSKEK,0.000004,26.322688,INFV A,Orthomyxoviridae
64,QKQIANQFNKAISQIQESLTTTSTALGKLQDVVNQNAQALNTLVKQ,3.970000,12.436744,SARS-CoV,Coronaviridae
65,QNQSANQFQKEISQINEVLTTTNTSLGKLQDDVNQNNQSLNTLQKE,5.070000,12.192170,SARS-CoV,Coronaviridae


In [47]:
def get_physicochemical_properties(df):
    params = ['aromaticity', 'helix', 'turn', 'sheet', 'gravy', 'net_charge_at_pH7point4']

    prop = []
    for seq in df.Sequence:
        X = ProteinAnalysis(seq)
        aromaticity = X.aromaticity()
        sec_struc = X.secondary_structure_fraction()
        helix = sec_struc[0]
        turn = sec_struc[1]
        sheet = sec_struc[2]
        gravy = X.gravy() # hydrophobicity related
        net_charge_at_pH7point4 = X.charge_at_pH(7.4)

        prop.append([aromaticity, helix, turn, sheet, gravy, net_charge_at_pH7point4])
    return pd.DataFrame(prop, columns=params)

In [48]:
aa_freq = reduce_by_kmer_frequency(df_infv_cov)

In [49]:
uniprot_embedding = biovec.models.load_protvec("../data/embeddings/uniprot__kmer_3_contextWindow_10_vector_100_reduction_None")

avg_protvec = convert_sequences_to_avg_vectors(df_infv_cov['Sequence'], uniprot_embedding, kmer=3)
avg_protvec = avg_protvec.reset_index(drop=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
Creating vectors: 100%|██████████| 67/67 [00:00<00:00, 988.54sequence/s]


In [50]:
physicochemical_prop = get_physicochemical_properties(df_infv_cov)

In [51]:
X = pd.concat([aa_freq, avg_protvec, physicochemical_prop[['helix','turn','sheet']]], axis=1)

In [52]:
y = df_infv_cov[['pIC50', 'MIC']]

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [54]:
y_train_pmic, y_train_mic = y_train['pIC50'], y_train['MIC']
y_test_pmic, y_test_mic = y_test['pIC50'], y_test['MIC']

In [55]:
from scipy.stats import pearsonr

In [56]:
from sklearn.model_selection import cross_val_score, GridSearchCV, LeaveOneOut

In [57]:
def multi_objective_score(y_true, y_pred):
    mape = mean_absolute_percentage_error(y_true, y_pred)
    pcc = pearson_score(y_true, y_pred)
    return mape - 10*pcc
multi_objective_scorer = make_scorer(multi_objective_score, greater_is_better=False)

In [58]:
max_n_neighbors = int(np.sqrt(X_train.shape[0]))
param_grid = {
    'n_neighbors': range(1, max_n_neighbors),
    'weights': ['uniform', 'distance'],
    'metric': ["euclidean", "manhattan", "chebyshev"]
}
knn = KNeighborsRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = knn, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring=mape_scorer)

In [59]:
grid_search.fit(X_train, y_train_pmic)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    7.1s finished


GridSearchCV(cv=5, estimator=KNeighborsRegressor(), n_jobs=-1,
             param_grid={'metric': ['euclidean', 'manhattan', 'chebyshev'],
                         'n_neighbors': range(1, 7),
                         'weights': ['uniform', 'distance']},
             scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False),
             verbose=2)

In [60]:
grid_search.best_params_

{'metric': 'chebyshev', 'n_neighbors': 6, 'weights': 'uniform'}

In [61]:
best_grid = grid_search.best_estimator_

In [62]:
y_pred_pmic = best_grid.predict(X_test)

In [63]:
mean_absolute_percentage_error(y_test_pmic, y_pred_pmic)

23.311526912917397

In [64]:
y_pred_mic = np.exp(-y_pred_pmic)/1e-6

In [65]:
ape_mic = 100*np.abs(y_test_mic-y_pred_mic)/y_test_mic

In [66]:
mean_absolute_percentage_error(y_test_mic, y_pred_mic)

18432.05211276027

In [67]:
pearson_score(y_test_mic, y_pred_mic)

0.17508973148882

In [68]:
list(zip(y_test_mic.round(4), y_pred_mic.round(4), ape_mic))

[(0.899, 0.1492, 83.40214577763336),
 (41.6, 3.3878, 91.85632326529041),
 (258.33, 5.126, 98.0157218342734),
 (66.0, 11.6121, 82.40592058067952),
 (0.138, 0.0806, 41.60159177919896),
 (0.62, 0.2794, 54.94321171380423),
 (0.0, 0.0765, 254851.86075615647),
 (160.0, 5.3688, 96.64447197245744),
 (3.97, 7.5471, 90.1029598094794),
 (53.0, 20.283, 61.730168473226804),
 (3.0, 0.1905, 93.6509751985317),
 (0.003, 0.0692, 2205.1911357936688),
 (3.0, 0.0564, 98.11871156001449),
 (278.801, 2.2151, 99.20548472898899)]

In [69]:
loo = LeaveOneOut()

from tqdm import tqdm

result_df = pd.DataFrame(columns = list(df_infv_cov.columns)+["y_pred_pmic", "y_pred_mic", "ape_pmic", "ape_mic"])
for train_index, test_index in tqdm(loo.split(X)):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    y_train_pmic, y_train_mic = y_train['pIC50'], y_train['MIC']
    y_test_pmic, y_test_mic = y_test['pIC50'], y_test['MIC']
    knn = KNeighborsRegressor(**grid_search.best_params_)
    _ = knn.fit(X_train, y_train_pmic)
    y_pred_pmic = knn.predict(X_test)
    y_pred_mic = np.exp(-y_pred_pmic)/1e-6
    ape = 100*np.abs(y_test_pmic-y_pred_pmic)/y_test_pmic
    ape_mic = 100*np.abs(y_test_mic-y_pred_mic)/y_test_mic
    df_val = df_infv_cov.iloc[test_index,:].values[0].tolist()
    res = np.append(df_val, [y_pred_pmic[0], y_pred_mic[0], ape.values[0], ape_mic.values[0]])
    res = pd.DataFrame([res], columns = list(df_infv_cov.columns)+["y_pred_pmic", "y_pred_mic", "ape_pmic", "ape_mic"])
    result_df = result_df.append(res)
result_df = result_df[["Sequence", "pIC50", "y_pred_pmic", "ape_pmic", "MIC", "y_pred_mic", "ape_mic"]]

67it [00:00, 104.43it/s]


In [75]:
result_df#.to_csv("../results/kNN_Orthomyxoviridae_CoV_pMIC_to_MIC_dist_chebyshev_k_6_weight_uniform.csv", index=False)

Unnamed: 0,Sequence,pIC50,y_pred_pmic,ape_pmic,MIC,y_pred_mic,ape_mic
0,SNNTIAIPTNFSISITTEVM,8.185012292283764,11.973209473436825,46.282119633764005,278.801,6.31104381064293,97.73636256303135
0,FKLPLGINITNFRAILTAFS,8.230383999814125,15.72024037152923,91.00251424337256,266.434,0.14886285791397294,99.94412767968278
0,VLYNSTFFSTFKCYGVSATK,8.244623487739776,11.8554738847152,43.79642566267537,262.66700000000003,7.099587425320445,97.29711481635667
0,GIGVTQNVLYENQKQIANQF,8.245110915539666,13.251904118233632,60.72438871935124,262.539,1.7569975938352702,99.33076701220189
0,PALNCYWPLNDYGFYTTSGI,8.26127272058809,11.742939723275429,42.14443851987362,258.33,7.945222738815014,96.92439022226803
...,...,...,...,...,...,...,...
0,RRKKWLVFFVIFYFFR,23.942141661814613,17.69504893268824,26.092455793501035,4e-05,0.020660360325249687,51550.900813124215
0,AGDDQGLDKCVPNSKEK,26.32268829627837,10.825071778968073,58.87550824169211,3.7000000000000006e-06,19.894409825912625,537686652.0516925
0,QKQIANQFNKAISQIQESLTTTSTALGKLQDVVNQNAQALNTLVKQ,12.436744463265175,11.794348830130067,5.165303790172524,3.97,7.547087504436332,90.1029598094794
0,QNQSANQFQKEISQINEVLTTTNTSLGKLQDDVNQNNQSLNTLQKE,12.192169740361182,12.090104902436115,0.8371343255432976,5.07,5.6147982417263576,10.74552745022401


In [71]:
result_df['ape_pmic'].astype('float').mean()

21.565288159246506

In [72]:
result_df['ape_mic'].astype('float').mean()

8028163.216787797

In [73]:
pearsonr(result_df['MIC'].astype('float'), result_df['y_pred_mic'].astype('float'))

(0.1106813196127264, 0.37257513555719807)