In [1]:
cd ../../src/

/Users/in-divye.singh/Documents/Projects/MIC_predictor/src


In [31]:
import biovec
import numpy as np
import pandas as pd
from itertools import chain, combinations
from collections import Counter

from utils import *

from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [18]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

In [29]:
def pearson_score(y_true, y_pred):
    pcc = pearsonr(y_true, y_pred)
    return pcc[0]

pcc_scorer = make_scorer(pearson_score)

In [3]:
mic = pd.read_csv("../data/raw/MIC_pIC50_values.csv")

In [4]:
seq = pd.read_csv("../data/raw/peptide_target_seq.csv")

In [5]:
seq = seq.merge(mic, on='Sequence')

In [6]:
def get_physicochemical_properties(df):
    params = ['aromaticity', 'helix', 'turn', 'sheet', 'gravy', 'net_charge_at_pH7point4']

    prop = []
    for seq in df.Sequence:
        X = ProteinAnalysis(seq)
        aromaticity = X.aromaticity()
        sec_struc = X.secondary_structure_fraction()
        helix = sec_struc[0]
        turn = sec_struc[1]
        sheet = sec_struc[2]
        gravy = X.gravy() # hydrophobicity related
        net_charge_at_pH7point4 = X.charge_at_pH(7.4)

        prop.append([aromaticity, helix, turn, sheet, gravy, net_charge_at_pH7point4])
    return pd.DataFrame(prop, columns=params)

In [7]:
aa_freq = reduce_by_kmer_frequency(seq)

In [9]:
uniprot_embedding = biovec.models.load_protvec("../data/embeddings/uniprot__kmer_3_contextWindow_10_vector_100_reduction_None")

avg_protvec = convert_sequences_to_avg_vectors(seq['Sequence'], uniprot_embedding, kmer=3)
avg_protvec = avg_protvec.reset_index(drop=True)

Creating vectors: 100%|██████████| 50/50 [00:00<00:00, 848.55sequence/s]


In [10]:
physicochemical_prop = get_physicochemical_properties(seq)

In [11]:
X = pd.concat([aa_freq, avg_protvec, physicochemical_prop[['helix','turn','sheet']]], axis=1)

In [12]:
y = seq[['pIC50', 'MIC']]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
y_train_pmic, y_train_mic = y_train['pIC50'], y_train['MIC']
y_test_pmic, y_test_mic = y_test['pIC50'], y_test['MIC']

In [15]:
from sklearn.model_selection import cross_val_score, GridSearchCV, LeaveOneOut

In [16]:
from scipy.stats import pearsonr

In [19]:
max_n_neighbors = int(np.sqrt(X_train.shape[0]))
param_grid = {
    'n_neighbors': range(1, max_n_neighbors),
    'weights': ['uniform', 'distance'],
    'metric': ["euclidean", "manhattan", "chebyshev"]
}
knn = KNeighborsRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = knn, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring=mape_scorer)

In [20]:
grid_search.fit(X_train, y_train_pmic)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


GridSearchCV(cv=5, estimator=KNeighborsRegressor(), n_jobs=-1,
             param_grid={'metric': ['euclidean', 'manhattan', 'chebyshev'],
                         'n_neighbors': range(1, 6),
                         'weights': ['uniform', 'distance']},
             scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False),
             verbose=2)

In [21]:
grid_search.best_params_

{'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}

In [22]:
best_grid = grid_search.best_estimator_

In [23]:
y_pred_pmic = best_grid.predict(X_test)

In [24]:
mean_absolute_percentage_error(y_test_pmic, y_pred_pmic)

11.637828767961471

In [25]:
y_pred_mic = np.exp(-y_pred_pmic)/1e-6

In [26]:
ape_mic = 100*np.abs(y_test_mic-y_pred_mic)/y_test_mic

In [27]:
mean_absolute_percentage_error(y_test_mic, y_pred_mic)

305.55068353606896

In [30]:
pearson_score(y_test_mic, y_pred_mic)

0.8476934341810262

In [32]:
r2_score(y_test_mic, y_pred_mic)

0.6922341355793649

In [33]:
list(zip(y_test_mic.round(4), y_pred_mic.round(4), ape_mic))

[(23.8, 23.8295, 0.1238722518916188),
 (383.0, 280.7419, 26.699254733626383),
 (137.0, 6.9023, 94.96180727750749),
 (156.0, 271.9291, 74.31350628748845),
 (0.026, 0.0352, 35.395038567944404),
 (24.3, 4.8358, 80.09953103434026),
 (22.0, 1.3949, 93.65953197010755),
 (0.001, 0.0247, 2370.3452988702034),
 (3.4, 1.2736, 62.54065740533187),
 (0.023, 0.073, 217.3683369622488)]

In [34]:
loo = LeaveOneOut()

from tqdm import tqdm

result_df = pd.DataFrame(columns = list(seq.columns)+["y_pred_pmic", "y_pred_mic", "ape_pmic", "ape_mic"])
for train_index, test_index in tqdm(loo.split(X)):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    y_train_pmic, y_train_mic = y_train['pIC50'], y_train['MIC']
    y_test_pmic, y_test_mic = y_test['pIC50'], y_test['MIC']
    knn = KNeighborsRegressor(**grid_search.best_params_)
    _ = knn.fit(X_train, y_train_pmic)
    y_pred_pmic = knn.predict(X_test)
    y_pred_mic = np.exp(-y_pred_pmic)/1e-6
    ape = 100*np.abs(y_test_pmic-y_pred_pmic)/y_test_pmic
    ape_mic = 100*np.abs(y_test_mic-y_pred_mic)/y_test_mic
    df_val = seq.iloc[test_index,:].values[0].tolist()
    res = np.append(df_val, [y_pred_pmic[0], y_pred_mic[0], ape.values[0], ape_mic.values[0]])
    res = pd.DataFrame([res], columns = list(seq.columns)+["y_pred_pmic", "y_pred_mic", "ape_pmic", "ape_mic"])
    result_df = result_df.append(res)
result_df = result_df[["Sequence", "pIC50", "y_pred_pmic", "ape_pmic", "MIC", "y_pred_mic", "ape_mic"]]

50it [00:00, 104.65it/s]


In [35]:
result_df#.to_csv("../results/SVM_HIV_CoV_pMIC_to_MIC_rbf_c_100_gamma_2.csv", index=False)

Unnamed: 0,Sequence,pIC50,y_pred_pmic,ape_pmic,MIC,y_pred_mic,ape_mic
0,VSTALPQWRIYSYAGDNI,10.59663473,10.647920808093843,0.4839845800162214,25.0,23.750171461126207,4.9993141554951706
0,ALPQWRIYSYAGDNIVTA,10.59663473,10.644187075402703,0.4487495003303126,25.0,23.83901400667812,4.64394397328752
0,AGALMFAWLLLGLQGIFN,10.59663473,12.136597273637276,14.532562298080409,25.0,5.359728304435674,78.5610867822573
0,MASAGMQILGVVLTLLGW,10.59663473,11.12324818125001,4.9696291763188,25.0,14.765042250379292,40.93983099848283
0,MANSGLQLLGFSMALLGW,10.59663473,11.12339675013746,4.971031214713385,25.0,14.762848787423232,40.94860485030707
0,MASTGLELLGMTLAVLGW,10.59663473,10.928807632265377,3.134701824957375,25.0,17.934082015401124,28.263671938395504
0,GWIGAIVSTALPQWRIYS,10.74745762,10.84750786726224,0.930920137577976,21.5,19.45302705693108,9.520804386367066
0,AFLGWIGAIVSTALPQWR,11.28978191,11.007562798671357,2.499774695192898,12.5,16.57586562249056,32.606924979924486
0,FILAFLGWIGAIVSTALP,11.62945928,12.322648711840694,5.960633380718052,8.9,4.449811991228888,50.00211245810238
0,MANAGLQLLGFILAFL,11.78736231,11.832178531642407,0.3802056852395726,7.6,7.26691624603575,4.38268097321381


In [36]:
result_df['ape_pmic'].astype('float').mean()

10.374656830142525

In [37]:
result_df['ape_mic'].astype('float').mean()

729.7154998273481

In [38]:
pearsonr(result_df['MIC'].astype('float'), result_df['y_pred_mic'].astype('float'))

(0.8329374628276527, 6.29985434062599e-14)

In [39]:
r2_score(result_df['MIC'].astype('float'), result_df['y_pred_mic'].astype('float'))

0.6706137911620913