In [1]:
cd ../../

/Users/in-divye.singh/Documents/Projects/MIC_predictor


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [27]:
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.metrics import make_scorer, r2_score, mean_squared_error

In [4]:
from notebooks.pcm.utils import ZScalesExtractor

In [5]:
mic = pd.read_csv("data/raw/MIC_pIC50_values.csv")

In [6]:
seq = pd.read_csv("data/raw/peptide_target_seq.csv")

In [7]:
seq = seq.merge(mic, on='Sequence')

In [8]:
zscale = ZScalesExtractor("data/raw/z_scales_5.csv")

In [9]:
zscales_features = zscale.transform(seq)

In [10]:
seq.shape

(50, 7)

In [11]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score

In [12]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better = False)

In [14]:
best_params_ = {'bootstrap': False,
 'max_depth': 2,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 200}

### Leave one out CV

In [15]:
loo = LeaveOneOut()

In [16]:
from tqdm import tqdm

In [17]:
result_df = pd.DataFrame(columns = list(seq.columns)+["y_pred_pmic", "y_pred_mic", "ape_pmic", "ape_mic"])
for train_index, test_index in tqdm(loo.split(zscales_features)):
    X_train, X_test = zscales_features.iloc[train_index,:], zscales_features.iloc[test_index,:]
    y_train, y_test = seq[['pIC50', 'MIC']].iloc[train_index], seq[['pIC50', 'MIC']].iloc[test_index]
    y_train_pmic, y_train_mic = y_train['pIC50'], y_train['MIC']
    y_test_pmic, y_test_mic = y_test['pIC50'], y_test['MIC']
    rf = RandomForestRegressor(**best_params_)
    _ = rf.fit(X_train, y_train_pmic)
    y_pred_pmic = rf.predict(X_test)
    y_pred_mic = np.exp(-y_pred_pmic)/1e-6
    ape = 100*np.abs(y_test_pmic-y_pred_pmic)/y_test_pmic
    ape_mic = 100*np.abs(y_test_mic-y_pred_mic)/y_test_mic
    df_val = seq.iloc[test_index,:].values[0].tolist()
    res = np.append(df_val, [y_pred_pmic[0], y_pred_mic[0], ape.values[0], ape_mic.values[0]])
    res = pd.DataFrame([res], columns = list(seq.columns)+["y_pred_pmic", "y_pred_mic", "ape_pmic", "ape_mic"])
    result_df = result_df.append(res)
result_df = result_df[["Sequence", "pIC50", "y_pred_pmic", "ape_pmic", "MIC", "y_pred_mic", "ape_mic"]]

50it [00:09,  5.43it/s]


In [18]:
result_df#.to_csv("../results/SVM_HIV_CoV_pMIC_to_MIC_rbf_c_100_gamma_2.csv", index=False)

Unnamed: 0,Sequence,pIC50,y_pred_pmic,ape_pmic,MIC,y_pred_mic,ape_mic
0,VSTALPQWRIYSYAGDNI,10.59663473,11.28363244158068,6.483168752016429,25.0,12.577105236205997,49.69157905517603
0,ALPQWRIYSYAGDNIVTA,10.59663473,11.28363244158068,6.483168752016429,25.0,12.577105236205997,49.69157905517603
0,AGALMFAWLLLGLQGIFN,10.59663473,11.28363244158068,6.483168752016429,25.0,12.577105236205997,49.69157905517603
0,MASAGMQILGVVLTLLGW,10.59663473,11.28363244158068,6.483168752016429,25.0,12.577105236205997,49.69157905517603
0,MANSGLQLLGFSMALLGW,10.59663473,11.28363244158068,6.483168752016429,25.0,12.577105236205997,49.69157905517603
0,MASTGLELLGMTLAVLGW,10.59663473,11.28363244158068,6.483168752016429,25.0,12.577105236205997,49.69157905517603
0,GWIGAIVSTALPQWRIYS,10.74745762,11.278767187064496,4.943583737197329,21.5,12.638445150407753,41.21653418414999
0,AFLGWIGAIVSTALPQWR,11.28978191,11.261272855128986,0.2525208644266368,12.5,12.861491641073572,2.891933128588576
0,FILAFLGWIGAIVSTALP,11.62945928,11.250315520612924,3.260201100141577,8.9,13.00319423093066,46.10330596551302
0,MANAGLQLLGFILAFL,11.78736231,11.245221874483844,4.599336316796035,7.6,13.0695968731021,71.96837990923817


In [19]:
result_df['ape_pmic'].astype('float').mean()

9.871018318298685

In [29]:
pearsonr(result_df['pIC50'].astype('float'), result_df['y_pred_pmic'].astype('float'))

(0.8704098859040363, 2.2120962591160563e-16)

In [30]:
r2_score(result_df['pIC50'].astype('float'), result_df['y_pred_pmic'].astype('float'))

0.7479937877410594

In [32]:
mean_squared_error(result_df['pIC50'].astype('float'), result_df['y_pred_pmic'].astype('float'))

3.352014476759489

In [20]:
result_df['ape_mic'].astype('float').mean()

877.85220765868

In [23]:
from scipy.stats import pearsonr

In [25]:
pearsonr(result_df['MIC'].astype('float'), result_df['y_pred_mic'].astype('float'))

(0.8441833362543676, 1.3527433912667762e-14)

In [26]:
r2_score(result_df['MIC'].astype('float'), result_df['y_pred_mic'].astype('float'))

0.701525628950505

In [28]:
np.sqrt(mean_squared_error(result_df['MIC'].astype('float'), result_df['y_pred_mic'].astype('float')))

64.08321218259441