In [1]:
cd ../../

/Users/in-divye.singh/Documents/Projects/MIC_predictor


In [2]:
from notebooks.utils import *

In [3]:
import biovec
import numpy as np
import pandas as pd
from itertools import chain

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

In [30]:
from sklearn.svm import SVR
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, ShuffleSplit
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

### Physico-chemical properties

In [6]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [7]:
# Names of the derived columns, in the exact order in which `get_all_prop`
# appends the values for each sequence.
params = [
    'instability_index',
    'isoelectric_point',
    # secondary-structure fractions
    'helix',
    'turn',
    'sheet',
    # molar extinction coefficients
    'with_reduced_cysteines',
    'with_disulfid_bridges',
    'net_charge_at_pH7point4',
]

In [8]:
def get_all_prop(df):
    """Append ``ProteinAnalysis`` descriptors for every sequence in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Seq`` column holding the sequences to analyse.

    Returns
    -------
    pandas.DataFrame
        A new frame made of the columns of ``df`` followed by one column per
        name in the module-level ``params`` list (same order). ``df`` itself
        is not modified.
    """
    prop = []
    for seq in df.Seq:
        analysis = ProteinAnalysis(seq)
        instability_index = analysis.instability_index()
        isoelectric_point = analysis.isoelectric_point()
        helix, turn, sheet = analysis.secondary_structure_fraction()
        with_reduced_cysteines, with_disulfid_bridges = analysis.molar_extinction_coefficient()
        net_charge_at_pH7point4 = analysis.charge_at_pH(7.4)

        prop.append([instability_index, isoelectric_point, helix, turn, sheet,
                     with_reduced_cysteines, with_disulfid_bridges, net_charge_at_pH7point4])

    # Reuse df's own index: concat(axis=1) aligns on index labels, so a frame
    # built on a fresh 0..n-1 index would be misaligned (and NaN-padded)
    # whenever df has been filtered, shuffled or otherwise re-indexed.
    prop_df = pd.DataFrame(prop, columns=params, index=df.index)
    return pd.concat([df, prop_df], axis=1)

In [9]:
# Load the precomputed feature table; the path is relative to the current
# working directory.
feat_105 = pd.read_csv("data/raw/105_feature_ha_avp_ic50.csv")

In [10]:
# Preview the loaded feature table.
feat_105

Unnamed: 0,Seq,MIC,hpi_<-1.5_frac,hpi_<-2.0_frac,hpi_<-2.5_frac,hpi_<-1.5,hpi_<-2.0,hpi_<-2.5,fraction_A,fraction_C,...,T_lcfraction,V_lcscore,V_lcfraction,W_lcscore,W_lcfraction,Y_lcscore,Y_lcfraction,lcs_fractions,lcs_scores,lcs_lowest_complexity
0,AAQRRGRVGRNPNQVGD,442.00000,0.117647,0.000000,0.0,2,0,0,0.117647,0.000000,...,,,,,,,,0.000000,0,8
1,HRILARIRQMMT,435.50000,0.000000,0.000000,0.0,0,0,0,0.083333,0.000000,...,,,,,,,,0.000000,0,8
2,RNPSQVGD,383.00000,0.000000,0.000000,0.0,0,0,0,0.000000,0.000000,...,,,,,,,,0.000000,0,8
3,RVGRNPNQVGD,374.00000,0.000000,0.000000,0.0,0,0,0,0.000000,0.000000,...,,,,,,,,1.000000,11,7
4,AAQRRGRIGRNPSQVGD,358.00000,0.000000,0.000000,0.0,0,0,0,0.117647,0.000000,...,,,,,,,,0.000000,0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,NGAICWGPCPTAFRQIGNCGHFKVRCCKIR,0.11515,0.000000,0.000000,0.0,0,0,0,0.066667,0.166667,...,,,,,,,,0.000000,0,11
708,CFPYITRPGTYHDWWYTRKNRQ,0.30000,0.500000,0.045455,0.0,11,1,0,0.000000,0.045455,...,,,,,,,,0.000000,0,12
709,YTSLIHSLIEESQNQQEKNEQELLELDKWASLWNWF,0.01131,0.305556,0.138889,0.0,11,5,0,0.027778,0.000000,...,,,,,,,,0.055556,2,7
710,GTNLSVPNPLGFFPDHQLDPAFGANSNNPDWDFNPNKDHWPEANKVG,0.00008,0.297872,0.127660,0.0,14,6,0,0.063830,0.000000,...,,,,,,,,0.000000,0,9


In [11]:
# Extend the feature table with the sequence-derived columns listed in `params`.
df = get_all_prop(feat_105)

In [12]:
# Preview the extended table; the columns from `params` are appended at the end.
df

Unnamed: 0,Seq,MIC,hpi_<-1.5_frac,hpi_<-2.0_frac,hpi_<-2.5_frac,hpi_<-1.5,hpi_<-2.0,hpi_<-2.5,fraction_A,fraction_C,...,lcs_scores,lcs_lowest_complexity,instability_index,isoelectric_point,helix,turn,sheet,with_reduced_cysteines,with_disulfid_bridges,net_charge_at_pH7point4
0,AAQRRGRVGRNPNQVGD,442.00000,0.117647,0.000000,0.0,2,0,0,0.117647,0.000000,...,0,8,24.817647,11.999968,0.117647,0.352941,0.117647,0,0,2.609418
1,HRILARIRQMMT,435.50000,0.000000,0.000000,0.0,0,0,0,0.083333,0.000000,...,0,8,36.466667,11.999968,0.250000,0.000000,0.333333,0,0,2.594004
2,RNPSQVGD,383.00000,0.000000,0.000000,0.0,0,0,0,0.000000,0.000000,...,0,8,48.687500,6.087706,0.125000,0.500000,0.000000,0,0,-0.440856
3,RVGRNPNQVGD,374.00000,0.000000,0.000000,0.0,0,0,0,0.000000,0.000000,...,11,7,-11.454545,9.598972,0.181818,0.454545,0.000000,0,0,0.559118
4,AAQRRGRIGRNPSQVGD,358.00000,0.000000,0.000000,0.0,0,0,0,0.117647,0.000000,...,0,10,56.905882,11.999968,0.117647,0.352941,0.117647,0,0,2.609418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,NGAICWGPCPTAFRQIGNCGHFKVRCCKIR,0.11515,0.000000,0.000000,0.0,0,0,0,0.066667,0.166667,...,0,11,-1.326667,9.408790,0.233333,0.266667,0.066667,5500,5750,4.466476
708,CFPYITRPGTYHDWWYTRKNRQ,0.30000,0.500000,0.045455,0.0,11,1,0,0.000000,0.045455,...,0,12,33.700000,9.625146,0.318182,0.181818,0.000000,15470,15470,2.559925
709,YTSLIHSLIEESQNQQEKNEQELLELDKWASLWNWF,0.01131,0.305556,0.138889,0.0,11,5,0,0.027778,0.000000,...,2,7,62.650000,4.300688,0.361111,0.194444,0.361111,17990,17990,-5.406266
710,GTNLSVPNPLGFFPDHQLDPAFGANSNNPDWDFNPNKDHWPEANKVG,0.00008,0.297872,0.127660,0.0,14,6,0,0.063830,0.000000,...,0,9,18.168085,4.443127,0.234043,0.446809,0.148936,11000,11000,-4.370952


In [13]:
# Feature matrix: every column except the raw sequence and the MIC target.
X = df.drop(columns=["Seq", "MIC"])

In [14]:
# Names of the feature columns that contain at least one NaN value.
nan_col = [
    name for name in X.columns
    if np.any(np.isnan(X[name].values))
]

In [15]:
# Discard every column flagged in `nan_col`.
X = X.drop(columns=nan_col)

In [16]:
# Regression target: the MIC column of the extended table.
y = df["MIC"]

In [17]:
# Hold out 20% of the rows as a test set; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [49]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, broadcast-compatible with ``y_true``.

    Returns
    -------
    float
        ``100 * mean(|y_true - y_pred| / max(|y_true|, eps))`` where ``eps``
        is the float64 machine epsilon.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Guard the denominator: a zero target would otherwise produce nan (0/0)
    # or inf (x/0) and poison the mean. Clamping to machine epsilon keeps the
    # result finite and leaves every target with |y| >= eps untouched.
    denominator = np.maximum(np.abs(y_true), np.finfo(np.float64).eps)
    return np.mean(np.abs(y_true - y_pred) / denominator) * 100

In [31]:
def cross_validation(model, X, y, metric, n_splits=5, random_state=42):
    """Score ``model`` with shuffle-split cross-validation.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_val_score``.
    X, y : array-like
        Features and target to split.
    metric : callable
        ``metric(y_true, y_pred)``; it is wrapped with ``make_scorer`` using
        the default settings, so the returned values are the raw metric
        values (no sign flip).
    n_splits : int, default 5
        Number of shuffle-split iterations.
    random_state : int or None, default 42
        Seed for ``ShuffleSplit``. A fixed seed makes the scores reproducible
        and evaluates every call on the same splits, so different feature
        subsets can be compared fairly. Pass ``None`` for unseeded splits.

    Returns
    -------
    numpy.ndarray
        One score per split.
    """
    score = make_scorer(metric)
    kf = ShuffleSplit(n_splits=n_splits, random_state=random_state)
    result = cross_val_score(model, X=X, y=y, scoring=score, cv=kf)
    return result

In [81]:
# 1-nearest-neighbour regressor fitted on the full training feature set.
# NOTE(review): no feature scaling is applied before this distance-based
# model, and the importances below are measured on the same data the model
# was fitted on -- confirm both are intended.
clf = KNeighborsRegressor(n_neighbors=1)
clf.fit(X_train, y_train)

# greater_is_better=False because MAPE is an error: lower values are better.
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)
perm_importance = permutation_importance(
    clf,
    X_train,
    y_train,
    scoring=mape_scorer,
    n_repeats=100,
    n_jobs=-1,
    random_state=42,
)

# Column positions ordered from the largest to the smallest mean importance.
sorted_idx = np.argsort(perm_importance.importances_mean)[::-1]

In [82]:
# Number of feature columns in the training matrix.
len(X_train.columns)

70

In [83]:
# (feature name, mean permutation importance) pairs, most important first.
list(zip(X_train.columns[sorted_idx], perm_importance.importances_mean[sorted_idx]))

[('with_disulfid_bridges', 804181.2916479373),
 ('with_reduced_cysteines', 761614.1433508743),
 ('molecular_weight', 518195.93842438527),
 ('instability_index', 40183.1281081033),
 ('lcs_scores', 30.827568801772248),
 ('hpi_<-1.5', 18.199649208445305),
 ('net_charge_at_pH7point4', 16.043741621413357),
 ('length', 5.772816356211258),
 ('lcs_lowest_complexity', 1.3867174329561707),
 ('isoelectric_point', 1.3609234640374397),
 ('IEP', 1.3609234640374397),
 ('hpi_<-2.0', 0.9540279693696208),
 ('beta_sheet', 0.24895422212544838),
 ('sheet', 0.24895422212544838),
 ('fraction_L', 0.18880028880240707),
 ('Asx', 0.13412265285357486),
 ('Alipatic', 0.13412265285357486),
 ('Aromatic', 0.13412265285357486),
 ('Pos_charge', 0.13412265285357486),
 ('Xle', 0.13412265285357486),
 ('Small', 0.13412265285357486),
 ('fraction_K', 0.13412265285357486),
 ('fraction_W', 0.13412265285357486),
 ('fraction_S', 0.13412265285357486),
 ('Hydrophobic', 0.13412265285357486),
 ('fraction_G', 0.13412265285357486),
 (

In [84]:
# Feature counts to try: every size from 1 to 24, then 25, 30, ..., 65.
feature_counts = np.r_[1:25, np.arange(25,66,5)]

score = []
for i in feature_counts:
    # Column positions of the i highest-ranked features.
    top_idx = sorted_idx[:i]
    print(top_idx)
    X_train_sub = X_train.iloc[:, top_idx]
    X_test_sub = X_test.iloc[:, top_idx]

    # Despite the `_r2` suffix, `cv_r2` and `test_r2` hold MAPE values.
    cv_r2 = cross_validation(
        model=clf,
        X=X_train_sub,
        y=y_train,
        metric=mean_absolute_percentage_error,
    )

    # Refit on the whole training split and evaluate on the held-out split.
    clf.fit(X_train_sub, y_train)
    y_pred = clf.predict(X_test_sub)
    test_r2 = mean_absolute_percentage_error(y_test, y_pred)

    score.append([top_idx, np.mean(cv_r2), test_r2])

score_df = pd.DataFrame(score, columns=['features', 'cv_mape', 'test_mape'])

[68]
[68 67]
[68 67 28]
[68 67 28 62]
[68 67 28 62 60]
[68 67 28 62 60  3]
[68 67 28 62 60  3 69]
[68 67 28 62 60  3 69 26]
[68 67 28 62 60  3 69 26 61]
[68 67 28 62 60  3 69 26 61 63]
[68 67 28 62 60  3 69 26 61 63 27]
[68 67 28 62 60  3 69 26 61 63 27  4]
[68 67 28 62 60  3 69 26 61 63 27  4 57]
[68 67 28 62 60  3 69 26 61 63 27  4 57 66]
[68 67 28 62 60  3 69 26 61 63 27  4 57 66 15]
[68 67 28 62 60  3 69 26 61 63 27  4 57 66 15 30]
[68 67 28 62 60  3 69 26 61 63 27  4 57 66 15 30 36]
[68 67 28 62 60  3 69 26 61 63 27  4 57 66 15 30 36 35]
[68 67 28 62 60  3 69 26 61 63 27  4 57 66 15 30 36 35 33]
[68 67 28 62 60  3 69 26 61 63 27  4 57 66 15 30 36 35 33 32]
[68 67 28 62 60  3 69 26 61 63 27  4 57 66 15 30 36 35 33 32 37]
[68 67 28 62 60  3 69 26 61 63 27  4 57 66 15 30 36 35 33 32 37 14]
[68 67 28 62 60  3 69 26 61 63 27  4 57 66 15 30 36 35 33 32 37 14 24]
[68 67 28 62 60  3 69 26 61 63 27  4 57 66 15 30 36 35 33 32 37 14 24 21]
[68 67 28 62 60  3 69 26 61 63 27  4 57 66 15 30 36 

In [85]:
# Rank the feature subsets by test-set MAPE, lowest first.
# NOTE(review): picking a subset from this ranking uses the held-out split for
# model selection -- confirm that is intended.
score_df.sort_values(by='test_mape')

Unnamed: 0,features,cv_mape,test_mape
2,"[68, 67, 28]",93452.949546,123939.3
16,"[68, 67, 28, 62, 60, 3, 69, 26, 61, 63, 27, 4,...",87409.608284,6324519.0
30,"[68, 67, 28, 62, 60, 3, 69, 26, 61, 63, 27, 4,...",73831.762251,6324519.0
29,"[68, 67, 28, 62, 60, 3, 69, 26, 61, 63, 27, 4,...",79315.29224,6324519.0
28,"[68, 67, 28, 62, 60, 3, 69, 26, 61, 63, 27, 4,...",88181.066686,6324519.0
27,"[68, 67, 28, 62, 60, 3, 69, 26, 61, 63, 27, 4,...",14770.276341,6324519.0
26,"[68, 67, 28, 62, 60, 3, 69, 26, 61, 63, 27, 4,...",41628.811181,6324519.0
25,"[68, 67, 28, 62, 60, 3, 69, 26, 61, 63, 27, 4,...",115205.649659,6324519.0
24,"[68, 67, 28, 62, 60, 3, 69, 26, 61, 63, 27, 4,...",11051.976711,6324519.0
23,"[68, 67, 28, 62, 60, 3, 69, 26, 61, 63, 27, 4,...",67906.477356,6324519.0


In [98]:
# Baseline: 1-nearest-neighbour regressor trained on every feature column.
clf = KNeighborsRegressor(n_neighbors=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Test-set MAPE followed by test-set R^2.
baseline_mape = mean_absolute_percentage_error(y_test, y_pred)
baseline_r2 = r2_score(y_test, y_pred)
print(baseline_mape, baseline_r2)

6324518.901636967 -0.6059264036843475


In [96]:
# Column positions of the three highest-ranked features (first three entries
# of `sorted_idx` in the run recorded above).
top3_idx = [68, 67, 28]
X_train_top3 = X_train.iloc[:, top3_idx]
X_test_top3 = X_test.iloc[:, top3_idx]

# Sweep the neighbour count and report test-set MAPE and R^2 for each value.
for i in range(1, 60):
    clf = KNeighborsRegressor(n_neighbors=i)
    clf.fit(X_train_top3, y_train)
    y_pred = clf.predict(X_test_top3)
    mape_i = mean_absolute_percentage_error(y_test, y_pred)
    r2_i = r2_score(y_test, y_pred)
    print(i, mape_i, r2_i)

1 123939.31575859865 -1.0142691565009585
2 747070.7933552462 -0.47361505187398567
3 2578636.879088805 -0.003304547073408326
4 2838293.880452276 -0.015488581159511572
5 15805430.274458645 0.00717877421814106
6 13280425.584111592 0.01921663374920035
7 23320368.894216105 0.019377471094904175
8 20585093.218438372 0.014519669387846657
9 18298826.198833697 0.024030504156536292
10 16489966.080374073 0.044449815102015844
11 14992497.480263053 0.07475305215034189
12 13748878.366836967 0.0837842682387312
13 12735677.945065452 0.06370691212038904
14 11872221.826487578 0.06324478220577756
15 11813099.45005649 0.09100013870913737
16 11075289.527572414 0.0914527948265863
17 10557798.30761163 0.09720955955711297
18 10051323.975643689 0.11786771297443144
19 9523834.14596771 0.10481425770018937
20 9048595.211911026 0.09698625942390049
21 8628389.3754415 0.08785399184299836
22 8285366.175829518 0.10161622589095887
23 8507894.73917322 0.1192085952261297
24 8177710.233105336 0.12289991542310785
25 8397843