In [1]:
cd ../../

/Users/in-divye.singh/Documents/Projects/MIC_predictor


In [2]:
from notebooks.utils import *

In [3]:
import biovec
import numpy as np
import pandas as pd
from itertools import chain

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
from sklearn.svm import SVR
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, ShuffleSplit
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

### Physico-chemical properties

In [6]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [7]:
params = ['instability_index', 'isoelectric_point', 'helix', 'turn', 'sheet', 'with_reduced_cysteines',
          'with_disulfid_bridges', 'net_charge_at_pH7point4']

In [8]:
def get_all_prop(df):
    prop = []
    for seq in df.Sequence:
        X = ProteinAnalysis(seq)
#         molecular_weight = X.molecular_weight()
#         aromaticity = X.aromaticity()
        instability_index = X.instability_index()
        isoelectric_point = X.isoelectric_point()
        sec_struc = X.secondary_structure_fraction()
        helix = sec_struc[0]
        turn = sec_struc[1]
        sheet = sec_struc[2]
        epsilon_prot = X.molar_extinction_coefficient()
        with_reduced_cysteines = epsilon_prot[0]
        with_disulfid_bridges = epsilon_prot[1]
#         gravy = X.gravy() # hydrophobicity related
        # flexibility = X.flexibility()
        # X.protein_scale()
        net_charge_at_pH7point4 = X.charge_at_pH(7.4)

        prop.append([instability_index, isoelectric_point, helix, turn, sheet,
                     with_reduced_cysteines, with_disulfid_bridges, net_charge_at_pH7point4])
    return pd.concat([df, pd.DataFrame(prop, columns=params)], axis=1)

In [9]:
feat_105 = pd.read_csv("data/raw/105_feature_ha_avp_ic50.csv")

In [10]:
feat_105

Unnamed: 0,Sequence,MIC,hpi_<-1.5_frac,hpi_<-2.0_frac,hpi_<-2.5_frac,hpi_<-1.5,hpi_<-2.0,hpi_<-2.5,fraction_A,fraction_C,...,T_lcfraction,V_lcscore,V_lcfraction,W_lcscore,W_lcfraction,Y_lcscore,Y_lcfraction,lcs_fractions,lcs_scores,lcs_lowest_complexity
0,AAQRRGRVGRNPNQVGD,442.00000,0.117647,0.000000,0.0,2,0,0,0.117647,0.000000,...,,,,,,,,0.000000,0,8
1,HRILARIRQMMT,435.50000,0.000000,0.000000,0.0,0,0,0,0.083333,0.000000,...,,,,,,,,0.000000,0,8
2,RNPSQVGD,383.00000,0.000000,0.000000,0.0,0,0,0,0.000000,0.000000,...,,,,,,,,0.000000,0,8
3,RVGRNPNQVGD,374.00000,0.000000,0.000000,0.0,0,0,0,0.000000,0.000000,...,,,,,,,,1.000000,11,7
4,AAQRRGRIGRNPSQVGD,358.00000,0.000000,0.000000,0.0,0,0,0,0.117647,0.000000,...,,,,,,,,0.000000,0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,NGAICWGPCPTAFRQIGNCGHFKVRCCKIR,0.11515,0.000000,0.000000,0.0,0,0,0,0.066667,0.166667,...,,,,,,,,0.000000,0,11
708,CFPYITRPGTYHDWWYTRKNRQ,0.30000,0.500000,0.045455,0.0,11,1,0,0.000000,0.045455,...,,,,,,,,0.000000,0,12
709,YTSLIHSLIEESQNQQEKNEQELLELDKWASLWNWF,0.01131,0.305556,0.138889,0.0,11,5,0,0.027778,0.000000,...,,,,,,,,0.055556,2,7
710,GTNLSVPNPLGFFPDHQLDPAFGANSNNPDWDFNPNKDHWPEANKVG,0.00008,0.297872,0.127660,0.0,14,6,0,0.063830,0.000000,...,,,,,,,,0.000000,0,9


In [11]:
df = get_all_prop(feat_105)

In [12]:
aa_freq = reduce_by_kmer_frequency(df)

############# Dipep freq #############
dipep_freq = reduce_by_kmer_frequency(df, kmer=2)

############# Tripep freq #############
tripep_freq = reduce_by_kmer_frequency(df, kmer=3)

############# Prot2Vec #############
uniprot_embedding = biovec.models.load_protvec("data/embeddings/uniprot__kmer_3_contextWindow_10_vector_100_reduction_None")

avg_protvec = convert_sequences_to_avg_vectors(df['Sequence'], uniprot_embedding, kmer=5)
avg_protvec = avg_protvec.reset_index(drop=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
Creating vectors: 100%|██████████| 712/712 [00:00<00:00, 1088.13sequence/s]


In [13]:
df = pd.concat([df, aa_freq, dipep_freq, avg_protvec], axis=1)
df

Unnamed: 0,Sequence,MIC,hpi_<-1.5_frac,hpi_<-2.0_frac,hpi_<-2.5_frac,hpi_<-1.5,hpi_<-2.0,hpi_<-2.5,fraction_A,fraction_C,...,90,91,92,93,94,95,96,97,98,99
0,AAQRRGRVGRNPNQVGD,442.00000,0.117647,0.000000,0.0,2,0,0,0.117647,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,HRILARIRQMMT,435.50000,0.000000,0.000000,0.0,0,0,0,0.083333,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,RNPSQVGD,383.00000,0.000000,0.000000,0.0,0,0,0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,RVGRNPNQVGD,374.00000,0.000000,0.000000,0.0,0,0,0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,AAQRRGRIGRNPSQVGD,358.00000,0.000000,0.000000,0.0,0,0,0,0.117647,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,NGAICWGPCPTAFRQIGNCGHFKVRCCKIR,0.11515,0.000000,0.000000,0.0,0,0,0,0.066667,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
708,CFPYITRPGTYHDWWYTRKNRQ,0.30000,0.500000,0.045455,0.0,11,1,0,0.000000,0.045455,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
709,YTSLIHSLIEESQNQQEKNEQELLELDKWASLWNWF,0.01131,0.305556,0.138889,0.0,11,5,0,0.027778,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
710,GTNLSVPNPLGFFPDHQLDPAFGANSNNPDWDFNPNKDHWPEANKVG,0.00008,0.297872,0.127660,0.0,14,6,0,0.063830,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
X = df.drop(["Sequence", "MIC"], axis=1)

In [15]:
nan_col = []
for col in X.columns:
    if np.any(np.isnan(X[col].values)):
        nan_col.append(col)

In [16]:
X = X.drop(nan_col, axis=1)

In [17]:
y = df["MIC"]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [24]:
def cross_validation(model, X, y, metric):
    score = make_scorer(metric)
    kf = ShuffleSplit(n_splits=5)
    result = cross_val_score(model, X = X, y = y, scoring = score, cv = kf)
    return result

In [25]:
clf = KNeighborsRegressor(n_neighbors=1)
_ = clf.fit(X_train, y_train)
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)
perm_importance = permutation_importance(clf, X_train, y_train, scoring=mape_scorer,
                                         n_repeats=10,n_jobs=-1,random_state=42)
sorted_idx = perm_importance.importances_mean.argsort()[::-1]

In [26]:
len(X_train.columns)

590

In [27]:
list(zip(X_train.columns[sorted_idx], perm_importance.importances_mean[sorted_idx]))

[('with_reduced_cysteines', 787022.4434749486),
 ('with_disulfid_bridges', 584728.6106125878),
 ('molecular_weight', 390482.9945106767),
 ('instability_index', 30753.173151200295),
 ('lcs_scores', 23.320855162871506),
 ('net_charge_at_pH7point4', 16.182207825835274),
 ('hpi_<-1.5', 13.503559534203252),
 ('IEP', 1.5396688477760985),
 ('isoelectric_point', 1.5396688477760985),
 ('lcs_lowest_complexity', 0.9192208954700082),
 ('gravy', 0.3422655084306917),
 ('hpi_<-2.0', 0.27865371041431886),
 ('sheet', 0.05840205488711638),
 ('beta_sheet', 0.05840205488711638),
 ('L', 0.035465274254882564),
 ('fraction_L', 0.035465274254882564),
 ('length', 0.016857644991212655),
 ('I', 0.009733675814519397),
 ('fraction_I', 0.009733675814519397),
 ('LG', 0.0062642468113243656),
 ('HD', 0.0),
 ('HM', 0.0),
 ('GK', 0.0),
 ('GL', 0.0),
 ('GM', 0.0),
 ('GN', 0.0),
 ('GP', 0.0),
 ('GQ', 0.0),
 ('GR', 0.0),
 ('HQ', 0.0),
 ('GS', 0.0),
 ('GT', 0.0),
 ('HP', 0.0),
 ('HN', 0.0),
 ('HL', 0.0),
 ('HC', 0.0),
 ('GV

In [28]:
score = []
for i in range(1,100):#np.r_[1:25, np.arange(25,10,5)]:
    print(sorted_idx[:i])    
    cv_r2 = cross_validation(model = clf,
                        X = X_train.iloc[:,sorted_idx[:i]],
                        y = y_train,
                        metric = mean_absolute_percentage_error)
#     clf = SVR()
    _ = clf.fit(X_train.iloc[:,sorted_idx[:i]],y_train)
    y_pred = clf.predict(X_test.iloc[:,sorted_idx[:i]])
    test_r2 = mean_absolute_percentage_error(y_test,y_pred)
    score.append([sorted_idx[:i],np.mean(cv_r2), test_r2])

score_df = pd.DataFrame(score,columns = ['features','cv_mape','test_mape'])

[67]
[67 68]
[67 68 28]
[67 68 28 62]
[67 68 28 62 60]
[67 68 28 62 60 69]
[67 68 28 62 60 69  3]
[67 68 28 62 60 69  3 27]
[67 68 28 62 60 69  3 27 63]
[67 68 28 62 60 69  3 27 63 61]
[67 68 28 62 60 69  3 27 63 61 29]
[67 68 28 62 60 69  3 27 63 61 29  4]
[67 68 28 62 60 69  3 27 63 61 29  4 66]
[67 68 28 62 60 69  3 27 63 61 29  4 66 57]
[67 68 28 62 60 69  3 27 63 61 29  4 66 57 79]
[67 68 28 62 60 69  3 27 63 61 29  4 66 57 79 15]
[67 68 28 62 60 69  3 27 63 61 29  4 66 57 79 15 26]
[67 68 28 62 60 69  3 27 63 61 29  4 66 57 79 15 26 77]
[67 68 28 62 60 69  3 27 63 61 29  4 66 57 79 15 26 77 13]
[ 67  68  28  62  60  69   3  27  63  61  29   4  66  57  79  15  26  77
  13 275]
[ 67  68  28  62  60  69   3  27  63  61  29   4  66  57  79  15  26  77
  13 275 212]
[ 67  68  28  62  60  69   3  27  63  61  29   4  66  57  79  15  26  77
  13 275 212 220]
[ 67  68  28  62  60  69   3  27  63  61  29   4  66  57  79  15  26  77
  13 275 212 220 198]
[ 67  68  28  62  60  69   3  27  63

[ 67  68  28  62  60  69   3  27  63  61  29   4  66  57  79  15  26  77
  13 275 212 220 198 199 200 201 202 203 204 223 205 206 222 221 219 211
 207 208 218 217 216 215 214 196 213 209 210 197 191 195 166 173 172 171
 170 169 168 167 165 175 164 163 162 161 160 159]
[ 67  68  28  62  60  69   3  27  63  61  29   4  66  57  79  15  26  77
  13 275 212 220 198 199 200 201 202 203 204 223 205 206 222 221 219 211
 207 208 218 217 216 215 214 196 213 209 210 197 191 195 166 173 172 171
 170 169 168 167 165 175 164 163 162 161 160 159 158]
[ 67  68  28  62  60  69   3  27  63  61  29   4  66  57  79  15  26  77
  13 275 212 220 198 199 200 201 202 203 204 223 205 206 222 221 219 211
 207 208 218 217 216 215 214 196 213 209 210 197 191 195 166 173 172 171
 170 169 168 167 165 175 164 163 162 161 160 159 158 174]
[ 67  68  28  62  60  69   3  27  63  61  29   4  66  57  79  15  26  77
  13 275 212 220 198 199 200 201 202 203 204 223 205 206 222 221 219 211
 207 208 218 217 216 215 214 196 21

[ 67  68  28  62  60  69   3  27  63  61  29   4  66  57  79  15  26  77
  13 275 212 220 198 199 200 201 202 203 204 223 205 206 222 221 219 211
 207 208 218 217 216 215 214 196 213 209 210 197 191 195 166 173 172 171
 170 169 168 167 165 175 164 163 162 161 160 159 158 174 176 194 186 193
 192 225 190 189 188 187 185 177 184 183 182 181 180 179 178 224 589 226
 268 274]
[ 67  68  28  62  60  69   3  27  63  61  29   4  66  57  79  15  26  77
  13 275 212 220 198 199 200 201 202 203 204 223 205 206 222 221 219 211
 207 208 218 217 216 215 214 196 213 209 210 197 191 195 166 173 172 171
 170 169 168 167 165 175 164 163 162 161 160 159 158 174 176 194 186 193
 192 225 190 189 188 187 185 177 184 183 182 181 180 179 178 224 589 226
 268 274 273]
[ 67  68  28  62  60  69   3  27  63  61  29   4  66  57  79  15  26  77
  13 275 212 220 198 199 200 201 202 203 204 223 205 206 222 221 219 211
 207 208 218 217 216 215 214 196 213 209 210 197 191 195 166 173 172 171
 170 169 168 167 165 175 16

In [29]:
score_df.sort_values(by='test_mape')

Unnamed: 0,features,cv_mape,test_mape
2,"[67, 68, 28]",5.587010e+04,1.239381e+05
0,[67],1.598365e+06,5.676959e+06
84,"[67, 68, 28, 62, 60, 69, 3, 27, 63, 61, 29, 4,...",2.182548e+04,6.324518e+06
85,"[67, 68, 28, 62, 60, 69, 3, 27, 63, 61, 29, 4,...",1.014132e+05,6.324518e+06
86,"[67, 68, 28, 62, 60, 69, 3, 27, 63, 61, 29, 4,...",2.366492e+04,6.324518e+06
...,...,...,...
6,"[67, 68, 28, 62, 60, 69, 3]",5.544543e+04,6.324533e+06
3,"[67, 68, 28, 62]",3.757782e+04,6.324538e+06
5,"[67, 68, 28, 62, 60, 69]",6.563862e+03,6.324540e+06
4,"[67, 68, 28, 62, 60]",5.583197e+04,6.324551e+06


In [None]:
df.columns[-100:]

In [30]:
bbb = list(df.columns[132:532])+list(df.columns[-100:])

In [37]:
len(bbb)

500

In [40]:
X_train[bbb]

Unnamed: 0,AA,AC,AD,AE,AF,AG,AH,AI,AK,AL,...,90,91,92,93,94,95,96,97,98,99
375,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
344,0.0,0.0,0.0,0.000000,0.029412,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
526,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
234,0.0,0.0,0.0,0.052632,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.029412,0.058824,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
106,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.083333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
270,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
435,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
clf = KNeighborsRegressor(n_neighbors=1, weights='uniform', metric='euclidean')
_ = clf.fit(X_train[bbb],y_train)
y_pred = clf.predict(X_test[bbb])
print(mean_absolute_percentage_error(y_test,y_pred), r2_score(y_test,y_pred))

304284.14724882756 -0.051568616409732426


In [None]:
for i in range(1,60):
    clf = KNeighborsRegressor(n_neighbors=i)
    _ = clf.fit(X_train.iloc[:,[68, 67, 28]],y_train)
    y_pred = clf.predict(X_test.iloc[:,[68, 67, 28]])
    print(i, mean_absolute_percentage_error(y_test,y_pred), r2_score(y_test,y_pred))