In [4]:
cd ../../

/Users/in-divye.singh/Documents/Projects/MIC_predictor


In [5]:
from tqdm import tqdm
import biovec
import numpy as np
import pandas as pd
from itertools import chain

In [6]:
from notebooks.utils import *

In [7]:
from sklearn.svm import SVR
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

In [8]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [9]:
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

In [11]:
avp_ic50 = pd.read_csv("data/raw/AVP-IC50Pred_train.csv")
ha_avp = pd.read_csv("data/raw/HA_AVP.csv")

df = pd.concat([avp_ic50[['Sequence', 'MIC']], ha_avp], axis=0).drop_duplicates(['Sequence']).reset_index(drop=True)
df = sequence_filtering(df)

In [16]:
df_lessthan1 = df[df['MIC'] <= 1]

### Amino acid frequency

In [17]:
############# Amino acid frequency #############
aa_freq = reduce_by_kmer_frequency(df_lessthan1).sort_index(axis=1)

In [18]:
aa_freq

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
0,0.090909,0.000000,0.045455,0.136364,0.045455,0.045455,0.000000,0.045455,0.136364,0.136364,0.0,0.000000,0.000000,0.000000,0.090909,0.090909,0.000000,0.045455,0.045455,0.045455
1,0.057143,0.000000,0.085714,0.114286,0.000000,0.000000,0.000000,0.114286,0.085714,0.114286,0.0,0.114286,0.028571,0.028571,0.057143,0.142857,0.000000,0.028571,0.028571,0.000000
2,0.057143,0.000000,0.114286,0.114286,0.000000,0.000000,0.000000,0.114286,0.114286,0.114286,0.0,0.057143,0.028571,0.028571,0.057143,0.142857,0.000000,0.028571,0.028571,0.000000
3,0.028571,0.000000,0.057143,0.114286,0.000000,0.028571,0.028571,0.085714,0.114286,0.085714,0.0,0.085714,0.000000,0.057143,0.057143,0.200000,0.000000,0.000000,0.057143,0.000000
4,0.000000,0.222222,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.111111,0.222222,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.333333,0.000000,0.000000,0.111111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,0.066667,0.166667,0.000000,0.000000,0.066667,0.133333,0.033333,0.100000,0.066667,0.000000,0.0,0.066667,0.066667,0.033333,0.100000,0.000000,0.033333,0.033333,0.033333,0.000000
211,0.000000,0.045455,0.045455,0.000000,0.045455,0.045455,0.045455,0.045455,0.045455,0.000000,0.0,0.045455,0.090909,0.045455,0.136364,0.000000,0.136364,0.000000,0.090909,0.136364
212,0.027778,0.000000,0.027778,0.166667,0.027778,0.000000,0.027778,0.055556,0.055556,0.166667,0.0,0.083333,0.000000,0.111111,0.000000,0.111111,0.027778,0.000000,0.083333,0.027778
213,0.063830,0.000000,0.106383,0.021277,0.085106,0.085106,0.042553,0.000000,0.042553,0.063830,0.0,0.170213,0.148936,0.021277,0.000000,0.042553,0.021277,0.042553,0.042553,0.000000


In [20]:
X_train, X_test, y_train, y_test = train_test_split(aa_freq, df_lessthan1['MIC'], test_size=0.2, random_state=42)

In [21]:
X_train.shape, X_test.shape

((172, 20), (43, 20))

In [22]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [None, 80, 90, 100, 110],
    'max_features': ['auto', 2, 3],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring=mape_scorer)

In [23]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 1200 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:   30.7s
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:   55.1s
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1961 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 3257 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed:  6.1min finished


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True],
                         'max_depth': [None, 80, 90, 100, 110],
                         'max_features': ['auto', 2, 3],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'min_samples_split': [2, 8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False),
             verbose=2)

In [24]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 80,
 'max_features': 2,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

In [25]:
best_grid = grid_search.best_estimator_

In [26]:
y_pred = best_grid.predict(X_test)

In [27]:
mean_squared_error(y_pred=y_pred, y_true=y_test)

0.04720975228589264

In [28]:
mean_absolute_error(y_pred=y_pred, y_true=y_test)

0.1636252345946844

In [29]:
mean_absolute_percentage_error(y_pred=y_pred, y_true=y_test)

1366.1603511148835

In [30]:
list(zip(y_test, y_pred))

[(0.001, 0.11297666666666674),
 (0.011309999999999999, 0.08197999999999997),
 (0.022000000000000002, 0.15187173699999992),
 (0.005, 0.033979999999999906),
 (0.89, 0.4504267166666666),
 (0.045, 0.31836993700000016),
 (0.008, 0.02429999999999994),
 (0.2, 0.18417299999999995),
 (0.5, 0.5615845238095234),
 (0.022000000000000002, 0.03316999999999999),
 (0.012, 0.03277999999999994),
 (0.7, 0.6036600739999998),
 (0.003, 0.033979999999999906),
 (0.98, 0.4124941406666664),
 (0.25, 0.38469044066666674),
 (0.005, 0.3110066666666669),
 (0.006999999999999999, 0.048101699999999914),
 (0.84, 0.48680773333333327),
 (0.013000000000000001, 0.23286507400000012),
 (0.252, 0.2632187000000002),
 (0.003, 0.11430999999999995),
 (0.02, 0.29019766666666685),
 (0.17, 0.3331905036666668),
 (0.79, 0.41093670366666635),
 (0.057, 0.16019333333333344),
 (0.019, 0.14664),
 (0.002, 0.17122999999999994),
 (0.88, 0.24220870000000008),
 (0.46, 0.434971587),
 (0.015, 0.07201000000000002),
 (0.049, 0.37464000000000025),
 (0

### Prot2Vec

In [61]:
############# Prot2Vec #############
uniprot_embedding = biovec.models.load_protvec("data/embeddings/uniprot__kmer_3_contextWindow_10_vector_100_reduction_None")

avg_vectors = convert_sequences_to_avg_vectors(df_lessthan1['Sequence'], uniprot_embedding)
avg_vectors = avg_vectors.reset_index(drop=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
Creating vectors: 100%|██████████| 215/215 [00:00<00:00, 1000.89sequence/s]


In [62]:
avg_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.000326,0.082828,-0.016972,0.033496,-0.029845,-0.009866,-0.002361,0.067565,0.043482,-0.015739,...,0.021694,-0.085800,0.006715,-0.002862,0.015911,0.012537,0.021614,0.026963,0.057681,-0.040158
1,-0.008685,0.061298,0.045503,0.042366,-0.056826,0.029961,0.062079,0.041146,0.023968,-0.025709,...,0.001860,-0.079363,0.019131,-0.056620,0.033148,0.016609,0.009392,-0.012630,0.077096,-0.104904
2,-0.015213,0.069901,0.032177,0.035516,-0.057223,0.020554,0.045740,0.042754,0.030702,-0.020374,...,0.001696,-0.100250,0.035932,-0.038657,0.035121,0.013145,0.018021,-0.004209,0.073029,-0.101384
3,-0.015084,0.040755,0.016946,0.028864,-0.062246,0.028597,0.051237,0.072327,0.030972,-0.013748,...,0.026069,-0.061902,0.022880,-0.031131,0.037069,0.027536,0.006920,-0.015567,0.095171,-0.086851
4,0.003517,-0.026367,0.021674,0.054289,0.002337,-0.000452,0.038705,-0.030244,0.050282,-0.014623,...,0.009277,-0.035637,0.031246,0.030047,-0.005038,-0.018434,0.016361,-0.052519,0.119378,-0.104380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,-0.038337,-0.017915,-0.015082,0.042641,0.046676,0.037965,-0.039406,0.050116,0.023495,0.058380,...,0.006357,0.000806,0.007335,-0.015952,0.054503,0.056923,-0.067147,0.017195,0.037202,-0.061830
211,-0.018060,0.002701,-0.002945,0.016760,0.020204,0.018499,0.028342,0.035244,-0.012086,0.028506,...,0.020611,0.010793,0.019357,0.026522,0.038127,0.043636,0.014737,0.010608,0.054837,-0.038420
212,-0.004227,0.033425,-0.024497,0.029364,-0.066315,0.013616,0.038417,0.074161,0.056543,-0.023648,...,0.051252,-0.057756,0.006149,0.001708,0.016342,0.048390,-0.021271,-0.017798,0.070233,-0.046075
213,0.003485,-0.004911,0.027475,0.020559,0.002146,0.072764,0.055225,0.014650,0.019468,-0.001088,...,0.023278,-0.013674,-0.007323,-0.014028,-0.024394,0.045786,-0.018962,0.007527,0.048972,-0.061540


In [63]:
X_train, X_test, y_train, y_test = train_test_split(avg_vectors, df_lessthan1['MIC'], test_size=0.2, random_state=42)

In [64]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [None, 80, 90, 100, 110],
    'max_features': ['auto', 2, 3],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring=mape_scorer)

In [65]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 1200 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   29.8s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 1961 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 3257 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 11.4min finished


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True],
                         'max_depth': [None, 80, 90, 100, 110],
                         'max_features': ['auto', 2, 3],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'min_samples_split': [2, 8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False),
             verbose=2)

In [66]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 80,
 'max_features': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 12,
 'n_estimators': 100}

In [72]:
best_grid = grid_search.best_estimator_

In [73]:
y_pred = best_grid.predict(X_test)

In [74]:
mean_squared_error(y_pred=y_pred, y_true=y_test)

0.05511788655142501

In [75]:
mean_absolute_error(y_pred=y_pred, y_true=y_test)

0.19289436706236848

In [76]:
mean_absolute_percentage_error(y_pred=y_pred, y_true=y_test)

1733.8658146266446

In [77]:
protvec_aa_freq = pd.concat([avg_vectors, aa_freq], axis=1)

In [78]:
protvec_aa_freq

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,M,N,P,Q,R,S,T,V,W,Y
0,0.000326,0.082828,-0.016972,0.033496,-0.029845,-0.009866,-0.002361,0.067565,0.043482,-0.015739,...,0.0,0.000000,0.000000,0.000000,0.090909,0.090909,0.000000,0.045455,0.045455,0.045455
1,-0.008685,0.061298,0.045503,0.042366,-0.056826,0.029961,0.062079,0.041146,0.023968,-0.025709,...,0.0,0.114286,0.028571,0.028571,0.057143,0.142857,0.000000,0.028571,0.028571,0.000000
2,-0.015213,0.069901,0.032177,0.035516,-0.057223,0.020554,0.045740,0.042754,0.030702,-0.020374,...,0.0,0.057143,0.028571,0.028571,0.057143,0.142857,0.000000,0.028571,0.028571,0.000000
3,-0.015084,0.040755,0.016946,0.028864,-0.062246,0.028597,0.051237,0.072327,0.030972,-0.013748,...,0.0,0.085714,0.000000,0.057143,0.057143,0.200000,0.000000,0.000000,0.057143,0.000000
4,0.003517,-0.026367,0.021674,0.054289,0.002337,-0.000452,0.038705,-0.030244,0.050282,-0.014623,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.333333,0.000000,0.000000,0.111111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,-0.038337,-0.017915,-0.015082,0.042641,0.046676,0.037965,-0.039406,0.050116,0.023495,0.058380,...,0.0,0.066667,0.066667,0.033333,0.100000,0.000000,0.033333,0.033333,0.033333,0.000000
211,-0.018060,0.002701,-0.002945,0.016760,0.020204,0.018499,0.028342,0.035244,-0.012086,0.028506,...,0.0,0.045455,0.090909,0.045455,0.136364,0.000000,0.136364,0.000000,0.090909,0.136364
212,-0.004227,0.033425,-0.024497,0.029364,-0.066315,0.013616,0.038417,0.074161,0.056543,-0.023648,...,0.0,0.083333,0.000000,0.111111,0.000000,0.111111,0.027778,0.000000,0.083333,0.027778
213,0.003485,-0.004911,0.027475,0.020559,0.002146,0.072764,0.055225,0.014650,0.019468,-0.001088,...,0.0,0.170213,0.148936,0.021277,0.000000,0.042553,0.021277,0.042553,0.042553,0.000000


In [79]:
X_train, X_test, y_train, y_test = train_test_split(protvec_aa_freq, df_lessthan1['MIC'], test_size=0.2, random_state=42)

In [80]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [None, 80, 90, 100, 110],
    'max_features': ['auto', 2, 3],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring=mape_scorer)

In [81]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 1200 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   35.4s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 1961 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 3257 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 18.3min finished


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True],
                         'max_depth': [None, 80, 90, 100, 110],
                         'max_features': ['auto', 2, 3],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'min_samples_split': [2, 8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False),
             verbose=2)

In [82]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 80,
 'max_features': 2,
 'min_samples_leaf': 3,
 'min_samples_split': 8,
 'n_estimators': 100}

In [83]:
best_grid = grid_search.best_estimator_

In [84]:
y_pred = best_grid.predict(X_test)

In [85]:
mean_squared_error(y_pred=y_pred, y_true=y_test)

0.058936185730447316

In [86]:
mean_absolute_error(y_pred=y_pred, y_true=y_test)

0.20207695498560402

In [87]:
mean_absolute_percentage_error(y_pred=y_pred, y_true=y_test)

1764.408322617052

### Physicochemical properties

In [11]:
shannon_entropy = pd.read_csv("data/pfeature/ha_avp_ic50_shannon_entropy.csv")
shannon_entropy = shannon_entropy.drop(['ID', 'Sequence'], axis=1)
residue_repeats = pd.read_csv("data/pfeature/ha_avp_ic50_residue_repeat.csv")
residue_repeats = residue_repeats.drop(['ID', 'Sequence'], axis=1).reset_index(drop=True)
sec_struct = pd.read_csv("data/pfeature/ha_avp_ic50_sec_struct.csv")
sec_struct = sec_struct.drop(['ID'], axis=1)

In [13]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [14]:
params = ['molecular_weight', 'aromaticity', 'instability_index',
          'isoelectric_point', 'helix', 'turn', 'sheet', 'with_reduced_cysteines',
          'with_disulfid_bridges', 'gravy', 'net_charge_at_pH7point4']

In [15]:
prop = []
for seq in df.Sequence:
    X = ProteinAnalysis(seq)
    molecular_weight = X.molecular_weight()
    aromaticity = X.aromaticity()
    instability_index = X.instability_index()
    isoelectric_point = X.isoelectric_point()
    sec_struc = X.secondary_structure_fraction()
    helix = sec_struc[0]
    turn = sec_struc[1]
    sheet = sec_struc[2]
    epsilon_prot = X.molar_extinction_coefficient()
    with_reduced_cysteines = epsilon_prot[0]
    with_disulfid_bridges = epsilon_prot[1]
    gravy = X.gravy() # hydrophobicity related
    # flexibility = X.flexibility()
    # X.protein_scale()
    net_charge_at_pH7point4 = X.charge_at_pH(7.4)

    prop.append([molecular_weight, aromaticity, instability_index, isoelectric_point, helix, turn, sheet,
                 with_reduced_cysteines, with_disulfid_bridges, gravy, net_charge_at_pH7point4])
prop = pd.DataFrame(prop, columns=params)

In [None]:
physicochem_prop = pd.concat([prop, shannon_entropy, residue_repeats, sec_struct], axis=1)