In [1]:
cd ../../src

/Users/in-divye.singh/Documents/Projects/MIC_predictor/src


In [23]:
import biovec
import numpy as np
import pandas as pd
from itertools import chain, combinations
from collections import Counter

from utils import *

from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [3]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

In [4]:
avp_ic50 = pd.read_csv("../data/raw/AVP-IC50Pred_train.csv")
ha_avp = pd.read_csv("../data/raw/HA_AVP.csv")

In [5]:
df = pd.concat([avp_ic50[['Sequence','MIC']], ha_avp], axis=0).drop_duplicates(['Sequence']).reset_index(drop=True)
df = sequence_filtering(df)

In [6]:
df

Unnamed: 0,Sequence,MIC
0,AAQRRGRVGRNPNQVGD,442.00000
1,HRILARIRQMMT,435.50000
2,RNPSQVGD,383.00000
3,RVGRNPNQVGD,374.00000
4,AAQRRGRIGRNPSQVGD,358.00000
...,...,...
707,NGAICWGPCPTAFRQIGNCGHFKVRCCKIR,0.11515
708,CFPYITRPGTYHDWWYTRKNRQ,0.30000
709,YTSLIHSLIEESQNQQEKNEQELLELDKWASLWNWF,0.01131
710,GTNLSVPNPLGFFPDHQLDPAFGANSNNPDWDFNPNKDHWPEANKVG,0.00008


In [7]:
def get_physicochemical_properties(df):
    params = ['aromaticity', 'helix', 'turn', 'sheet', 'gravy', 'net_charge_at_pH7point4']

    prop = []
    for seq in df.Sequence:
        X = ProteinAnalysis(seq)
        aromaticity = X.aromaticity()
        sec_struc = X.secondary_structure_fraction()
        helix = sec_struc[0]
        turn = sec_struc[1]
        sheet = sec_struc[2]
        gravy = X.gravy() # hydrophobicity related
        net_charge_at_pH7point4 = X.charge_at_pH(7.4)

        prop.append([aromaticity, helix, turn, sheet, gravy, net_charge_at_pH7point4])
    return pd.DataFrame(prop, columns=params)

In [8]:
aa_freq = reduce_by_kmer_frequency(df)

In [9]:
uniprot_embedding = biovec.models.load_protvec("../data/embeddings/uniprot__kmer_3_contextWindow_10_vector_100_reduction_None")

avg_protvec = convert_sequences_to_avg_vectors(df['Sequence'], uniprot_embedding, kmer=3)
avg_protvec = avg_protvec.reset_index(drop=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
Creating vectors: 100%|██████████| 712/712 [00:00<00:00, 1134.56sequence/s]


In [10]:
physicochemical_prop = get_physicochemical_properties(df)

In [11]:
X = pd.concat([aa_freq, avg_protvec, physicochemical_prop[['helix','turn','sheet']]], axis=1)

In [12]:
y = df['MIC']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [None, 80, 90, 100, 110],
    'max_features': ['auto', 2, 3],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring=mape_scorer)

In [25]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 1200 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed: 13.4min
[Parallel(n_jobs=-1)]: Done 1961 tasks      | elapsed: 20.3min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed: 28.2min
[Parallel(n_jobs=-1)]: Done 3257 tasks      | elapsed: 36.2min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 37.2min finished


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True],
                         'max_depth': [None, 80, 90, 100, 110],
                         'max_features': ['auto', 2, 3],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'min_samples_split': [2, 8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False),
             verbose=2)

In [26]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 100,
 'max_features': 'auto',
 'min_samples_leaf': 3,
 'min_samples_split': 2,
 'n_estimators': 100}

In [27]:
best_grid = grid_search.best_estimator_

In [28]:
y_pred = best_grid.predict(X_test)

In [29]:
mean_absolute_percentage_error(y_test, y_pred)

4107993.486153219

In [30]:
from scipy.stats import pearsonr

In [31]:
pearsonr(y_test, y_pred)

(0.5461122270142738, 1.7328486501462713e-12)

In [32]:
ape = 100*np.abs(y_test-y_pred)/y_test

In [33]:
list(zip(y_test, y_pred, ape))

[(0.79, 3.4222041096681113, 333.19039362887486),
 (3.52, 10.354519251082245, 194.16247872392742),
 (23.5, 33.060188653540905, 40.681653844854914),
 (13.0, 6.628704503228716, 49.0099653597791),
 (2.0, 2.1931448141955263, 9.657240709776316),
 (2.02, 24.362031419696965, 1106.041159390939),
 (20.0, 25.087989780699854, 25.439948903499268),
 (21.5, 30.446322477269113, 41.610802219856346),
 (100.0, 35.35089322799421, 64.64910677200578),
 (2.0, 2.8983153881673895, 44.91576940836948),
 (160.0, 39.81454272813851, 75.11591079491343),
 (54.4, 71.82567693972138, 32.032494374487825),
 (100.0, 36.77939858082251, 63.22060141917749),
 (0.006999999999999999, 2.7031973863561434, 38517.10551937348),
 (0.597, 6.464236116500166, 982.7866191792574),
 (0.026000000000000002, 37.832245446588416, 145408.63633303234),
 (118.0, 78.18551060694084, 33.74109270598234),
 (3.38, 22.22963790591631, 557.6815948495951),
 (8.0, 10.834700611111106, 35.43375763888883),
 (6.8, 6.5444870039682534, 3.757544059290388),
 (1.34, 1

In [35]:
param_grid = {
    'C':[0.001,0.01,0.1,1,10,100,1000],
    'kernel':['rbf','poly','sigmoid','linear'],
    'degree':[1,2,3,4,5,6],
    'gamma': [10, 1, 0.1, 0.01, 0.001, 0.0001]
}
svr = SVR()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = svr, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring=mape_scorer)

In [36]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1008 candidates, totalling 5040 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 204 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 610 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 1176 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 1906 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 2796 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 3850 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done 5040 out of 5040 | elapsed:   36.1s finished


GridSearchCV(cv=5, estimator=SVR(), n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'degree': [1, 2, 3, 4, 5, 6],
                         'gamma': [10, 1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf', 'poly', 'sigmoid', 'linear']},
             scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False),
             verbose=2)

In [37]:
grid_search.best_params_

{'C': 10, 'degree': 1, 'gamma': 1, 'kernel': 'rbf'}

In [38]:
best_grid = grid_search.best_estimator_

In [39]:
y_pred = best_grid.predict(X_test)

In [40]:
mean_absolute_percentage_error(y_test, y_pred)

2074026.1243741622

In [41]:
from scipy.stats import pearsonr

In [42]:
pearsonr(y_test, y_pred)

(0.4311805449345907, 7.614421999300924e-08)

In [21]:
ape = 100*np.abs(y_test-y_pred)/y_test

In [45]:
np.corrcoef(y_test, y_pred)

array([[1.        , 0.43118054],
       [0.43118054, 1.        ]])