In [1]:
from tqdm import tqdm
import biovec
import numpy as np
import pandas as pd
from itertools import chain

In [2]:
from utils import *

In [3]:
from sklearn.svm import SVR
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

In [4]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [5]:
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

In [6]:
avp_ic50 = pd.read_csv("../data/raw/AVP-IC50Pred_train.csv")
ha_avp = pd.read_csv("../data/raw/HA_AVP.csv")

df = pd.concat([avp_ic50[['Sequence', 'MIC']], ha_avp], axis=0).drop_duplicates(['Sequence']).reset_index(drop=True)
df = sequence_filtering(df)

### Amino acid frequency

In [7]:
############# Amino acid frequency #############
aa_freq = reduce_by_kmer_frequency(df).sort_index(axis=1)

In [8]:
aa_freq

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
0,0.117647,0.000000,0.058824,0.000000,0.000000,0.176471,0.000000,0.000000,0.000000,0.000000,0.000000,0.117647,0.058824,0.117647,0.235294,0.000000,0.000000,0.117647,0.000000,0.000000
1,0.083333,0.000000,0.000000,0.000000,0.000000,0.000000,0.083333,0.166667,0.000000,0.083333,0.166667,0.000000,0.000000,0.083333,0.250000,0.000000,0.083333,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.125000,0.000000,0.000000,0.125000,0.000000,0.000000,0.000000,0.000000,0.000000,0.125000,0.125000,0.125000,0.125000,0.125000,0.000000,0.125000,0.000000,0.000000
3,0.000000,0.000000,0.090909,0.000000,0.000000,0.181818,0.000000,0.000000,0.000000,0.000000,0.000000,0.181818,0.090909,0.090909,0.181818,0.000000,0.000000,0.181818,0.000000,0.000000
4,0.117647,0.000000,0.058824,0.000000,0.000000,0.176471,0.000000,0.058824,0.000000,0.000000,0.000000,0.058824,0.058824,0.117647,0.235294,0.058824,0.000000,0.058824,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,0.066667,0.166667,0.000000,0.000000,0.066667,0.133333,0.033333,0.100000,0.066667,0.000000,0.000000,0.066667,0.066667,0.033333,0.100000,0.000000,0.033333,0.033333,0.033333,0.000000
708,0.000000,0.045455,0.045455,0.000000,0.045455,0.045455,0.045455,0.045455,0.045455,0.000000,0.000000,0.045455,0.090909,0.045455,0.136364,0.000000,0.136364,0.000000,0.090909,0.136364
709,0.027778,0.000000,0.027778,0.166667,0.027778,0.000000,0.027778,0.055556,0.055556,0.166667,0.000000,0.083333,0.000000,0.111111,0.000000,0.111111,0.027778,0.000000,0.083333,0.027778
710,0.063830,0.000000,0.106383,0.021277,0.085106,0.085106,0.042553,0.000000,0.042553,0.063830,0.000000,0.170213,0.148936,0.021277,0.000000,0.042553,0.021277,0.042553,0.042553,0.000000


In [53]:
X_train, X_test, y_train, y_test = train_test_split(aa_freq, df['MIC'], test_size=0.2, random_state=42)

In [54]:
X_train.shape, X_test.shape

((569, 20), (143, 20))

In [55]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [None, 80, 90, 100, 110],
    'max_features': ['auto', 2, 3],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring=mape_scorer)

In [56]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 1200 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   31.7s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 1961 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done 3257 tasks      | elapsed: 17.3min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 18.9min finished


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True],
                         'max_depth': [None, 80, 90, 100, 110],
                         'max_features': ['auto', 2, 3],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'min_samples_split': [2, 8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False),
             verbose=2)

In [57]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 100,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 12,
 'n_estimators': 100}

In [58]:
best_grid = grid_search.best_estimator_

In [59]:
y_pred = best_grid.predict(X_test)

In [60]:
mean_squared_error(y_pred=y_pred, y_true=y_test)

2597.27446707504

In [61]:
mean_absolute_error(y_pred=y_pred, y_true=y_test)

32.453585672231924

In [62]:
mean_absolute_percentage_error(y_pred=y_pred, y_true=y_test)

7593631.210854171

In [63]:
list(zip(y_test, y_pred))

[(0.79, 2.018852304550799),
 (3.52, 10.712942831309299),
 (23.5, 27.45219120503155),
 (13.0, 7.433246882957075),
 (2.0, 3.6416490222416567),
 (2.02, 27.65092008286623),
 (20.0, 14.552816210817536),
 (21.5, 18.88854133430531),
 (100.0, 51.673077413617136),
 (2.0, 1.7979614730142055),
 (160.0, 34.36659837975608),
 (54.4, 67.66182585190553),
 (100.0, 35.58160974328284),
 (0.006999999999999999, 0.35905439811914314),
 (0.597, 1.892415887868338),
 (0.026000000000000002, 46.32432267804663),
 (118.0, 131.50139049755558),
 (3.38, 34.049513769454094),
 (8.0, 14.651683336982718),
 (6.8, 2.8825029055319358),
 (1.34, 12.64190961010057),
 (24.6, 23.220844013914135),
 (0.255, 6.011388849520255),
 (0.62, 6.26458727435579),
 (100.0, 39.20268130476415),
 (0.001, 0.829832318245407),
 (117.0, 95.5682335114649),
 (0.75, 54.446626380699975),
 (10.0, 14.510006864622078),
 (5.47, 11.249712864502841),
 (200.0, 48.39115883982813),
 (41.1, 31.7951546585322),
 (10.5, 33.180508960090904),
 (0.003, 3.75977381168893

### Prot2Vec

In [64]:
############# Prot2Vec #############
uniprot_embedding = biovec.models.load_protvec("../data/embeddings/uniprot__kmer_3_contextWindow_10_vector_100_reduction_None")

vectors, _ = convert_sequences_to_vectors(df['Sequence'], uniprot_embedding, words_to_vec, kmer=3)
vectors = vectors.reset_index(drop=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
Creating vectors: 100%|██████████| 712/712 [00:01<00:00, 397.31sequence/s]


In [65]:
vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.035865,0.056587,-0.912851,0.710750,0.600315,0.412987,0.214245,0.087536,-0.006778,-0.553902,...,0.234127,0.001601,0.024704,0.530668,-0.046863,-0.315385,-0.518008,0.482755,-0.339732,0.713406
1,-0.792693,-0.036186,-0.534990,-0.369818,-0.248938,0.263291,-0.287422,0.382745,-0.394377,-0.173330,...,0.962917,-0.311649,0.264908,0.946307,-0.269523,-0.548274,0.092901,-0.110641,0.260107,0.217777
2,-0.026501,-0.291550,-0.201532,0.452551,0.174593,0.538034,0.044253,-0.211522,0.271494,-0.058814,...,0.417982,-0.270066,-0.058481,-0.181214,0.178602,0.231382,-0.294070,-0.129272,-0.005186,0.004585
3,-0.143584,-0.150013,-0.288958,0.442270,0.260510,0.756453,0.238232,0.169671,0.388300,0.292154,...,0.072858,-0.408436,0.111759,-0.007251,-0.259025,-0.079575,-0.197588,0.014973,-0.254488,0.185948
4,0.033022,-0.003353,-0.648364,0.625699,0.353737,0.439275,-0.089358,-0.031649,-0.188973,-0.858152,...,0.604532,-0.018100,0.191170,0.319797,0.211116,0.092421,-0.444209,-0.073724,0.109172,0.685706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,-1.073433,-0.501622,-0.422303,1.193956,1.306920,1.063010,-1.103380,1.403239,0.657871,1.634630,...,0.178000,0.022571,0.205378,-0.446653,1.526093,1.593848,-1.880108,0.481448,1.041668,-1.731232
708,-0.361206,0.054028,-0.058891,0.335209,0.404084,0.369986,0.566835,0.704883,-0.241717,0.570128,...,0.412215,0.215851,0.387135,0.530442,0.762547,0.872716,0.294733,0.212151,1.096743,-0.768404
709,-0.143712,1.136464,-0.832907,0.998365,-2.254707,0.462950,1.306185,2.521483,1.922461,-0.804041,...,1.742563,-1.963696,0.209050,0.058067,0.555618,1.645272,-0.723226,-0.605143,2.387918,-1.566553
710,0.156833,-0.221012,1.236367,0.925153,0.096579,3.274387,2.485103,0.659247,0.876062,-0.048977,...,1.047525,-0.615316,-0.329523,-0.631238,-1.097751,2.060387,-0.853296,0.338695,2.203732,-2.769299


In [66]:
X_train, X_test, y_train, y_test = train_test_split(vectors, df['MIC'], test_size=0.2, random_state=42)

In [67]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [None, 80, 90, 100, 110],
    'max_features': ['auto', 2, 3],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring=mape_scorer)

In [68]:
grid_search.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 1200 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   44.4s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed: 22.9min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed: 25.2min
[Parallel(n_jobs=-1)]: Done 1961 tasks      | elapsed: 38.0min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed: 52.8min
[Parallel(n_jobs=-1)]: Done 3257 tasks      | elapsed: 68.1min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 69.5min finished


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True],
                         'max_depth': [None, 80, 90, 100, 110],
                         'max_features': ['auto', 2, 3],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'min_samples_split': [2, 8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False),
             verbose=2)

In [69]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 90,
 'max_features': 'auto',
 'min_samples_leaf': 5,
 'min_samples_split': 8,
 'n_estimators': 100}

In [70]:
best_grid = grid_search.best_estimator_

In [71]:
y_pred = best_grid.predict(X_test)

In [72]:
mean_squared_error(y_pred=y_pred, y_true=y_test)

3305.244344633785

In [73]:
mean_absolute_error(y_pred=y_pred, y_true=y_test)

34.62796288814881

In [74]:
mean_absolute_percentage_error(y_pred=y_pred, y_true=y_test)

4057040.5750611406