In [1]:
from tqdm import tqdm
import biovec
import numpy as np
import pandas as pd
from itertools import chain

In [2]:
from utils import *

In [3]:
from sklearn.svm import SVR
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

In [4]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [5]:
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

In [6]:
avp_ic50 = pd.read_csv("../data/raw/AVP-IC50Pred_train.csv")
ha_avp = pd.read_csv("../data/raw/HA_AVP.csv")

df = pd.concat([avp_ic50[['Sequence', 'MIC']], ha_avp], axis=0).drop_duplicates(['Sequence']).reset_index(drop=True)
df = sequence_filtering(df)

In [7]:
uniprot_embedding = biovec.models.load_protvec("../data/embeddings/uniprot__kmer_3_contextWindow_10_vector_100_reduction_None")

vectors = convert_sequences_to_avg_vectors(df['Sequence'], uniprot_embedding)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
Creating vectors: 100%|██████████| 712/712 [00:00<00:00, 1144.81sequence/s]


In [8]:
from scipy.stats import boxcox
from scipy.special import inv_boxcox

In [9]:
mic_boxcox, lmbda = boxcox(df['MIC'])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(vectors, mic_boxcox, test_size=0.2, random_state=42)

In [11]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [None, 80, 90, 100, 110],
    'max_features': ['auto', 2, 3],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring=mape_scorer)

In [12]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 1200 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 1961 tasks      | elapsed: 14.7min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed: 19.6min
[Parallel(n_jobs=-1)]: Done 3257 tasks      | elapsed: 24.7min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 25.6min finished
  array_means[:, np.newaxis]) ** 2,


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True],
                         'max_depth': [None, 80, 90, 100, 110],
                         'max_features': ['auto', 2, 3],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'min_samples_split': [2, 8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False),
             verbose=2)

In [13]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

In [14]:
best_grid = grid_search.best_estimator_

In [15]:
y_pred_boxcox = best_grid.predict(X_test)

In [16]:
y_pred = inv_boxcox(y_pred_boxcox, lmbda)

In [22]:
y_test_act = inv_boxcox(y_test, lmbda)

In [24]:
mean_squared_error(y_pred=y_pred, y_true=y_test_act)

4811.394762677909

In [25]:
mean_absolute_error(y_pred=y_pred, y_true=y_test_act)

32.959935577266556

In [26]:
mean_absolute_percentage_error(y_pred=y_pred, y_true=y_test_act)

1029463.7480233541

In [34]:
abs_perc_err = (abs(y_test_act - y_pred)/y_test_act)*100

In [36]:
list(zip(y_test_act, y_pred, abs_perc_err))

[(0.79, 1.9437657501757504, 146.0462974906013),
 (3.5200000000000005, 4.643262288651999, 31.91086047306814),
 (23.500000000000025, 14.127687707936142, 39.882179966229245),
 (13.0, 1.491247228376221, 88.52886747402907),
 (2.0000000000000004, 0.5692181163460852, 71.53909418269575),
 (2.02, 6.879034819414229, 240.54627818882318),
 (19.99999999999999, 13.097289563456949, 34.51355218271522),
 (21.500000000000018, 10.402419168623618, 51.61665502965763),
 (100.00000000000004, 18.762996019612864, 81.23700398038716),
 (2.0000000000000004, 0.7475679235729555, 62.62160382135223),
 (159.99999999999994, 19.289105201520627, 87.94430924904961),
 (54.40000000000004, 19.504564784789647, 64.14602061619553),
 (100.00000000000004, 12.282348765710818, 87.71765123428918),
 (0.006999999999999991, 0.025243425176509574, 260.620359664423),
 (0.5969999999999999, 0.05793794086694925, 90.29515228359308),
 (0.025999999999999985, 7.95696234445999, 30503.701324846134),
 (118.00000000000014, 16.41965872604659, 86.0850