In [1]:
from tqdm import tqdm
import biovec
import numpy as np
import pandas as pd
from itertools import chain

In [2]:
from utils import *

In [3]:
from sklearn.svm import SVR
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

In [4]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [5]:
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

In [6]:
avp_ic50 = pd.read_csv("../data/raw/AVP-IC50Pred_train.csv")
ha_avp = pd.read_csv("../data/raw/HA_AVP.csv")

df = pd.concat([avp_ic50[['Sequence', 'MIC']], ha_avp], axis=0).drop_duplicates(['Sequence']).reset_index(drop=True)
df = sequence_filtering(df)

In [7]:
uniprot_embedding = biovec.models.load_protvec("../data/embeddings/uniprot__kmer_3_contextWindow_10_vector_100_reduction_None")

vectors = convert_sequences_to_avg_vectors(df['Sequence'], uniprot_embedding)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
Creating vectors: 100%|██████████| 712/712 [00:00<00:00, 1064.79sequence/s]


In [8]:
X_train, X_test, y_train, y_test = train_test_split(vectors, df['MIC'], test_size=0.2, random_state=42)

In [9]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [None, 80, 90, 100, 110],
    'max_features': ['auto', 2, 3],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring=mape_scorer)

In [10]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 1200 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   57.6s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed: 27.1min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed: 29.9min
[Parallel(n_jobs=-1)]: Done 1961 tasks      | elapsed: 44.4min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed: 61.9min
[Parallel(n_jobs=-1)]: Done 3257 tasks      | elapsed: 81.8min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 84.5min finished


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True],
                         'max_depth': [None, 80, 90, 100, 110],
                         'max_features': ['auto', 2, 3],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'min_samples_split': [2, 8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False),
             verbose=2)

In [11]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 100,
 'max_features': 'auto',
 'min_samples_leaf': 3,
 'min_samples_split': 2,
 'n_estimators': 100}

In [12]:
best_grid = grid_search.best_estimator_

In [13]:
y_pred = best_grid.predict(X_test)

In [14]:
mean_squared_error(y_pred=y_pred, y_true=y_test)

3645.791203752795

In [15]:
mean_absolute_error(y_pred=y_pred, y_true=y_test)

36.63066593565872

In [16]:
mean_absolute_percentage_error(y_pred=y_pred, y_true=y_test)

2771813.3754515215