In [1]:
from tqdm import tqdm
import biovec
import numpy as np
import pandas as pd
from itertools import chain

In [2]:
from notebook.utils import *

In [3]:
from sklearn.svm import SVR
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

In [4]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [5]:
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

In [7]:
avp_ic50 = pd.read_csv("data/raw/AVP-IC50Pred_train.csv")
ha_avp = pd.read_csv("data/raw/HA_AVP.csv")

df = pd.concat([avp_ic50[['Sequence', 'MIC']], ha_avp], axis=0).drop_duplicates(['Sequence']).reset_index(drop=True)
df = sequence_filtering(df)

FileNotFoundError: [Errno 2] File data/raw/AVP-IC50Pred_train.csv does not exist: 'data/raw/AVP-IC50Pred_train.csv'

### Physico-chemical properties

In [None]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [None]:
params = ['molecular_weight', 'aromaticity', 'instability_index',
          'isoelectric_point', 'helix', 'turn', 'sheet', 'with_reduced_cysteines',
          'with_disulfid_bridges', 'gravy', 'net_charge_at_pH7point4']

In [None]:
prop = []
for seq in df.Sequence:
    X = ProteinAnalysis(seq)
    molecular_weight = X.molecular_weight()
    aromaticity = X.aromaticity()
    instability_index = X.instability_index()
    isoelectric_point = X.isoelectric_point()
    sec_struc = X.secondary_structure_fraction()
    helix = sec_struc[0]
    turn = sec_struc[1]
    sheet = sec_struc[2]
    epsilon_prot = X.molar_extinction_coefficient()
    with_reduced_cysteines = epsilon_prot[0]
    with_disulfid_bridges = epsilon_prot[1]
    gravy = X.gravy() # hydrophobicity related
    # flexibility = X.flexibility()
    # X.protein_scale()
    net_charge_at_pH7point4 = X.charge_at_pH(7.4)

    prop.append([molecular_weight, aromaticity, instability_index, isoelectric_point, helix, turn, sheet,
                 with_reduced_cysteines, with_disulfid_bridges, gravy, net_charge_at_pH7point4])
prop = pd.DataFrame(prop, columns=params)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()
prop_transformed = pd.DataFrame(scaler.fit_transform(prop)).reset_index(drop=True)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(prop, df['MIC'], test_size=0.2, random_state=42)

In [None]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [None, 80, 90, 100, 110],
    'max_features': ['auto', 2, 3],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring=mape_scorer)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
best_grid = grid_search.best_estimator_

In [None]:
y_pred = best_grid.predict(X_test)

In [None]:
mean_squared_error(y_pred=y_pred, y_true=y_test)

In [None]:
mean_absolute_error(y_pred=y_pred, y_true=y_test)

In [None]:
mean_absolute_percentage_error(y_pred=y_pred, y_true=y_test)

### Physico-chemical properties + averaged protvec

In [None]:
uniprot_embedding = biovec.models.load_protvec("../data/embeddings/uniprot__kmer_3_contextWindow_10_vector_100_reduction_None")

avg_vectors = convert_sequences_to_avg_vectors(df['Sequence'], uniprot_embedding)
avg_vectors = avg_vectors.reset_index(drop=True)

In [None]:
prop_avg_vec = pd.concat([prop, avg_vectors], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(prop_avg_vec, df['MIC'], test_size=0.2, random_state=42)

In [None]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [None, 80, 90, 100, 110],
    'max_features': ['auto', 2, 3],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring=mape_scorer)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
best_grid = grid_search.best_estimator_

In [None]:
y_pred = best_grid.predict(X_test)

In [None]:
mean_squared_error(y_pred=y_pred, y_true=y_test)

In [None]:
mean_absolute_error(y_pred=y_pred, y_true=y_test)

In [None]:
mean_absolute_percentage_error(y_pred=y_pred, y_true=y_test)

In [None]:
import os

In [None]:
os.path.exists('../../test.csv')

In [None]:
pwd

In [None]:
prop