## AMP - Testing Modelling

In [1]:
import os
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import GridSearchCV, train_test_split,  StratifiedKFold

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
# To visualize pipeline diagram - 'text', or 'diagram'
from sklearn import set_config

# Import XGBoost
from xgboost import XGBClassifier

import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd
import seaborn as sns
# Classification metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score

# Single seed reused by the train/test split and by every estimator in the pipelines.
random_state = 10

In [2]:
# The project's helper modules are imported from ../../scripts, so that folder
# is appended to the module search path before importing them.
import sys  
sys.path.append('../../scripts')

import file_utilities as fu
import modelling_utilities as mu

#### Set the task

In [3]:
# Task to run; the listed options are 'acp', 'amp' and 'dna_binding'.
task = 'amp'

#### Get embedding folders and fasta files for the task.

In [4]:
# Look up the embedding folders and the fasta file(s) for the chosen task.
pt_folders, fa_files = mu.get_emb_folders(task)

In [5]:
# Show the embedding folders returned for the task.
pt_folders

['../../data/amp/esm/all_data/amp_all_esm1b_mean',
 '../../data/amp/esm/all_data/amp_all_esm1v_mean',
 '../../data/amp/prose/all_data/amp_all_dlm_avg',
 '../../data/amp/prose/all_data/amp_all_dlm_max',
 '../../data/amp/prose/all_data/amp_all_dlm_sum',
 '../../data/amp/prose/all_data/amp_all_mt_avg',
 '../../data/amp/prose/all_data/amp_all_mt_max',
 '../../data/amp/prose/all_data/amp_all_mt_sum']

In [6]:
# Show the fasta file(s) returned for the task.
fa_files

['../../data/amp/all_data.fa']

### Define Pipelines

In [7]:
# One pipeline per classifier; each one applies MinMaxScaler to the features
# before handing them to the estimator. Key order is kept as xgb, lr, svm, rf.
pipelines = {
    model_key: make_pipeline(MinMaxScaler(), estimator)
    for model_key, estimator in (
        ('xgb', XGBClassifier(random_state=random_state)),
        ('lr', LogisticRegression(max_iter=25000, random_state=random_state)),
        ('svm', SVC(random_state=random_state)),
        ('rf', RandomForestClassifier(random_state=random_state)),
    )
}

### Define Hyperparameter Grids

In [8]:
# Hyperparameter grids, one per pipeline. Each key is '<step name>__<parameter>',
# where the step name is the one make_pipeline gives the estimator
# (its lowercased class name).
xgb_grid = {
    'xgbclassifier__max_depth': [3, 5],
    'xgbclassifier__n_estimators': [100, 200],
}

svm_grid = {
    'svc__kernel': ['linear', 'rbf'],
    'svc__C': [0.01, 0.1, 1],
    # Not searched at the moment:
    # 'svc__gamma': [5, 1, 0.1],
}

rf_grid = {
    'randomforestclassifier__n_estimators': [100, 150],
    'randomforestclassifier__min_samples_leaf': [1, 3],
    'randomforestclassifier__min_samples_split': [2, 3],
    # Not searched at the moment:
    # 'randomforestclassifier__max_features': ['sqrt', 0.33],
    # 'randomforestclassifier__criterion': ['gini', 'entropy'],
}

lr_grid = {
    'logisticregression__C': [0.1, 1],
    'logisticregression__solver': ['lbfgs', 'saga'],
}


In [9]:
# Map each pipeline key to its hyperparameter grid (same keys as `pipelines`).
hp_grids = dict(lr=lr_grid, svm=svm_grid, rf=rf_grid, xgb=xgb_grid)

In [14]:
# Preview, for every embeddings folder, the folder path, the fasta file it is
# paired with, and the pooling operation encoded as the last '_' token of the
# folder name.
for emb_dir in pt_folders:
    fasta_path = fa_files[0]
    pooling = os.path.basename(emb_dir).rsplit('_', 1)[-1]
    print(emb_dir, fasta_path, pooling, '\n')

../../data/amp/esm/all_data/amp_all_esm1b_mean ../../data/amp/all_data.fa mean 

../../data/amp/esm/all_data/amp_all_esm1v_mean ../../data/amp/all_data.fa mean 

../../data/amp/prose/all_data/amp_all_dlm_avg ../../data/amp/all_data.fa avg 

../../data/amp/prose/all_data/amp_all_dlm_max ../../data/amp/all_data.fa max 

../../data/amp/prose/all_data/amp_all_dlm_sum ../../data/amp/all_data.fa sum 

../../data/amp/prose/all_data/amp_all_mt_avg ../../data/amp/all_data.fa avg 

../../data/amp/prose/all_data/amp_all_mt_max ../../data/amp/all_data.fa max 

../../data/amp/prose/all_data/amp_all_mt_sum ../../data/amp/all_data.fa sum 



## Modelling Loop

In [10]:
# How many embedding folders to model. 1 keeps this a quick test run on the
# first folder only; set to None to model every folder in pt_folders.
N_FOLDERS = 1

# Evaluation dataframes, one per embeddings folder, keyed 'env_<model>_<pool>'.
df_models = {}

# Fail early with a clear message rather than an IndexError further down.
if not pt_folders or not fa_files:
    raise ValueError(f'No embedding folders or fasta files found for task "{task}"')

# The first fasta file is paired with every embeddings folder.
path_fa = fa_files[0]

for path_pt in pt_folders[:N_FOLDERS]:

    # Folder names end in '<model>_<pool>', e.g. 'amp_all_esm1b_mean'.
    folder_name = os.path.basename(path_pt)
    pool = folder_name.split('_')[-1]
    # NOTE(review): 33 / 'layer' are interpreted by fu.read_embeddings - confirm there.
    emb_layer = 33 if 'esm' in path_pt else 'layer'

    # Read the embeddings and labels for the full data set (before any split).
    # sequence_id_train is returned alongside X and y but is not used below.
    X, y, sequence_id_train = fu.read_embeddings(path_fa, path_pt, pool, emb_layer,
                                                 print_dims=False)

    # Stratified 75/25 train-test split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        random_state=random_state,
                                                        stratify=y)

    # Suffix for the evaluation dataframe key: the folder name without its first
    # two '_' tokens ('amp_all_esm1b_mean' -> 'esm1b_mean').
    # NOTE(review): assumes the task prefix contains no '_' - confirm for 'dna_binding'.
    df_ext = folder_name.split('_', 2)[2]

    # Printing header
    ptm = df_ext.split('_')[0]
    ptr = 'ESM' if 'esm' in ptm else 'ProSE'
    print('-' * 75)
    print(f'\tPretrained Model "{ptm}" by {ptr} - Pooling Operation: "{pool}"')
    print('-' * 75)

    # Grid search and fit on the training split only.
    fitted_models, cv_results = mu.fit_tune_CV(pipelines, hp_grids, 'accuracy',
                                               path_pt, X_train, y_train, task)

    # Evaluate the fitted models on the held-out test split.
    df_models[f'env_{df_ext}'] = mu.evaluation(fitted_models, X_test, y_test)
  

---------------------------------------------------------------------------
	Pretrained Model "esm1b" by ESM - Pooling Operation: "mean"
---------------------------------------------------------------------------
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=200,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=10,
              reg_alpha=0, reg_lambda=1, ...)
esm1b_mean_xgb has been fitted and saved
LogisticRegression(C=1, max_iter=15000, random

In [11]:
# Inspect the evaluation dataframes collected by the modelling loop.
df_models

{'env_esm1b_mean':                 best_score  f1_macro  accuracy
 model                                         
 esm1b_mean_xgb    0.938307  0.932668  0.932740
 esm1b_mean_lr     0.936984  0.933671  0.933729
 esm1b_mean_svm    0.934677  0.933663  0.933729
 esm1b_mean_rf     0.931046  0.931555  0.931751}

In [12]:
# Evaluation table for the 'esm1b' embeddings with 'mean' pooling.
df_models['env_esm1b_mean']

Unnamed: 0_level_0,best_score,f1_macro,accuracy
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
esm1b_mean_xgb,0.938307,0.932668,0.93274
esm1b_mean_lr,0.936984,0.933671,0.933729
esm1b_mean_svm,0.934677,0.933663,0.933729
esm1b_mean_rf,0.931046,0.931555,0.931751


In [13]:
# Only folders processed by the modelling loop have an entry in df_models, so
# use .get() to show nothing (None) instead of raising KeyError when the
# 'esm1v' folder has not been run.
df_models.get('env_esm1v_mean')

## Collecting Evaluation Results into a DataFrame

#### Merge all dataframes from the dictionary `df_models`

In [17]:
# Create one dataframe with the evaluations of all models.
if not df_models:
    raise ValueError('df_models is empty - run the modelling loop before merging results')

# Concatenate every per-folder dataframe in a single call (rather than growing
# a dataframe inside a loop). The 'model' index is turned into a column first
# and restored as the index of the combined dataframe afterwards.
eval_df_all = (
    pd.concat([eval_df.reset_index() for eval_df in df_models.values()])
    .set_index('model')
)

#### Display the results.

In [None]:
# Display the merged evaluation dataframe (indexed by 'model').
eval_df_all