## ACP - Testing Modelling

In [1]:
import os
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import GridSearchCV, train_test_split,  StratifiedKFold

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
# To visualize pipeline diagram - 'text', or 'diagram'
from sklearn import set_config

# Import XGBoost
from xgboost import XGBClassifier

import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd
import seaborn as sns
# Classification metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score

# Seed handed to every classifier in the pipelines below (XGBoost, logistic
# regression, SVM, random forest) so that repeated runs use the same seed.
random_state = 10

In [2]:
# Make the project's helper modules (kept in a sibling 'scripts' folder)
# importable from this notebook.
import sys

scripts_dir = '../../scripts'
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

import file_utilities as fu
import modelling_utilities as mu

#### Set the task

In [3]:
# Task this notebook runs on; listed options: 'acp', 'amp', 'dna_binding'.
# The value is passed to mu.get_emb_folders and mu.fit_tune_CV further down.
task = 'acp'

#### Get embedding folders and fasta files for the task.

In [4]:
# Look up the embedding folders and FASTA files for `task`.
# As the next two cells display, `pt_folders` is a list of
# (train_folder, test_folder) tuples, one per embedding variant, and
# `fa_files` is a list holding one (train_fasta, test_fasta) tuple.
pt_folders, fa_files = mu.get_emb_folders(task)

In [5]:
# Show the (train, test) embedding folder pairs found for the task.
pt_folders

[('../../data/acp/esm/train/acp_train_esm1b_mean',
  '../../data/acp/esm/test/acp_test_esm1b_mean'),
 ('../../data/acp/esm/train/acp_train_esm1v_mean',
  '../../data/acp/esm/test/acp_test_esm1v_mean'),
 ('../../data/acp/prose/train/acp_train_dlm_avg',
  '../../data/acp/prose/test/acp_test_dlm_avg'),
 ('../../data/acp/prose/train/acp_train_dlm_max',
  '../../data/acp/prose/test/acp_test_dlm_max'),
 ('../../data/acp/prose/train/acp_train_dlm_sum',
  '../../data/acp/prose/test/acp_test_dlm_sum'),
 ('../../data/acp/prose/train/acp_train_mt_avg',
  '../../data/acp/prose/test/acp_test_mt_avg'),
 ('../../data/acp/prose/train/acp_train_mt_max',
  '../../data/acp/prose/test/acp_test_mt_max'),
 ('../../data/acp/prose/train/acp_train_mt_sum',
  '../../data/acp/prose/test/acp_test_mt_sum')]

In [6]:
# Show the (train, test) FASTA file pair found for the task.
fa_files

[('../../data/acp/train_data.fa', '../../data/acp/test_data.fa')]

### Define Pipelines

In [7]:
# Number of principal components kept by the PCA step of every pipeline.
num_pca_components = 1000

# Final estimator of each pipeline, keyed by the short model name that is
# also used as the key in the hyperparameter-grid dictionary.
classifiers = {
    'xgb': XGBClassifier(random_state=random_state),
    'lr': LogisticRegression(max_iter=25000, random_state=random_state),
    'svm': SVC(random_state=random_state),
    'rf': RandomForestClassifier(random_state=random_state),
}

# Every pipeline has the same shape: min-max scaling -> PCA -> classifier.
# A fresh scaler and PCA instance is created per pipeline so that no fitted
# state is shared between models.
# PCA is seeded as well: depending on the data shape scikit-learn may pick a
# randomized SVD solver, and without a seed the projected features (and the
# downstream scores) could differ from run to run.
pipelines = {
    name: make_pipeline(
        MinMaxScaler(),
        PCA(n_components=num_pca_components, random_state=random_state),
        classifier,
    )
    for name, classifier in classifiers.items()
}

In [8]:
# pipelines = {
#     'xgb' : make_pipeline(MinMaxScaler(), 
#                           XGBClassifier(random_state=random_state)),
#     'lr' : make_pipeline(MinMaxScaler(),
#                          LogisticRegression(max_iter=25000, random_state=random_state))
# }

### Define Hyperparameter Grids

In [8]:
# Hyperparameter search spaces, one per pipeline.
# Keys use the '<pipeline step name>__<parameter name>' form.

xgb_grid = dict(
    xgbclassifier__max_depth=[3, 5],
    xgbclassifier__n_estimators=[100, 200],
)

svm_grid = dict(
    svc__kernel=['linear', 'rbf'],
    svc__C=[0.01, 0.1, 1],
    # svc__gamma=[5, 1, 0.1],
)

rf_grid = dict(
    randomforestclassifier__n_estimators=[100, 150],
    # randomforestclassifier__max_features=['sqrt', 0.33],
    randomforestclassifier__min_samples_leaf=[1, 3],
    # randomforestclassifier__criterion=['gini', 'entropy'],
    randomforestclassifier__min_samples_split=[2, 3],
)

lr_grid = dict(
    logisticregression__C=[0.1, 1],
    logisticregression__solver=['lbfgs', 'saga'],
)


In [9]:
# Collect the per-model grids under the same short keys that the
# `pipelines` dictionary uses ('lr', 'svm', 'rf', 'xgb').
hp_grids = dict(lr=lr_grid, svm=svm_grid, rf=rf_grid, xgb=xgb_grid)

## Modelling Loop

In [10]:
# Evaluation dataframes, one per (train, test) embeddings folder pair.
# Keys have the form 'env_<pretrained model>_<pooling>'.
# NOTE(review): the 'env_' prefix is kept unchanged because a later cell
# looks frames up by these keys (e.g. df_models['env_mt_avg']).
df_models = {}

# The same FASTA pair is used for every embedding variant:
# index 0 - train, index 1 - test.
train_fa, test_fa = fa_files[0]

for train_pt, test_pt in pt_folders:

    # Train
    # The pooling operation is the last '_'-separated token of the folder name.
    pool = os.path.split(train_pt)[1].split('_')[-1]
    # 33 for ESM folders, the literal 'layer' otherwise; handed through to
    # fu.read_embeddings as is.
    emb_layer = 33 if 'esm' in train_pt else 'layer'
    X_train, y_train, sequence_id_train = fu.read_embeddings(
        train_fa, train_pt, pool, emb_layer, print_dims=False)

    # Test
    # The test ids get their own name; previously they were assigned to
    # `sequence_id_train`, silently overwriting the training ids.
    X_test, y_test, sequence_id_test = fu.read_embeddings(
        test_fa, test_pt, pool, emb_layer, print_dims=False)

    # Extension for the evaluation dataframe key: the folder name with its
    # first two '_'-separated tokens removed (e.g. 'acp_test_esm1b_mean'
    # -> 'esm1b_mean').
    df_ext = os.path.split(test_pt)[1].split('_', 2)[2]

    # Printing header
    ptm = df_ext.split('_')[0]
    ptr = 'ESM' if 'esm' in ptm else 'ProSE'
    print('-' * 75)
    print(f'\tPretrained Model "{ptm}" by {ptr} - Pooling Operation: "{pool}"')
    print('-' * 75)

    # Grid search and fit
    # NOTE(review): the test-folder path is passed here, exactly as the
    # original code did (its path variable had been rebound to the test
    # folder by this point). Confirm what fit_tune_CV uses the path for
    # before switching it to the train folder.
    fitted_models, cv_results = mu.fit_tune_CV(
        pipelines, hp_grids, 'accuracy', test_pt, X_train, y_train, task)

    # Evaluation on the held-out test embeddings
    df_models[f'env_{df_ext}'] = mu.evaluation(fitted_models, X_test, y_test)
  

---------------------------------------------------------------------------
	Pretrained Model "esm1b" by ESM - Pooling Operation: "mean"
---------------------------------------------------------------------------
esm1b_mean_xgb has been fitted and saved
esm1b_mean_lr has been fitted and saved
esm1b_mean_svm has been fitted and saved
esm1b_mean_rf has been fitted and saved
---------------------------------------------------------------------------
	Pretrained Model "esm1v" by ESM - Pooling Operation: "mean"
---------------------------------------------------------------------------
esm1v_mean_xgb has been fitted and saved
esm1v_mean_lr has been fitted and saved
esm1v_mean_svm has been fitted and saved
esm1v_mean_rf has been fitted and saved
---------------------------------------------------------------------------
	Pretrained Model "dlm" by ProSE - Pooling Operation: "avg"
---------------------------------------------------------------------------
dlm_avg_xgb has been fitted and saved


In [12]:
# List the keys under which the per-embedding evaluation frames were stored.
df_models.keys()

dict_keys(['env_esm1b_mean', 'env_esm1v_mean', 'env_dlm_avg', 'env_dlm_max', 'env_dlm_sum', 'env_mt_avg', 'env_mt_max', 'env_mt_sum'])

In [13]:
# Inspect one evaluation frame as a spot check (key 'env_mt_avg').
df_models['env_mt_avg']

Unnamed: 0_level_0,best_score,f1_macro,accuracy
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
mt_avg_xgb,0.707531,0.703478,0.703488
mt_avg_lr,0.741665,0.735445,0.735465
mt_avg_svm,0.754003,0.74692,0.747093
mt_avg_rf,0.664746,0.706095,0.706395


## Collecting Evaluation Results into a DataFrame

#### Merge all dataframes from the dictionary `df_models`

In [15]:
# Create one dataframe with the evaluations of all models.
#
# Each frame in `df_models` has its index moved into a regular column, all
# frames are stacked with a single pd.concat call, and the 'model' column is
# then set as the index of the combined frame.
# Concatenating once avoids re-copying the accumulated rows on every loop
# iteration and avoids seeding the result with an empty DataFrame.
eval_frames = [eval_df.reset_index() for eval_df in df_models.values()]
eval_df_all = pd.concat(eval_frames).set_index('model')

#### Display the results sorted by "accuracy"

In [19]:
# Show all models ranked by the 'accuracy' column, highest first.
eval_df_all.sort_values('accuracy', ascending=False)

Unnamed: 0_level_0,best_score,f1_macro,accuracy
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dlm_avg_svm,0.757621,0.749966,0.75
mt_avg_svm,0.754003,0.74692,0.747093
dlm_max_svm,0.766321,0.741277,0.741279
dlm_sum_xgb,0.7119,0.734961,0.735465
mt_avg_lr,0.741665,0.735445,0.735465
dlm_avg_xgb,0.702466,0.732477,0.732558
dlm_avg_lr,0.743839,0.732477,0.732558
esm1b_mean_svm,0.743837,0.729466,0.729651
esm1v_mean_svm,0.729318,0.729136,0.729651
mt_sum_xgb,0.706819,0.725622,0.726744


# TODO: save the combined evaluation DataFrame (`eval_df_all`) to disk