## DBP - Testing Modelling

In [29]:
import os
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import GridSearchCV, train_test_split,  StratifiedKFold

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
# To visualize pipeline diagram - 'text', or 'diagram'
from sklearn import set_config

# Import XGBoost
from xgboost import XGBClassifier

import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd
import seaborn as sns
# Classification metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score

# Seed passed as `random_state` to every classifier in the pipelines defined below.
random_state = 10

In [2]:
# Import the script from different folder
import sys  
sys.path.append('../../scripts')

import file_utilities as fu
import modelling_utilities as mu

#### Set the task

In [3]:
# Prediction task to run - one of: 'acp', 'amp', 'dna_binding'
task = "dna_binding"


#### Get embedding folders and FASTA files for the task.

In [4]:
# Look up the embedding folders and FASTA files that belong to the selected task.
pt_folders, fa_files = mu.get_emb_folders(task)

In [5]:
# (train, test) embedding-folder pairs, one tuple per pretrained model / pooling combination.
pt_folders

[('../../data/dna_binding/esm/train/dbp_train_esm1b_mean',
  '../../data/dna_binding/esm/test/dbp_test_esm1b_mean'),
 ('../../data/dna_binding/esm/train/dbp_train_esm1v_mean',
  '../../data/dna_binding/esm/test/dbp_test_esm1v_mean'),
 ('../../data/dna_binding/prose/train/dbp_train_dlm_avg',
  '../../data/dna_binding/prose/test/dbp_test_dlm_avg'),
 ('../../data/dna_binding/prose/train/dbp_train_dlm_max',
  '../../data/dna_binding/prose/test/dbp_test_dlm_max'),
 ('../../data/dna_binding/prose/train/dbp_train_dlm_sum',
  '../../data/dna_binding/prose/test/dbp_test_dlm_sum'),
 ('../../data/dna_binding/prose/train/dbp_train_mt_avg',
  '../../data/dna_binding/prose/test/dbp_test_mt_avg'),
 ('../../data/dna_binding/prose/train/dbp_train_mt_max',
  '../../data/dna_binding/prose/test/dbp_test_mt_max'),
 ('../../data/dna_binding/prose/train/dbp_train_mt_sum',
  '../../data/dna_binding/prose/test/dbp_test_mt_sum')]

In [6]:
# (train, test) FASTA file pairs: index 0 is the ESM pair, index 1 the ProSE pair.
fa_files

[('../../data/dna_binding/train_esm.fa', '../../data/dna_binding/test_esm.fa'),
 ('../../data/dna_binding/train_prose.fa',
  '../../data/dna_binding/test_prose.fa')]

### Define Pipelines

In [9]:
# Baseline pipelines (no dimensionality reduction): every classifier is
# preceded by its own MinMaxScaler and seeded with the shared random_state.
# NOTE: the following cell rebinds `pipelines` to the PCA variants.
pipelines = {
    model_key: make_pipeline(MinMaxScaler(), classifier)
    for model_key, classifier in (
        ('xgb', XGBClassifier(random_state=random_state)),
        ('lr', LogisticRegression(max_iter=25000, random_state=random_state)),
        ('svm', SVC(random_state=random_state)),
        ('rf', RandomForestClassifier(random_state=random_state)),
    )
}

In [31]:
# Number of principal components kept after scaling.
num_pca_components = 1000

# PCA variants of the pipelines: MinMaxScaler -> PCA -> classifier.
# This rebinds `pipelines`, replacing the four no-PCA pipelines defined in the
# previous cell; only 'xgb' and 'lr' are evaluated with PCA.
#
# PCA is seeded as well: with the default svd_solver='auto' scikit-learn may
# pick the randomized SVD solver, which is stochastic unless random_state is
# set, so leaving it unseeded would make the results non-reproducible even
# though the classifiers themselves are seeded.
pipelines = {
    'xgb' : make_pipeline(MinMaxScaler(),
                          PCA(n_components=num_pca_components,
                              random_state=random_state),
                          XGBClassifier(random_state=random_state)),
    'lr' : make_pipeline(MinMaxScaler(),
                         PCA(n_components=num_pca_components,
                             random_state=random_state),
                         LogisticRegression(max_iter=25000, random_state=random_state))

}

### Define Hyperparameter Grids

In [32]:
# Search spaces for GridSearchCV. Keys follow the make_pipeline convention
# '<lowercased step class name>__<parameter>'.

# XGBoost: tree depth and number of boosting rounds.
xgb_grid = dict(
    xgbclassifier__max_depth=[3, 5],
    xgbclassifier__n_estimators=[100, 200],
)

# SVM: kernel type and regularisation strength.
# Not searched at the moment: svc__gamma in [5, 1, 0.1, 0.01]
svm_grid = dict(
    svc__kernel=['linear', 'rbf'],
    svc__C=[0.01, 0.1, 1],
)

# Random forest: forest size and leaf / split thresholds.
# Not searched at the moment:
#   randomforestclassifier__max_features in ['sqrt', 0.33]
#   randomforestclassifier__criterion in ['gini', 'entropy']
rf_grid = dict(
    randomforestclassifier__n_estimators=[100, 150],
    randomforestclassifier__min_samples_leaf=[1, 3],
    randomforestclassifier__min_samples_split=[2, 3],
)

# Logistic regression: inverse regularisation strength and solver.
lr_grid = dict(
    logisticregression__C=[0.1, 1],
    logisticregression__solver=['lbfgs', 'saga'],
)


In [33]:
# Map each pipeline key to its hyperparameter grid.
# NOTE(review): grids for 'svm' and 'rf' are included although the PCA
# `pipelines` dict only defines 'xgb' and 'lr'; presumably mu.fit_tune_CV
# looks grids up by pipeline key and ignores the extras - confirm.
hp_grids = dict(
    lr=lr_grid,
    svm=svm_grid,
    rf=rf_grid,
    xgb=xgb_grid,
)

## Modelling Loop

In [34]:
# Evaluation dataframes, one per embeddings folder pair (train + test),
# keyed as 'env_<pretrained model>_<pooling>'.
df_models = {}

# Only the embedding sets at indices 1 and 2 of `pt_folders` are run here.
# To run every embedding set use: for i in range(len(pt_folders)):
for i in range(1, 3):

    # Train
    # second index: 0 - train, 1 - test
    path_pt = pt_folders[i][0]
    # Pick the FASTA pair that matches the embedding family (0 - ESM, 1 - ProSE).
    fa_idx = 0 if 'esm' in path_pt else 1
    path_fa = fa_files[fa_idx][0]
    # The pooling operation is the last '_'-separated token of the folder name.
    pool = os.path.split(path_pt)[1].split('_')[-1]
    # NOTE(review): 33 / 'layer' are handed straight to fu.read_embeddings;
    # their exact meaning is defined there - confirm.
    emb_layer = 33 if 'esm' in path_pt else 'layer'
    X_train, y_train, sequence_id_train = fu.read_embeddings(
        path_fa, path_pt, pool, emb_layer, print_dims=False)

    # Test
    path_fa, path_pt = fa_files[fa_idx][1], pt_folders[i][1]
    # The test sequence IDs get their own name; they were previously written
    # into `sequence_id_train`, silently replacing the training IDs.
    X_test, y_test, sequence_id_test = fu.read_embeddings(
        path_fa, path_pt, pool, emb_layer, print_dims=False)

    # Extensions for evaluations dataframes: drop the first two '_'-separated
    # tokens of the folder name (e.g. 'dbp_test_esm1v_mean' -> 'esm1v_mean').
    df_ext = os.path.split(path_pt)[1].split('_', 1)[1].split('_', 1)[1]

    # Printing header
    ptm = df_ext.split('_')[0]
    ptr = 'ESM' if 'esm' in ptm else 'ProSE'
    print('-' * 75)
    print(f'\tPretrained Model "{ptm}" by {ptr} - Pooling Operation: "{pool}"')
    print('-' * 75)

    # Grid search and fit
    # NOTE(review): `path_pt` points at the *test* folder at this point even
    # though the training data is being fitted; presumably fit_tune_CV only
    # uses it to name the saved models - confirm before changing.
    fitted_models, cv_results = mu.fit_tune_CV(
        pipelines, hp_grids, 'accuracy', path_pt, X_train, y_train, task)

    # Evaluation on the held-out test embeddings
    df_models[f'env_{df_ext}'] = mu.evaluation(fitted_models, X_test, y_test)
  

---------------------------------------------------------------------------
	Pretrained Model "esm1v" by ESM - Pooling Operation: "mean"
---------------------------------------------------------------------------
esm1v_mean_xgb has been fitted and saved
esm1v_mean_lr has been fitted and saved
---------------------------------------------------------------------------
	Pretrained Model "dlm" by ProSE - Pooling Operation: "avg"
---------------------------------------------------------------------------
dlm_avg_xgb has been fitted and saved
dlm_avg_lr has been fitted and saved


In [35]:
# Show every evaluation dataframe collected by the modelling loop.
df_models

{'env_esm1v_mean':                 best_score  f1_macro  accuracy
 model                                         
 esm1v_mean_xgb    0.930883  0.841339  0.842864
 esm1v_mean_lr     0.931950  0.833042  0.833253,
 'env_dlm_avg':              best_score  f1_macro  accuracy
 model                                      
 dlm_avg_xgb    0.905751  0.857198  0.859155
 dlm_avg_lr     0.904895  0.825514  0.826585}

In [36]:
# Evaluation table for the esm1v / mean-pooled embeddings.
df_models['env_esm1v_mean']

Unnamed: 0_level_0,best_score,f1_macro,accuracy
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
esm1v_mean_xgb,0.930883,0.841339,0.842864
esm1v_mean_lr,0.93195,0.833042,0.833253


In [37]:
# Evaluation table for the dlm / avg-pooled embeddings.
df_models['env_dlm_avg']

Unnamed: 0_level_0,best_score,f1_macro,accuracy
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dlm_avg_xgb,0.905751,0.857198,0.859155
dlm_avg_lr,0.904895,0.825514,0.826585


## Collecting Evaluation Results into a DataFrame

#### Merge all dataframes from the dictionary `df_models`

In [38]:
# Create dataframe with evaluations for all models

# Turn each per-embedding dataframe's 'model' index back into a column, then
# stack all of them with a single concat call. Concatenating once avoids
# growing a dataframe inside a loop (repeated copying) and avoids
# concatenating with an initially empty DataFrame.
# Raises ValueError if `df_models` is empty (nothing to concatenate).
eval_frames = [eval_df.reset_index() for eval_df in df_models.values()]
eval_df_all = pd.concat(eval_frames)

# Set the column 'model' as an index
eval_df_all = eval_df_all.set_index('model')

#### Display the results.

In [39]:
# Display the combined evaluation table: one row per fitted model across all embedding sets.
eval_df_all

Unnamed: 0_level_0,best_score,f1_macro,accuracy
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
esm1v_mean_xgb,0.930883,0.841339,0.842864
esm1v_mean_lr,0.93195,0.833042,0.833253
dlm_avg_xgb,0.905751,0.857198,0.859155
dlm_avg_lr,0.904895,0.825514,0.826585


In [27]:
# NOTE(review): this cell carries execution count In [27], i.e. it was run
# before the PCA pipelines cell (In [31]) and the merge cell (In [38]); the
# values shown in its output are therefore presumably from the no-PCA
# `pipelines` run. On a fresh Restart & Run All this assignment would instead
# bind the PCA results under the "nopca" name - confirm and regenerate this
# snapshot from an explicit no-PCA run.
eval_df_all_nopca = eval_df_all
eval_df_all_nopca

Unnamed: 0_level_0,best_score,f1_macro,accuracy
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
esm1v_mean_xgb,0.93523,0.84694,0.84815
esm1v_mean_lr,0.933399,0.836427,0.836617
dlm_avg_xgb,0.915169,0.865985,0.867518
dlm_avg_lr,0.910959,0.832193,0.833627
