## DBP - Modelling

First we will import all dependencies.

In [1]:
import os
import pandas as pd

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
# To visualize pipeline diagram - 'text', or 'diagram'
from sklearn import set_config
# Import XGBoost
from xgboost import XGBClassifier

# Seed passed as `random_state` to every classifier in the pipelines defined below
random_state = 10

In [2]:
# Import the script from different folder
import sys  
sys.path.append('../../scripts')

import file_utilities as fu
import modelling_utilities as mu

#### Set one of three project tasks (*acp*, *amp*, *dna_binding*)

In [3]:
# task - one of ['acp', 'amp', 'dna_binding']
# The value selects the data folder (../../data/<task>/) and the name of the results file.
task = 'dna_binding'

### Define Pipelines

Define pipelines for ML algorithms.  
As preprocessing steps we will use `MinMaxScaler()` and `PCA()`.

In [4]:
# Define number of PCA components
num_pca_components = 1000


def _build_pipeline(estimator):
    """Wrap `estimator` in the preprocessing shared by all models.

    Steps: MinMaxScaler -> PCA(num_pca_components) -> estimator.
    Each call creates fresh scaler/PCA instances, so pipelines share no state.
    Step names generated by `make_pipeline` (e.g. 'pca', 'svc') are unchanged,
    so the hyperparameter grid keys keep working.
    """
    return make_pipeline(
        MinMaxScaler(),
        # Seed PCA too: with svd_solver='auto' it may use the randomized solver,
        # which would otherwise make results differ between runs.
        PCA(num_pca_components, random_state=random_state),
        estimator,
    )


# One pipeline per algorithm; keys must match the keys of the hyperparameter grids
pipelines = {
    'xgb': _build_pipeline(XGBClassifier(random_state=random_state)),
    'lr': _build_pipeline(LogisticRegression(max_iter=20000, random_state=random_state)),
    'svm': _build_pipeline(SVC(random_state=random_state)),
    'rf': _build_pipeline(RandomForestClassifier(random_state=random_state)),
}

### Define Hyperparameter Grids

Define hyperparameter grids for chosen ML algorithms.

In [5]:
# Grid keys follow the `<pipeline step name>__<parameter>` convention of make_pipeline.

# XGBoost: tree depth and number of boosting rounds
xgb_grid = {
    'xgbclassifier__max_depth': [3, 5],
    'xgbclassifier__n_estimators': [100, 200],
}

# SVC: kernel type and regularization strength
svm_grid = {
    'svc__kernel': ['linear', 'rbf'],
    'svc__C': [0.01, 0.1, 1],
}

# Random Forest: forest size and leaf/split constraints
rf_grid = {
    'randomforestclassifier__n_estimators': [100, 150],
    'randomforestclassifier__min_samples_leaf': [1, 3],
    'randomforestclassifier__min_samples_split': [2, 3],
}

# Logistic Regression: regularization strength and solver
lr_grid = {
    'logisticregression__C': [0.1, 1],
    'logisticregression__solver': ['lbfgs', 'saga'],
}

#### Create dictionary for hyperparameter grids

In [6]:
# Map each algorithm key (same keys as in `pipelines`) to its hyperparameter grid
hp_grids = dict(
    lr=lr_grid,
    svm=svm_grid,
    rf=rf_grid,
    xgb=xgb_grid,
)

### Get embedding folders and fasta files for the task.

For our modelling we will need to use previously created fasta files:

In [7]:
!tree -nhDL 1 ../../data/"{task}"/ -fP *.fa | grep fa

├── [781K Sep  8 12:05]  ../../data/dna_binding/test_esm.fa
├── [1.0M Sep  3 22:42]  ../../data/dna_binding/test_prose.fa
├── [4.7M Sep  7 17:51]  ../../data/dna_binding/train_esm.fa
└── [6.0M Sep  3 22:39]  ../../data/dna_binding/train_prose.fa


<br>

and embedding `.pt` files in these folders:

In [8]:
# List directories only (-d), up to 3 levels deep, keeping paths that match esm1, mt or dlm
!tree -nhDL 3 ../../data/"{task}"/ -df | grep 'esm1\|mt\|dlm'

│   │   ├── [4.0K Sep  8 12:45]  ../../data/dna_binding/esm/test/dbp_test_esm1b_mean
│   │   └── [4.0K Sep  8 12:36]  ../../data/dna_binding/esm/test/dbp_test_esm1v_mean
│       ├── [4.0K Sep  7 19:27]  ../../data/dna_binding/esm/train/dbp_train_esm1b_mean
│       └── [4.0K Sep  7 18:42]  ../../data/dna_binding/esm/train/dbp_train_esm1v_mean
    │   ├── [4.0K Sep  6 10:41]  ../../data/dna_binding/prose/test/dbp_test_dlm_avg
    │   ├── [4.0K Sep  6 10:55]  ../../data/dna_binding/prose/test/dbp_test_dlm_max
    │   ├── [4.0K Sep  6 11:10]  ../../data/dna_binding/prose/test/dbp_test_dlm_sum
    │   ├── [4.0K Sep  6 11:33]  ../../data/dna_binding/prose/test/dbp_test_mt_avg
    │   ├── [4.0K Sep  6 13:32]  ../../data/dna_binding/prose/test/dbp_test_mt_max
    │   └── [4.0K Sep  6 13:59]  ../../data/dna_binding/prose/test/dbp_test_mt_sum
        ├── [4.0K Sep  5 17:52]  ../../data/dna_binding/prose/train/dbp_train_dlm_avg
        ├── [4.0K Sep  5 19:49]  ../../data/dna_binding/prose/train/d

<br>  

To get paths for embedding folders and fasta files we will use the function `get_emb_folders()`.

In [9]:
# As indexed by the modelling loop below:
#   pt_folders[i][0] / pt_folders[i][1] - train / test embedding folder of the i-th embedding set
#   fa_files[0] / fa_files[1]           - (train, test) fasta files for ESM / ProSE
pt_folders, fa_files = mu.get_emb_folders(task)

## Modelling Loop

The modelling loop includes the following steps:

1. Loop through train and test embedding folders
2. Run the function `read_embeddings()` for train embeddings to get `X_train` and `y_train`
3. Run the function `read_embeddings()` for test embeddings to get `X_test` and `y_test`
4. Define and print the output header
5. Use the function `fit_tune_CV()` to do the following:
   - use above defined `pipelines` and `hp_grids` dictionaries and `GridSearchCV()` to get models
   - save the models with `joblib`
   - create a dictionary of the models for one set of embedding folders
6. Run the function `evaluation()` to create an evaluation dataframe for one set of embedding folders


In [10]:
# Initialize dictionary to keep evaluation dataframes
# One dataframe per embeddings folder (train+test, or all_data)
df_models = {}

for folder_pair in pt_folders:

    # Each entry holds the train folder at index 0 and the test folder at index 1
    path_pt_train, path_pt_test = folder_pair[0], folder_pair[1]

    # Different fasta files for ESM and ProSE
    # Fasta files index: esm - 0, prose - 1
    is_esm = 'esm' in path_pt_train
    fa_idx = 0 if is_esm else 1
    path_fa_train, path_fa_test = fa_files[fa_idx][0], fa_files[fa_idx][1]

    # Pooling operation is the last '_'-separated token of the folder name
    pool = os.path.basename(path_pt_train).split('_')[-1]
    emb_layer = 33 if is_esm else 'layer'

    # Train
    X_train, y_train, sequence_id_train = fu.read_embeddings(
        path_fa_train, path_pt_train, pool, emb_layer, print_dims=False)

    # Test - sequence IDs get their own name so the train IDs are not overwritten
    X_test, y_test, sequence_id_test = fu.read_embeddings(
        path_fa_test, path_pt_test, pool, emb_layer, print_dims=False)

    # Extension for the evaluation dataframe key: the folder name without its
    # first two '_'-separated tokens, e.g. 'dbp_test_esm1b_mean' -> 'esm1b_mean'
    df_ext = os.path.basename(path_pt_test).split('_', 1)[1].split('_', 1)[1]

    # Printing output header
    ptm = df_ext.split('_')[0]
    ptr = 'ESM' if 'esm' in ptm else 'ProSE'
    print('-' * 75)
    print(f'\tPretrained Model "{ptm}" by {ptr} - Pooling Operation: "{pool}"')
    print('-' * 75)

    # Grid search and fit
    # NOTE(review): the test folder path is passed here, exactly as before this
    # rewrite (the path variable had been rebound to the test folder by this
    # point) - confirm that fit_tune_CV only uses it to name the saved models.
    fitted_models = mu.fit_tune_CV(pipelines, hp_grids, 'accuracy', path_pt_test, X_train, y_train, task)

    # Save evaluation dataframe into dictionary
    df_models[f'eval_{df_ext}'] = mu.evaluation(fitted_models, X_test, y_test)
  

---------------------------------------------------------------------------
	Pretrained Model "esm1b" by ESM - Pooling Operation: "mean"
---------------------------------------------------------------------------
esm1b_mean_xgb has been fitted and saved
esm1b_mean_lr has been fitted and saved
esm1b_mean_svm has been fitted and saved
esm1b_mean_rf has been fitted and saved
---------------------------------------------------------------------------
	Pretrained Model "esm1v" by ESM - Pooling Operation: "mean"
---------------------------------------------------------------------------
esm1v_mean_xgb has been fitted and saved
esm1v_mean_lr has been fitted and saved
esm1v_mean_svm has been fitted and saved
esm1v_mean_rf has been fitted and saved
---------------------------------------------------------------------------
	Pretrained Model "dlm" by ProSE - Pooling Operation: "avg"
---------------------------------------------------------------------------
dlm_avg_xgb has been fitted and saved




dlm_max_lr has been fitted and saved
dlm_max_svm has been fitted and saved
dlm_max_rf has been fitted and saved
---------------------------------------------------------------------------
	Pretrained Model "dlm" by ProSE - Pooling Operation: "sum"
---------------------------------------------------------------------------
dlm_sum_xgb has been fitted and saved




dlm_sum_lr has been fitted and saved
dlm_sum_svm has been fitted and saved
dlm_sum_rf has been fitted and saved
---------------------------------------------------------------------------
	Pretrained Model "mt" by ProSE - Pooling Operation: "avg"
---------------------------------------------------------------------------
mt_avg_xgb has been fitted and saved
mt_avg_lr has been fitted and saved
mt_avg_svm has been fitted and saved
mt_avg_rf has been fitted and saved
---------------------------------------------------------------------------
	Pretrained Model "mt" by ProSE - Pooling Operation: "max"
---------------------------------------------------------------------------
mt_max_xgb has been fitted and saved




mt_max_lr has been fitted and saved
mt_max_svm has been fitted and saved
mt_max_rf has been fitted and saved
---------------------------------------------------------------------------
	Pretrained Model "mt" by ProSE - Pooling Operation: "sum"
---------------------------------------------------------------------------
mt_sum_xgb has been fitted and saved
mt_sum_lr has been fitted and saved




mt_sum_svm has been fitted and saved
mt_sum_rf has been fitted and saved


<br>

Let's list the keys of the `df_models` dictionary:

In [11]:
# Keys are built in the modelling loop as f'eval_{df_ext}'
df_models.keys()

dict_keys(['eval_esm1b_mean', 'eval_esm1v_mean', 'eval_dlm_avg', 'eval_dlm_max', 'eval_dlm_sum', 'eval_mt_avg', 'eval_mt_max', 'eval_mt_sum'])

<br>
Let's check the dataframe for a randomly chosen key (one set of embedding folders):

In [12]:
import random
# random.randint(a, b) includes b, so randint(0, len(df_models)) could index one
# past the end and raise IndexError; random.choice always picks an existing key.
df_models[random.choice(list(df_models))]

Unnamed: 0_level_0,cv_best_score,f1_macro,accuracy
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
mt_max_xgb,0.918308,0.880322,0.881602
mt_max_lr,0.915098,0.827865,0.828345
mt_max_svm,0.917524,0.847299,0.847711
mt_max_rf,0.914384,0.850791,0.852993


## Collecting Evaluation Results into a DataFrame

To compare evaluations for all models, collect results from all dataframes into one dataframe.  

To do that, we will merge all dataframes from the dictionary `df_models`.

In [13]:
# Create dataframe with evaluations for all models

# Concatenate every evaluation dataframe from df_models in a single call
# (growing a frame with pd.concat inside a loop re-copies all rows each time).
# reset_index() already returns a new frame, so the stored dataframes are not
# modified; it moves the 'model' index into a column for the concatenation.
eval_df_all = pd.concat(
    [eval_df.reset_index() for eval_df in df_models.values()]
)

# Set the column 'model' as an index
eval_df_all = eval_df_all.set_index('model')

#### Display the results sorted by "accuracy"

In [14]:
# Show all models ordered from the highest to the lowest value of the 'accuracy' column
eval_df_all.sort_values('accuracy', ascending=False)

Unnamed: 0_level_0,cv_best_score,f1_macro,accuracy
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dlm_max_rf,0.906037,0.882461,0.883363
mt_max_xgb,0.918308,0.880322,0.881602
dlm_max_xgb,0.913171,0.875029,0.87632
esm1b_mean_svm,0.95148,0.871616,0.871696
mt_avg_xgb,0.91417,0.868554,0.870158
mt_sum_xgb,0.913385,0.865448,0.867077
dlm_avg_rf,0.883705,0.864151,0.865757
dlm_avg_xgb,0.905822,0.858953,0.860915
dlm_sum_xgb,0.902968,0.856583,0.858715
esm1v_mean_svm,0.944309,0.85709,0.85728


### Saving dataframe for future use

In [15]:
# 'dna_binding' is abbreviated to 'DBP'; any other task name is upper-cased
TASK = 'DBP' if task == 'dna_binding' else task.upper()
file_path = f'../../results/{TASK}_classifiers.csv'
# Create the results folder if it is missing so to_csv() does not fail on a fresh checkout
os.makedirs(os.path.dirname(file_path), exist_ok=True)
# The 'model' index is written as the first CSV column
eval_df_all.to_csv(file_path)

<br>

When you need to work with the results from that file, read it with the parameter `index_col=`:   
```python
df = pd.read_csv(file_path, index_col='model')
```