# **Example notebook for predicting substrate tolerance**

**This notebook demonstrates how to use LassoESM embeddings to predict lasso peptide substrate tolerance. As an example, we use a dataset of 1,121 fusilassin variant sequences, which is stored in the `data/data_for_substrate_tolerance_prediction` folder.**

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

**Data Loading**

In [2]:
# Read the precomputed embedding matrix (LassoESM is used here as the example
# representation) and the matching dataset table.
Xs = np.load("../data/data_for_substrate_tolerance_prediction/FusA_LassoESM.npy")
data = pd.read_csv("../data/data_for_substrate_tolerance_prediction/FusA_tolerance_dataset.csv")

# The second column of the table is used as the prediction target.
ys = data.iloc[:, 1].tolist()

**Run GridSearch**

In [3]:
# Search space for the grid search: one (label, estimator class, grid) entry per model.
# Every grid key is prefixed with 'classifier__', the name of the pipeline step
# that wraps the estimator during the search.
search_space = [
    ('RF', RandomForestClassifier, {
        'classifier__n_estimators': [20, 50, 100, 200],
        'classifier__max_depth': [10, 20, 50, 100],
        'classifier__max_features': ('sqrt', 'log2'),
    }),
    ('AdaBoost', AdaBoostClassifier, {
        'classifier__n_estimators': [20, 50, 100, 200],
        'classifier__learning_rate': [0.1, 1, 5, 10],
    }),
    ('SVC', SVC, {
        'classifier__kernel': ('linear', 'rbf', 'sigmoid', 'poly'),
        'classifier__C': [0.1, 1, 10],
    }),
    ('MLP', MLPClassifier, {
        'classifier__hidden_layer_sizes': [32, 64, 128, 256, 512, (512, 64), (256, 32), (128, 32)],
        'classifier__batch_size': [16, 32],
        'classifier__learning_rate_init': [0.01, 0.001],
        'classifier__max_iter': [1000],
        'classifier__early_stopping': [True],
    }),
]

# Parallel lists consumed by the grid-search loop; all three share the order above.
model_list = [estimator_cls for label, estimator_cls, grid in search_space]
model_names = [label for label, estimator_cls, grid in search_space]
parameters_list = [grid for label, estimator_cls, grid in search_space]

In [4]:
best_params = {}
# Fixed seed so that the stochastic estimators (RF, AdaBoost, MLP) give the same
# search result on every Restart & Run All.
GRID_SEARCH_SEED = 42

# Perform Grid Search for Each Model
for model, name, parameters in zip(model_list, model_names, parameters_list):
    estimator = model()
    # Seed only estimators that expose a random_state parameter, so that a
    # deterministic model added to model_list later does not break the loop.
    if 'random_state' in estimator.get_params():
        estimator.set_params(random_state=GRID_SEARCH_SEED)

    # The step name 'classifier' must match the 'classifier__' prefix of the grid keys.
    pipeline = Pipeline([('classifier', estimator)])
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=10, scoring='balanced_accuracy')
    grid_search.fit(Xs, ys)

    # best_params_ only contains the searched keys, so the seed set above is not
    # stored here; strip the pipeline prefix so the values can be passed straight
    # to the estimator constructor.
    best_params[name] = {k.replace('classifier__', ''): v for k, v in grid_search.best_params_.items()}
    print(f'Best parameters for {name}: {best_params[name]}')
    print(f'Best model for {name}: {grid_search.best_estimator_}')

Best parameters for RF: {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 50}
Best model for RF: Pipeline(steps=[('classifier',
                 RandomForestClassifier(max_depth=10, n_estimators=50))])
Best parameters for AdaBoost: {'learning_rate': 0.1, 'n_estimators': 100}
Best model for AdaBoost: Pipeline(steps=[('classifier',
                 AdaBoostClassifier(learning_rate=0.1, n_estimators=100))])
Best parameters for SVC: {'C': 1, 'kernel': 'linear'}
Best model for SVC: Pipeline(steps=[('classifier', SVC(C=1, kernel='linear'))])
Best parameters for MLP: {'batch_size': 32, 'early_stopping': True, 'hidden_layer_sizes': (256, 32), 'learning_rate_init': 0.001, 'max_iter': 1000}
Best model for MLP: Pipeline(steps=[('classifier',
                 MLPClassifier(batch_size=32, early_stopping=True,
                               hidden_layer_sizes=(256, 32), max_iter=1000))])


**Model Evaluation**

In [5]:
# Define Function for Model Evaluation
def cv_res(Xs, ys, best_params, n_splits=10, n_repeats=5, random_seed=42):
    """Score the tuned classifiers with repeated k-fold cross-validation.

    Parameters
    ----------
    Xs : array-like
        Feature matrix passed unchanged to ``cross_val_score``.
    ys : array-like
        Target labels passed unchanged to ``cross_val_score``.
    best_params : dict
        Maps 'RF', 'AdaBoost', 'SVC' and 'MLP' to the keyword arguments used to
        build the corresponding classifier.
    n_splits : int, default 10
        Number of folds per repeat.
    n_repeats : int, default 5
        Number of times the k-fold split is repeated.
    random_seed : int, default 42
        Seed used for every classifier and for the cross-validation splitter.

    Returns
    -------
    dict
        Maps each model name (in the order RF, AdaBoost, SVC, MLP) to an array
        of length ``n_repeats`` holding the mean balanced accuracy of each repeat.
    """
    # Insertion order fixes the key order of the returned dict.
    estimators = {
        'RF': RandomForestClassifier(**best_params['RF'], random_state=random_seed),
        'AdaBoost': AdaBoostClassifier(**best_params['AdaBoost'], random_state=random_seed),
        'SVC': SVC(**best_params['SVC'], random_state=random_seed),
        'MLP': MLPClassifier(**best_params['MLP'], random_state=random_seed),
    }

    # An integer random_state makes the splitter produce the same folds for every
    # model, so all classifiers are compared on identical splits.
    cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_seed)

    results = {}
    for name, estimator in estimators.items():
        fold_scores = cross_val_score(estimator, Xs, ys, cv=cv, n_jobs=-1, scoring='balanced_accuracy')
        # Scores arrive repeat by repeat; one row per repeat, then average its folds.
        per_repeat = np.asarray(fold_scores).reshape(n_repeats, n_splits)
        results[name] = np.mean(per_repeat, axis=1)

    return results

In [6]:
# Run the repeated cross-validation for the tuned models and tabulate the scores
if __name__ == '__main__':
    res_LassoESM = cv_res(Xs, ys, best_params)
    print('Evaluation Done')

    # One column per classifier, plus a 'Model' column naming the embedding used.
    LassoESM = pd.DataFrame(res_LassoESM).assign(Model='Lasso_ESM')

Evaluation Done


In [7]:
# Show the evaluation table: one row per cross-validation repeat, one column per
# classifier (mean balanced accuracy), plus the 'Model' label column.
print(LassoESM)

         RF  AdaBoost       SVC       MLP      Model
0  0.719914  0.728364  0.738522  0.720585  Lasso_ESM
1  0.725885  0.732725  0.734655  0.718578  Lasso_ESM
2  0.725690  0.723805  0.732975  0.720056  Lasso_ESM
3  0.731835  0.737689  0.738182  0.715817  Lasso_ESM
4  0.728194  0.740278  0.742119  0.710994  Lasso_ESM
