In [1]:
# This notebook is adapted from the Jupyter notebooks accompanying Ricci-Lopez, Joel, et al.
# "Improving structure-based virtual screening with ensemble docking and machine learning."
# Journal of Chemical Information and Modeling 61.11 (2021): 5362-5376.
# The original notebooks can be found at https://github.com/jRicciL/ML-ensemble-docking
# For example, for CDK2, the notebooks in cdk2/5_Machine_Learning/ were adopted
# and modified here for classification evaluation.

In [2]:
import pandas as pd
import numpy as np
import sys
import random
import seaborn as sns
import glob
sys.path.append('./')
from helper_modules.run_or_load import run_or_load_joblib
from helper_modules.plotting_metrics import PlotMetric
from helper_modules.friedman_and_nemenyi_test import *
%run helper_modules/Helper_functions_for_nRepeats_x_kCV.ipynb
import warnings
warnings.filterwarnings('ignore')

  return warn(


## Construct the estimators

In [3]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression 
from xgboost import XGBClassifier


# Baseline: dummy classifier using scikit-learn's "stratified" strategy.
dclf = DummyClassifier(strategy="stratified")

# Logistic regression
# Parameters shared by CDK2 and FXA.
hyparams = {'C'       : 0.01,
            'penalty' : 'l2',
            'solver'  : 'lbfgs',
            'max_iter': 400}
lr_cdk2 = LogisticRegression(**hyparams)
lr_fxa = LogisticRegression(**hyparams)

# Parameters for HSP90 (only C differs from the CDK2/FXA settings).
# Kept under its own name so it cannot be confused with, or overwritten by,
# the XGBoost HSP90 settings defined further down in this cell.
hyparams_lr_hsp90 = {'C'       : 1.0,
                     'penalty' : 'l2',
                     'solver'  : 'lbfgs',
                     'max_iter': 400}
# FIX: this estimator used to be built from `hyparams`, so HSP90 silently ran
# with the CDK2/FXA value C=0.01 instead of the intended C=1.0.
# NOTE(review): the cross-validation results below are loaded from cache files
# ("File loaded: HSP90/..."); presumably those caches must be removed and the
# runs repeated for this fix to show up in the reported numbers.
lr_hsp90 = LogisticRegression(**hyparams_lr_hsp90)

# Gradient-boosted trees (XGBoost): one tuned configuration per target protein.

# CDK2
hyparams_cdk2 = dict(
    subsample=0.5,
    n_estimators=200,
    max_depth=20,
    learning_rate=0.05,
    alpha=0.01,
    gamma=0.01,
    colsample_bytree=0.5,
    eval_metric='logloss',
    use_label_encoder=False,
)
xgb_cdk2 = XGBClassifier(**hyparams_cdk2)

# FXA
hyparams_fxa = dict(
    subsample=0.5,
    n_estimators=200,
    max_depth=10,
    learning_rate=0.1,
    alpha=0.5,
    gamma=1,
    colsample_bytree=1,
    eval_metric='logloss',
    use_label_encoder=False,
)
xgb_fxa = XGBClassifier(**hyparams_fxa)

# HSP90
hyparams_hsp90 = dict(
    subsample=0.6,
    n_estimators=500,
    max_depth=5,
    learning_rate=0.05,
    gamma=0.01,
    colsample_bytree=0.5,
    alpha=0.1,
    eval_metric='logloss',
    use_label_encoder=False,
)
xgb_hsp90 = XGBClassifier(**hyparams_hsp90)

# Classifier registries, one per target protein.
# Each target has its own LR and XGBoost instance; the same dummy-classifier
# instance (`dclf`) is referenced by all three registries.

# CDK2
ml_classifiers_cdk2 = dict(ml_lr=lr_cdk2, ml_xgb=xgb_cdk2, ml_dclf=dclf)
estimators_cdk2 = dict(ml_classifiers_cdk2)  # shallow copy

# FXA
ml_classifiers_fxa = dict(ml_lr=lr_fxa, ml_xgb=xgb_fxa, ml_dclf=dclf)
estimators_fxa = dict(ml_classifiers_fxa)  # shallow copy

# HSP90
ml_classifiers_hsp90 = dict(ml_lr=lr_hsp90, ml_xgb=xgb_hsp90, ml_dclf=dclf)
estimators_hsp90 = dict(ml_classifiers_hsp90)  # shallow copy

# Lookup table: protein name -> its estimator registry
estimators = dict(
    CDK2=estimators_cdk2,
    FXA=estimators_fxa,
    HSP90=estimators_hsp90,
)

# Short display names used to rename the result columns
sbvs_names = dict(ml_lr='LR', ml_xgb='GBT', ml_dclf='DClf')

In [4]:
# Save the results to a file to avoid repeating the analysis
@run_or_load_joblib
def nk_rep_cross_validation_SAVE(filename, **kwargs):
    """Call `nk_rep_cross_validation(**kwargs)` under the `run_or_load_joblib` decorator.

    `filename` is not used in the body itself; presumably the decorator uses it
    as the path of the cached result (TODO confirm against
    helper_modules.run_or_load). All other keyword arguments are forwarded
    unchanged to `nk_rep_cross_validation`.
    """
    return nk_rep_cross_validation(**kwargs)

## 30x4 cross validation

In [5]:
# 30x4 cross validation for different estimators
def get_cv30x4(X, y, estimators, pro, cache_name, RANDOM_STATE=1):
    """Run (or load from cache) a 30-repeat x 4-fold cross validation.

    Parameters
    ----------
    X : feature matrix, forwarded unchanged to the cross-validation helper.
    y : activity labels; must support `.sum()` and `len()`. The fraction of
        actives `y.sum() / len(y)` (rounded to 4 decimals) is used as the
        `fraction` of the normalized enrichment-factor metric.
    estimators : dict mapping an estimator key to an estimator object.
    pro : protein name. Currently unused; kept so existing calls keep working.
    cache_name : forwarded as `filename` to `nk_rep_cross_validation_SAVE`.
    RANDOM_STATE : forwarded as `random_state` to the cross-validation helper.

    Returns
    -------
    The first object returned by `nk_rep_cross_validation_SAVE` (presumably a
    pandas DataFrame of per-split metric values - TODO confirm), with its
    columns renamed through `sbvs_names`.

    Raises
    ------
    ValueError
        If `y` is empty, because the ratio of actives is undefined.
    """
    n_samples = len(y)
    if n_samples == 0:
        # Without this guard the ratio below becomes NaN (or divides by zero)
        # and the failure only surfaces later, inside the CV helper.
        raise ValueError('Cannot run cross validation: `y` is empty.')

    R_a = round(y.sum() / n_samples, 4)
    print(f'Ratio of actives {y.sum()}/{len(y)}:', R_a)
    metrics = dict(
        # AUC-ROC
        roc_auc   = {'metric_name': 'roc_auc'},
        # Normalized Enrichment Factor
        nef_Ra = {'metric_name': 'ef', 
                  'fraction'   : R_a, 
                  'method'     : 'normalized'}
        )
    
    n_repeats = 30
    n_splits  = 4
    # Predictions and splits are requested (y_preds_return=True) to keep the
    # call identical to the one that produced the existing caches, but only
    # the metrics table is used here.
    cv30x4, _y_preds, _splits = nk_rep_cross_validation_SAVE(
        filename = cache_name,
        estimators = estimators, 
        X = X,
        y = y, 
        metrics   = metrics, 
        n_repeats = n_repeats, 
        n_splits  = n_splits,
        y_preds_return = True,
        random_state   = RANDOM_STATE
    )

    # Rename columns 
    return cv30x4.rename(columns = sbvs_names)

### Using the features from Ricci et al. 

In [6]:
# Evaluate every target with the original docking-score features.
for pro in ['CDK2', 'FXA', 'HSP90']:
    # glob returns matches in arbitrary order; sort so the selected file is
    # deterministic, and fail with a clear message when nothing matches
    # (the original indexing raised a bare IndexError).
    pattern = f'{pro}/df_DkSc*.pkl'
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError(f'No docking-score file matches {pattern!r}')
    file_name = matches[0]
    print('Input files',file_name)
    df_dk_res = pd.read_pickle(file_name)
    # Extract the features columns: Docking scores
    X = df_dk_res.drop('activity', axis = 1).values
    # Extract the response variable: Activity
    y = df_dk_res['activity'].values
    cache_name = pro+'/Ricci_original'
    estimator = estimators[pro]
    out = get_cv30x4(X, y, estimator, pro, cache_name)
    out.to_csv(f'{pro}/{pro}_Ricci_ML.csv')

Input files CDK2/df_DkSc_results_COCRYS_CSAR_DEKOIS_DUD.pkl
Ratio of actives 415/3466: 0.1197
File loaded: CDK2/Ricci_original
Input files FXA/df_DkSc_results_COCRYS_DEKOIS_DUD.pkl
Ratio of actives 300/6233: 0.0481
File loaded: FXA/Ricci_original
Input files HSP90/df_DkSc_results_COCRYS_DEKOIS_DUD.pkl
Ratio of actives 256/2302: 0.1112
File loaded: HSP90/Ricci_original


### Using features from 3T for 20A pocket

In [7]:
# Evaluate every target with the 3T features (20A pocket).
score_clip = 20.0  # features are clipped to [-score_clip, +score_clip]
for pro in ['CDK2', 'FXA', 'HSP90']:
    file_name = f'{pro}/{pro}_scrambled_training_data.pkl'
    print('Input files',file_name)
    df_dk_res = pd.read_pickle(file_name)
    # Cast to float and clip out of place, so the object loaded from the
    # pickle is not mutated (the original overwrote its entries in place
    # through boolean masks).
    X = df_dk_res['X'].astype(float).clip(-score_clip, score_clip)
    y = df_dk_res['y'].astype(float)
    cache_name = pro+'/3T'
    estimator = estimators[pro]
    out = get_cv30x4(X, y, estimator, pro, cache_name)
    out.to_csv(f'{pro}/{pro}_scrambled_3T_ML.csv')

Input files CDK2/CDK2_scrambled_training_data.pkl
Ratio of actives 442.0/3763: 0.1175
File loaded: CDK2/3T
Input files FXA/FXA_scrambled_training_data.pkl
Ratio of actives 289.0/7191: 0.0402
File loaded: FXA/3T
Input files HSP90/HSP90_scrambled_training_data.pkl
Ratio of actives 298.0/2452: 0.1215
File loaded: HSP90/3T


### Using features from 3T for 25A pocket in CDK2

In [8]:
# Evaluate CDK2 with the 3T features computed for the 25A pocket.
pro = 'CDK2'
score_clip = 20.0  # features are clipped to [-score_clip, +score_clip]
file_name = f'{pro}/{pro}_25A_scrambled_training_data.pkl'
print('Input files',file_name)
df_dk_res = pd.read_pickle(file_name)
# Cast to float and clip out of place, so the object loaded from the pickle
# is not mutated (the original overwrote its entries in place through
# boolean masks).
X = df_dk_res['X'].astype(float).clip(-score_clip, score_clip)
y = df_dk_res['y'].astype(float)
cache_name = pro+'/3T_25A'
estimator = estimators[pro]
out = get_cv30x4(X, y, estimator, pro, cache_name)
out.to_csv(f'{pro}/{pro}_25A_scrambled_3T_ML.csv')

Input files CDK2/CDK2_25A_scrambled_training_data.pkl
Ratio of actives 442.0/3764: 0.1174
File loaded: CDK2/3T_25A


### Using features from 3T for rigid pocket in HSP90

In [9]:
# Evaluate HSP90 with the 3T features computed for the rigid pocket.
pro = 'HSP90'
score_clip = 20.0  # features are clipped to [-score_clip, +score_clip]
file_name = f'{pro}/{pro}_rigid_scrambled_training_data.pkl'
print('Input files',file_name)
df_dk_res = pd.read_pickle(file_name)
# Cast to float and clip out of place, so the object loaded from the pickle
# is not mutated (the original overwrote its entries in place through
# boolean masks).
X = df_dk_res['X'].astype(float).clip(-score_clip, score_clip)
y = df_dk_res['y'].astype(float)
cache_name = pro+'/3T_rigid'
estimator = estimators[pro]
out = get_cv30x4(X, y, estimator, pro, cache_name)
out.to_csv(f'{pro}/{pro}_rigid_scrambled_3T_ML.csv')

Input files HSP90/HSP90_rigid_scrambled_training_data.pkl
Ratio of actives 298.0/2452: 0.1215
File loaded: HSP90/3T_rigid
