# Machine-learning classification models for ligand pose selection

Please cite: Tran-Nguyen, V.K. & Taboureau, O. ClassyPose: A Machine-Learning Classification Model for Correct Ligand Pose Selection.

The **protocol-env** environment has to be set up beforehand. To do this, please use the file **protocol-env.yml** in our **MLSF-protocol** repository: https://github.com/vktrannguyen/MLSF-protocol.

### Step 1: Importing all Python dependencies

In [None]:
import os
import numpy as np
import pandas as pd
import oddt
import oddt.pandas as opd

from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn import metrics
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import matthews_corrcoef, balanced_accuracy_score, recall_score, confusion_matrix

### Step 2: Loading CSV data files for training and test sets 

**1. For the pose selection/classification task**: where the **Real Class of the pose** (good or bad pose) is already known

In [None]:
#Read the training and test pose tables; the known class of each pose is stored
#in the 'Classification' column of both files
train_data = pd.read_csv(filepath_or_buffer = "Pathway_to_the_training_data_file_:_training_data_poses.csv")
Train_Class = train_data.loc[:, 'Classification']
test_data = pd.read_csv(filepath_or_buffer = "Pathway_to_the_test_data_file")
Test_Class = test_data.loc[:, 'Classification']

**2. For the virtual screening task**: where the **Real Class of the pose** (good or bad pose) is not known

Note: here we consider the **Real Class of the pose**, **not** the Real Class of the screened molecule (active or inactive/decoy).

In [None]:
#Read the training poses together with their known class ('Classification' column);
#the test poses are read without any class label
train_data = pd.read_csv(filepath_or_buffer = "Pathway_to_the_training_data_file_:_training_data_poses.csv")
Train_Class = train_data.loc[:, 'Classification']
test_data = pd.read_csv(filepath_or_buffer = "Pathway_to_the_test_data_file")

### Step 3: Loading PLEC fingerprints of training and test data 

In [None]:
#Locations of the PLEC fingerprint files (one row per pose)
plec_train_path = 'Pathway_to_the_PLEC_fingerprints_of_training_data_:_training_data_PLEC.csv'
plec_test_path = 'Pathway_to_the_PLEC_fingerprints_of_test_data'

#header = None makes pandas treat every row, including the first one, as data
d_train_csv = pd.read_csv(plec_train_path, header = None)
d_test_csv = pd.read_csv(plec_test_path, header = None)

### Step 4: Training and testing machine-learning classification models 

#### Option 1: Random Forest 

**1. For the pose selection/classification task**: where the **Real Class of the pose** (good or bad pose) is already known

In [None]:
#Train the RF model on the training set poses
#(d_train_csv: PLEC fingerprints from Step 3, Train_Class: pose classes from Step 2):
rf_plec = RandomForestClassifier(n_estimators = 400, max_features = 'sqrt', n_jobs = 30)
rf_plec.fit(d_train_csv, Train_Class)

#Predict the Good Pose Probability for the test set poses.
#Column 1 of predict_proba is taken as the "Good" probability. scikit-learn orders these
#columns like rf_plec.classes_, so this assumes the training labels are exactly "Bad" and
#"Good" -- TODO confirm against the training data file:
prediction_test_rf_plec_prob = rf_plec.predict_proba(d_test_csv)
plec_result_rf = pd.DataFrame({"Good_Pose_Prob": prediction_test_rf_plec_prob[:, 1], "Real_Class": Test_Class})

#Classify the test set poses in a single vectorised step: "Good" when the probability
#is strictly above 0.5, "Bad" otherwise. This yields exactly one label per row whatever
#the index labels of the table are:
df_Predicted_Class = np.where(plec_result_rf["Good_Pose_Prob"] > 0.5, "Good", "Bad")
plec_result_rf['Predicted_Class'] = df_Predicted_Class

#Save the output as a csv file.
#The first column of the test data file is stored as 'Pose', the second one as 'RMSD':
rmsd = test_data.iloc[:, 1]
pose = test_data.iloc[:, 0]
plec_result_rf['RMSD'] = rmsd
plec_result_rf['Pose'] = pose
plec_result_rf.to_csv("Pathway_to_the_CSV_result_file")

**2. For the virtual screening task**: where the **Real Class of the pose** (good or bad pose) is not known

In [None]:
#Fit a 400-tree Random Forest on the PLEC fingerprints of the training set poses
rf_plec = RandomForestClassifier(n_estimators = 400, max_features = 'sqrt', n_jobs = 30).fit(d_train_csv, Train_Class)

#Class probabilities of the test set poses; column 1 is kept as the Good Pose Probability
prediction_test_rf_plec_prob = rf_plec.predict_proba(d_test_csv)

#Attach the first column of the test data file as 'Pose' and write the table to a csv file
pose = test_data.iloc[:, 0]
plec_result_rf = pd.DataFrame({"Good_Pose_Prob": prediction_test_rf_plec_prob[:, 1]}).assign(Pose = pose)
plec_result_rf.to_csv("Pathway_to_the_CSV_result_file")

#### Option 2: Extreme Gradient Boosting 

**1. For the pose selection/classification task**: where the **Real Class of the pose** (good or bad pose) is already known

In [None]:
#Train the XGB model on the training set poses
#(d_train_csv: PLEC fingerprints from Step 3, Train_Class: pose classes from Step 2).
#NOTE(review): recent xgboost releases appear to require integer class labels (0/1)
#instead of strings -- confirm that the version pinned in protocol-env accepts Train_Class:
xgb_plec = XGBClassifier(n_jobs = 40)
xgb_plec.fit(np.array(d_train_csv), Train_Class)

#Predict the Good Pose Probability for the test set poses.
#Column 1 of predict_proba is taken as the "Good" probability, which assumes that "Good"
#is the second class of the fitted model -- TODO confirm against the training data file:
prediction_test_xgb_plec_prob = xgb_plec.predict_proba(np.array(d_test_csv))
plec_result_xgb = pd.DataFrame({"Good_Pose_Prob": prediction_test_xgb_plec_prob[:, 1], "Real_Class": Test_Class})

#Classify the test set poses in a single vectorised step: "Good" when the probability
#is strictly above 0.5, "Bad" otherwise. This yields exactly one label per row whatever
#the index labels of the table are:
df_Predicted_Class = np.where(plec_result_xgb["Good_Pose_Prob"] > 0.5, "Good", "Bad")
plec_result_xgb['Predicted_Class'] = df_Predicted_Class

#Save the output as a csv file.
#The first column of the test data file is stored as 'Pose', the second one as 'RMSD':
rmsd = test_data.iloc[:, 1]
pose = test_data.iloc[:, 0]
plec_result_xgb['RMSD'] = rmsd
plec_result_xgb['Pose'] = pose
plec_result_xgb.to_csv("Pathway_to_the_CSV_result_file")

**2. For the virtual screening task**: where the **Real Class of the pose** (good or bad pose) is not known

In [None]:
#Fit an XGBoost classifier on the PLEC fingerprints of the training set poses.
#NOTE(review): recent xgboost releases appear to require integer class labels (0/1)
#instead of strings -- confirm that the version pinned in protocol-env accepts Train_Class
train_fingerprints = np.array(d_train_csv)
test_fingerprints = np.array(d_test_csv)
xgb_plec = XGBClassifier(n_jobs = 40).fit(train_fingerprints, Train_Class)

#Class probabilities of the test set poses; column 1 is kept as the Good Pose Probability
prediction_test_xgb_plec_prob = xgb_plec.predict_proba(test_fingerprints)

#Attach the first column of the test data file as 'Pose' and write the table to a csv file
pose = test_data.iloc[:, 0]
plec_result_xgb = pd.DataFrame({"Good_Pose_Prob": prediction_test_xgb_plec_prob[:, 1]}).assign(Pose = pose)
plec_result_xgb.to_csv("Pathway_to_the_CSV_result_file")

#### Option 3: Support Vector Machine 

**1. For the pose selection/classification task**: where the **Real Class of the pose** (good or bad pose) is already known

In [None]:
#Train the SVM model on the training set poses
#(d_train_csv: PLEC fingerprints from Step 3, Train_Class: pose classes from Step 2).
#probability = True is what makes predict_proba available on the fitted model:
svm_plec = SVC(degree = 3, kernel = "rbf", probability = True)
svm_plec.fit(d_train_csv, Train_Class)

#Predict the Good Pose Probability for the test set poses.
#Column 1 of predict_proba is taken as the "Good" probability. scikit-learn orders these
#columns like svm_plec.classes_, so this assumes the training labels are exactly "Bad" and
#"Good" -- TODO confirm against the training data file:
prediction_test_svm_plec_prob = svm_plec.predict_proba(d_test_csv)
plec_result_svm = pd.DataFrame({"Good_Pose_Prob": prediction_test_svm_plec_prob[:, 1], "Real_Class": Test_Class})

#Classify the test set poses in a single vectorised step: "Good" when the probability
#is strictly above 0.5, "Bad" otherwise. This yields exactly one label per row whatever
#the index labels of the table are:
df_Predicted_Class = np.where(plec_result_svm["Good_Pose_Prob"] > 0.5, "Good", "Bad")
plec_result_svm['Predicted_Class'] = df_Predicted_Class

#Save the output as a csv file.
#The first column of the test data file is stored as 'Pose', the second one as 'RMSD':
rmsd = test_data.iloc[:, 1]
pose = test_data.iloc[:, 0]
plec_result_svm['RMSD'] = rmsd
plec_result_svm['Pose'] = pose
plec_result_svm.to_csv("Pathway_to_the_CSV_result_file")

**2. For the virtual screening task**: where the **Real Class of the pose** (good or bad pose) is not known

In [None]:
#Fit an RBF-kernel SVM on the PLEC fingerprints of the training set poses;
#probability = True is what makes predict_proba available on the fitted model
svm_plec = SVC(kernel = "rbf", degree = 3, probability = True).fit(d_train_csv, Train_Class)

#Class probabilities of the test set poses; column 1 is kept as the Good Pose Probability
prediction_test_svm_plec_prob = svm_plec.predict_proba(d_test_csv)

#Attach the first column of the test data file as 'Pose' and write the table to a csv file
pose = test_data.iloc[:, 0]
plec_result_svm = pd.DataFrame({"Good_Pose_Prob": prediction_test_svm_plec_prob[:, 1]}).assign(Pose = pose)
plec_result_svm.to_csv("Pathway_to_the_CSV_result_file")

#### Option 4: Artificial Neural Network 

**1. For the pose selection/classification task**: where the **Real Class of the pose** (good or bad pose) is already known

In [None]:
#Train the ANN model on the training set poses
#(d_train_csv: PLEC fingerprints from Step 3, Train_Class: pose classes from Step 2):
ann_plec = MLPClassifier(max_iter = 9000)
ann_plec.fit(d_train_csv, Train_Class)

#Predict the Good Pose Probability for the test set poses.
#Column 1 of predict_proba is taken as the "Good" probability. scikit-learn orders these
#columns like ann_plec.classes_, so this assumes the training labels are exactly "Bad" and
#"Good" -- TODO confirm against the training data file:
prediction_test_ann_plec_prob = ann_plec.predict_proba(d_test_csv)
plec_result_ann = pd.DataFrame({"Good_Pose_Prob": prediction_test_ann_plec_prob[:, 1], "Real_Class": Test_Class})

#Classify the test set poses in a single vectorised step: "Good" when the probability
#is strictly above 0.5, "Bad" otherwise. This yields exactly one label per row whatever
#the index labels of the table are:
df_Predicted_Class = np.where(plec_result_ann["Good_Pose_Prob"] > 0.5, "Good", "Bad")
plec_result_ann['Predicted_Class'] = df_Predicted_Class

#Save the output as a csv file.
#The first column of the test data file is stored as 'Pose', the second one as 'RMSD':
rmsd = test_data.iloc[:, 1]
pose = test_data.iloc[:, 0]
plec_result_ann['RMSD'] = rmsd
plec_result_ann['Pose'] = pose
plec_result_ann.to_csv("Pathway_to_the_CSV_result_file")

**2. For the virtual screening task**: where the **Real Class of the pose** (good or bad pose) is not known

In [None]:
#Fit a multi-layer perceptron (at most 9000 training iterations) on the PLEC
#fingerprints of the training set poses
ann_plec = MLPClassifier(max_iter = 9000).fit(d_train_csv, Train_Class)

#Class probabilities of the test set poses; column 1 is kept as the Good Pose Probability
prediction_test_ann_plec_prob = ann_plec.predict_proba(d_test_csv)

#Attach the first column of the test data file as 'Pose' and write the table to a csv file
pose = test_data.iloc[:, 0]
plec_result_ann = pd.DataFrame({"Good_Pose_Prob": prediction_test_ann_plec_prob[:, 1]}).assign(Pose = pose)
plec_result_ann.to_csv("Pathway_to_the_CSV_result_file")

### Step 5: Computing evaluation metrics from CSV result files 

**Only for the pose selection/classification task**

In [None]:
#Name of the column that holds the score of each pose.
#'Good_Pose_Prob' is for CSV result files of our classification models (RF, XGB, SVM, ANN).
#For CSV result files of Smina, RF-Score-VS, CNN-Score, set score_column = 'Score' instead.
#NOTE(review): the ROC/PR computations below treat a higher score as more likely "Good" --
#confirm that the 'Score' column of those files follows this convention
score_column = 'Good_Pose_Prob'

#Load the result file and pick the columns used by the metric cells below
df = pd.read_csv('Pathway_to_the_CSV_result_file', sep = ',')
real_class, predicted_class = df['Real_Class'], df['Predicted_Class']
score = df[score_column]

In [None]:
#ROC-AUC: area under the ROC curve, with "Good" poses as the positive class
fpr, tpr, thresholds = metrics.roc_curve(real_class, score, pos_label = "Good")
roc_auc = auc(x = fpr, y = tpr)
roc_auc

In [None]:
#PR-AUC: area under the precision-recall curve, with "Good" poses as the positive class
precision, recall, thresholds = precision_recall_curve(real_class, score, pos_label = "Good")
pr_auc = auc(x = recall, y = precision)
pr_auc

In [None]:
#Balanced accuracy of the predicted classes against the real classes
balanced_accuracy = balanced_accuracy_score(y_true = real_class, y_pred = predicted_class)
balanced_accuracy

In [None]:
#Matthews correlation coefficient of the predicted classes against the real classes
mcc = matthews_corrcoef(y_true = real_class, y_pred = predicted_class)
mcc

In [None]:
#Compute the specificity, i.e. the fraction of truly "Bad" poses that are predicted "Bad".
#The label order is passed explicitly so that the matrix is always 2x2 with "Bad" as the
#negative class, even when one of the two classes does not occur in the result file:
tn, fp, fn, tp = confusion_matrix(real_class, predicted_class, labels = ["Bad", "Good"]).ravel()

#Without any truly "Bad" pose the specificity is undefined and is reported as NaN:
specificity = tn/(tn+fp) if (tn+fp) > 0 else float("nan")
specificity

In [None]:
#Recall of the "Good" class: fraction of truly "Good" poses that are predicted "Good"
recall_good = recall_score(y_true = real_class, y_pred = predicted_class, pos_label = "Good")
recall_good