# ClassyPose: a machine-learning classification model for correct ligand pose prediction

Please cite: XXXX

### Step 1: Importing all Python dependencies

In [None]:
import os
import numpy as np
import pandas as pd
import oddt
import oddt.pandas as opd

from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn import metrics
from sklearn.metrics import precision_recall_curve, auc

### Step 2: Loading CSV data files for training and test sets 

In [None]:
# Pose tables for the two data sets. Both file names are placeholders and
# must be replaced with real paths before running this cell.
train_data = pd.read_csv(
    "Pathway_to_the_training_data_file_:_training_data_poses.csv"
)
test_data = pd.read_csv("Pathway_to_the_test_data_file")

# The 'Classification' column carries the class label of every pose; the
# training labels are the fit target and the test labels are exported as
# "Real_Class" in Step 4.
Train_Class = train_data.loc[:, 'Classification']
Test_Class = test_data.loc[:, 'Classification']

### Step 3: Loading PLEC fingerprints of training and test data 

In [None]:
# PLEC fingerprint tables, one CSV per data set. header=None tells pandas
# that the files have no header row, so the first line is read as data.
# Both file names are placeholders and must be replaced with real paths.
d_train_csv = pd.read_csv(
    'Pathway_to_the_PLEC_fingerprints_of_training_data_:_training_data_PLEC.csv',
    header=None,
)
d_test_csv = pd.read_csv(
    'Pathway_to_the_PLEC_fingerprints_of_test_data',
    header=None,
)

### Step 4: Training and testing machine-learning classification models 

#### Option 1: Random Forest 

In [None]:
# Random Forest: fit on the training fingerprints and labels.
rf_plec = RandomForestClassifier(
    n_estimators=400,
    max_features='sqrt',
    n_jobs=30,
)
rf_plec.fit(d_train_csv, Train_Class)

# Score the test fingerprints: hard class calls and per-class probabilities.
prediction_test_rf_plec_class = rf_plec.predict(d_test_csv)
prediction_test_rf_plec_prob = rf_plec.predict_proba(d_test_csv)

# The first two columns of the test table are carried into the result file.
# NOTE(review): assumes column 0 is the pose identifier and column 1 the
# RMSD - confirm against the layout of the test CSV.
pose = test_data.iloc[:, 0]
rmsd = test_data.iloc[:, 1]

# Probability column 1 belongs to the second entry of rf_plec.classes_;
# presumably "Good" when the labels are "Bad"/"Good" - confirm.
plec_result_rf = pd.DataFrame(
    {
        "Good_Pose_Prob": prediction_test_rf_plec_prob[:, 1],
        "Real_Class": Test_Class,
    }
).assign(RMSD=rmsd, Pose=pose)

# Export the results; the file name is a placeholder to be replaced.
plec_result_rf.to_csv("Pathway_to_the_CSV_result_file")

#### Option 2: Extreme Gradient Boosting 

In [None]:
#Train the XGB model on the training molecules:
# Recent XGBoost releases only accept integer class labels (0 .. n_classes-1),
# while Step 5 treats the string "Good" as the positive label. The labels are
# therefore encoded first: np.unique returns the distinct labels in sorted
# order, and each code is the position of its label in that sorted array.
# Labels that already are 0/1 integers pass through unchanged.
xgb_class_labels, xgb_train_codes = np.unique(np.asarray(Train_Class), return_inverse=True)
xgb_train_codes = xgb_train_codes.ravel()

xgb_plec = XGBClassifier(n_jobs = 40)
xgb_plec.fit(np.array(d_train_csv), xgb_train_codes)

#Test the XGB model on the test molecules:
# Convert the test fingerprints once and reuse the array for both calls.
xgb_test_features = np.array(d_test_csv)
# Map the predicted integer codes back to the original class labels.
xgb_predicted_codes = np.asarray(xgb_plec.predict(xgb_test_features)).astype(int)
prediction_test_xgb_plec_class = xgb_class_labels[xgb_predicted_codes]
prediction_test_xgb_plec_prob = xgb_plec.predict_proba(xgb_test_features)

#Get virtual screening results on the test molecules and export results to a csv file:
# Probability column 1 belongs to code 1, i.e. xgb_class_labels[1];
# presumably "Good" when the labels are "Bad"/"Good" - confirm.
plec_result_xgb = pd.DataFrame({"Good_Pose_Prob": prediction_test_xgb_plec_prob[:, 1], "Real_Class": Test_Class})

# The first two columns of the test table are carried into the result file.
# NOTE(review): assumes column 0 is the pose identifier and column 1 the
# RMSD - confirm against the layout of the test CSV.
rmsd = test_data.iloc[:, 1]
pose = test_data.iloc[:, 0]

plec_result_xgb['RMSD'] = rmsd
plec_result_xgb['Pose'] = pose

# The file name is a placeholder to be replaced with a real path.
plec_result_xgb.to_csv("Pathway_to_the_CSV_result_file")

#### Option 3: Support Vector Machine 

In [None]:
# Support Vector Machine: RBF kernel, with probability=True so that
# predict_proba is available after fitting.
# NOTE(review): scikit-learn documents `degree` as used only by the 'poly'
# kernel, so it should have no effect here - confirm before removing.
svm_plec = SVC(
    degree=3,
    kernel="rbf",
    probability=True,
)
svm_plec.fit(d_train_csv, Train_Class)

# Score the test fingerprints: hard class calls and per-class probabilities.
prediction_test_svm_plec_class = svm_plec.predict(d_test_csv)
prediction_test_svm_plec_prob = svm_plec.predict_proba(d_test_csv)

# The first two columns of the test table are carried into the result file.
# NOTE(review): assumes column 0 is the pose identifier and column 1 the
# RMSD - confirm against the layout of the test CSV.
pose = test_data.iloc[:, 0]
rmsd = test_data.iloc[:, 1]

# Probability column 1 belongs to the second entry of svm_plec.classes_;
# presumably "Good" when the labels are "Bad"/"Good" - confirm.
plec_result_svm = pd.DataFrame(
    {
        "Good_Pose_Prob": prediction_test_svm_plec_prob[:, 1],
        "Real_Class": Test_Class,
    }
).assign(RMSD=rmsd, Pose=pose)

# Export the results; the file name is a placeholder to be replaced.
plec_result_svm.to_csv("Pathway_to_the_CSV_result_file")

#### Option 4: Artificial Neural Network 

In [None]:
# Artificial Neural Network: multi-layer perceptron allowed up to 9000
# training iterations.
ann_plec = MLPClassifier(max_iter=9000)
ann_plec.fit(d_train_csv, Train_Class)

# Score the test fingerprints: hard class calls and per-class probabilities.
prediction_test_ann_plec_class = ann_plec.predict(d_test_csv)
prediction_test_ann_plec_prob = ann_plec.predict_proba(d_test_csv)

# The first two columns of the test table are carried into the result file.
# NOTE(review): assumes column 0 is the pose identifier and column 1 the
# RMSD - confirm against the layout of the test CSV.
pose = test_data.iloc[:, 0]
rmsd = test_data.iloc[:, 1]

# Probability column 1 belongs to the second entry of ann_plec.classes_;
# presumably "Good" when the labels are "Bad"/"Good" - confirm.
plec_result_ann = pd.DataFrame(
    {
        "Good_Pose_Prob": prediction_test_ann_plec_prob[:, 1],
        "Real_Class": Test_Class,
    }
).assign(RMSD=rmsd, Pose=pose)

# Export the results; the file name is a placeholder to be replaced.
plec_result_ann.to_csv("Pathway_to_the_CSV_result_file")

### Step 5: Computing ROC-AUC and PR-AUC values from CSV result files 

In [None]:
# Load one result table exported in Step 4 (placeholder file name) and pull
# out the two columns the metrics below need: the true class of each pose
# and the predicted probability used as its ranking score.
df = pd.read_csv(
    'Pathway_to_the_CSV_result_file',
    sep=',',
)
score, real_class = df['Good_Pose_Prob'], df['Real_Class']

In [None]:
# ROC-AUC: build the ROC curve with "Good" as the positive label, then
# integrate true-positive rate over false-positive rate.
fpr, tpr, thresholds = metrics.roc_curve(
    real_class,
    score,
    pos_label="Good",
)
# Left as a bare expression so the notebook displays the value.
metrics.auc(x=fpr, y=tpr)

In [None]:
# PR-AUC: build the precision-recall curve with "Good" as the positive
# label, then integrate precision over recall.
precision, recall, thresholds = metrics.precision_recall_curve(
    real_class,
    score,
    pos_label="Good",
)
# Left as a bare expression so the notebook displays the value.
metrics.auc(x=recall, y=precision)