In [None]:
!pip install rdkit

In [2]:
# Mounting the Google Drive
#
# Google Colab setup: create a "Replicase" folder in Drive, then
#   - upload all data files to    Replicase/data/
#   - upload all python files to  Replicase/python-files/
# Results are read from / written to Replicase/output/.
import os, sys
from google.colab import drive

drive.mount('/content/drive', force_remount = True)

# One root for every project path, so all locations use the same spelling of
# the Drive mount point and cannot drift apart.
project_dir = "/content/drive/MyDrive/Replicase/"

# Guarded so that re-running the cell does not stack duplicate entries.
if project_dir not in sys.path:
    sys.path.insert(0, project_dir)

# Plain Jupyter setup: skip the Colab-only lines above and instead use
# import sys
# sys.path.insert(0, '<Your dir path>')

# defining the file path
# (trailing slashes are required: later cells build paths by concatenation)
data_filepath = project_dir + "data/"
python_dir_path = project_dir + "python-files/"
output_filepath = project_dir + "output/"

Mounted at /content/drive


In [3]:
# Put the project's python files on the import path. The membership check
# keeps the cell idempotent: re-running it does not insert the entry again.
if python_dir_path not in sys.path:
  sys.path.insert(1, python_dir_path)

In [4]:
# ---- Input file names (relative to data_filepath) ----
PRE_PROCESSED_REPLICASE_DATA_FILE = "replicase_data_preprocessed.csv"
PRE_PROCESSED_3CLPRO_DATA_FILE = "3cl-pro_data_preprocessed.csv"
STD_REPLICASE_DATA_FILE = "Replicase_stand_smi_data.csv"
STD_3CLPRO_DATA_FILE = "3cl-pro_stand_smi_data.csv"

# ---- Per-target sub-folders (relative to output_filepath) ----
REPLICASE = "replicase/"
_3CL_PRO = "3cl-pro/"

# ---- Fingerprint file names (one CSV per fingerprint type) ----
MACC_KEYS_DATA_FILE = "macc-keys-data.csv"
RDK_FPTS_DATA_FILE = "rdk-fpts-data.csv"
AP_FPTS_DATA_FILE = "ap-fpts-data.csv"
TT_FPTS_DATA_FILE = "tt-fpts-data.csv"
MORGAN_FPTS_DATA_FILE = "morgan-fpts-data.csv"
FC_FPTS_DATA_FILE = "fc-fpts-data.csv"

# Full input paths.
# If this code is run on local computer, change the path to match your dir path
pre_process_replicase_data_path = f"{data_filepath}{PRE_PROCESSED_REPLICASE_DATA_FILE}"
pre_process_3clpro_data_path = f"{data_filepath}{PRE_PROCESSED_3CLPRO_DATA_FILE}"
std_replicase_data_path = f"{data_filepath}{STD_REPLICASE_DATA_FILE}"
std_3clpro_data_path = f"{data_filepath}{STD_3CLPRO_DATA_FILE}"

# Full output paths for the replicase fingerprint files.
replicase_macc_keys_output_path = f"{output_filepath}{REPLICASE}{MACC_KEYS_DATA_FILE}"
replicase_rdk_fpts_output_path = f"{output_filepath}{REPLICASE}{RDK_FPTS_DATA_FILE}"
replicase_ap_fpts_output_path = f"{output_filepath}{REPLICASE}{AP_FPTS_DATA_FILE}"
replicase_tt_fpts_output_path = f"{output_filepath}{REPLICASE}{TT_FPTS_DATA_FILE}"
replicase_morgan_fpts_output_path = f"{output_filepath}{REPLICASE}{MORGAN_FPTS_DATA_FILE}"
replicase_fc_fpts_output_path = f"{output_filepath}{REPLICASE}{FC_FPTS_DATA_FILE}"

In [5]:
# Other imports
import tempfile
import math

import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools
from rdkit.Chem import Draw
from rdkit.Chem import MolStandardize
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem import MACCSkeys
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from tqdm import tqdm
from rdkit.Chem.Draw import IPythonConsole
# Turn on the "comic" drawing option for molecules rendered in the notebook.
IPythonConsole.drawOptions.comicMode=True
import rdkit

# Record the RDKit version in the cell output for reproducibility.
print(rdkit.__version__)

import seaborn as sns
import matplotlib.pyplot as plt

2023.09.2


In [6]:
# Column-name constants. PCHEMBL_VALUE and BIOACTIVITY_CLASS are used as
# DataFrame columns in the cells below; the others are presumably column names
# of the pre-processed / standardized data as well (not referenced in the
# visible cells).
MOLECULE_CHEMBL_ID = "molecule_chembl_id"
CANONICAL_SMILES = "canonical_smiles"
STANDARDIZED_SMILES = "standardized_smiles"
STANDARDIZED_MOLECULE = "standardized_molecule"
PCHEMBL_VALUE = "pchembl_value"
BIOACTIVITY_CLASS = "bioactivity_class"

# Textual names of the two bioactivity classes.
ACTIVE, INACTIVE = "active", "inactive"

In [7]:
# Load the standardized datasets, creating any missing cache file on the way.
#
# Only std_replicase_data and std_3clpro_data are required after this cell.
# When a standardized CSV already exists it is simply read back.
#
# NOTE(review): main() and standard_preprocess() are neither defined nor
# imported in the visible cells - presumably they live in a module under
# python_dir_path; confirm they are in scope before running on a fresh kernel.
# NOTE(review): to_csv() is called without index=False, so the row index is
# written as an extra leading column of every cached file.

# Stage 1: rebuild both pre-processed files if at least one of them is missing.
if not (os.path.exists(pre_process_replicase_data_path) and os.path.exists(pre_process_3clpro_data_path)):
  pre_replicase_data, pre_3clpro_data = main()

  pre_replicase_data.to_csv(pre_process_replicase_data_path)
  pre_3clpro_data.to_csv(pre_process_3clpro_data_path)

# Stage 2a: standardized replicase data - read the cache, or build and cache it.
if os.path.exists(std_replicase_data_path):
  std_replicase_data = pd.read_csv(std_replicase_data_path)
else:
  pre_replicase_data = pd.read_csv(pre_process_replicase_data_path)
  std_replicase_data = standard_preprocess(pre_replicase_data)
  std_replicase_data.to_csv(std_replicase_data_path)

# Stage 2b: standardized 3CL-pro data - same pattern.
if os.path.exists(std_3clpro_data_path):
  std_3clpro_data = pd.read_csv(std_3clpro_data_path)
else:
  pre_3clpro_data = pd.read_csv(pre_process_3clpro_data_path)
  std_3clpro_data = standard_preprocess(pre_3clpro_data)
  std_3clpro_data.to_csv(std_3clpro_data_path)

In [8]:
# Load the Morgan fingerprint matrix for the replicase compounds.
replicase_morgan_fpts = pd.read_csv(replicase_morgan_fpts_output_path, index_col = False)

# Attach the measured activity. The assignment aligns on the row index, so the
# two frames must list the compounds in the same order.
replicase_morgan_fpts[PCHEMBL_VALUE] = std_replicase_data[PCHEMBL_VALUE]

# Binary label: 1 when pChEMBL is strictly greater than 5, otherwise 0.
# Vectorised comparison instead of a per-row lambda; a missing value compares
# as False and therefore still maps to 0.
replicase_morgan_fpts[BIOACTIVITY_CLASS] = (replicase_morgan_fpts[PCHEMBL_VALUE] > 5).astype(int)

replicase_morgan_fpts.head(10)

Unnamed: 0,Morgan_1,Morgan_2,Morgan_3,Morgan_4,Morgan_5,Morgan_6,Morgan_7,Morgan_8,Morgan_9,Morgan_10,...,Morgan_2041,Morgan_2042,Morgan_2043,Morgan_2044,Morgan_2045,Morgan_2046,Morgan_2047,Morgan_2048,pchembl_value,bioactivity_class
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4.0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4.0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4.82,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5.0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4.1,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5.27,1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4.96,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,4.87,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,4.54,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,4.4,0


In [9]:
# Features: every column except the label and the pChEMBL value it was derived from.
# NOTE(review): any serialized index column in the CSV (e.g. "Unnamed: 0")
# would be kept as a feature here - confirm whether the file contains one.
X = replicase_morgan_fpts.loc[
    :, ~replicase_morgan_fpts.columns.isin([BIOACTIVITY_CLASS, PCHEMBL_VALUE])
]

# Target: kept as a one-column DataFrame rather than a Series.
Y = replicase_morgan_fpts.loc[:, [BIOACTIVITY_CLASS]]

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size = 0.2,
    random_state = 3,
)

In [13]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_recall_fscore_support, classification_report
from sklearn.tree import export_graphviz

# accuracy_score, f1_score, precision_score, recall_score,

def plot_confusion_matrix(cm, display_labels, title):
    """Draw a confusion matrix, title it and save the figure to disk.

    Args:
        cm: Confusion matrix as accepted by ``ConfusionMatrixDisplay``.
        display_labels: Class labels shown on the axes, in matrix order.
        title: Text appended to ``'Confusion Matrix : '`` for the plot title;
            the same string is passed unchanged to ``savefig`` as the file name.

    Returns:
        None. The figure is left open so the notebook can render it.
    """
    # Create the axes first and hand them to the display. When no ``ax`` is
    # given, ConfusionMatrixDisplay.plot() opens a figure of its own, which
    # would leave a separate empty 8x8 figure behind and ignore the size.
    fig, ax = plt.subplots(figsize=(8,8))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels)
    disp.plot(ax=ax, xticks_rotation='vertical')
    ax.set_title('Confusion Matrix : {}'.format(title))
    ax.grid(False)
    fig.savefig(title)

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB

# Module-level accumulator: every train_plot_* call appends one
# classification_report dict, so entries are stored in call order.
classification_reports = []

def train_plot_rf(X_train, y_train, X_test, y_test, idx):
  """Fit a random forest, store its report and plot its confusion matrix.

  The classifier is trained on (X_train, y_train) and evaluated on
  (X_test, y_test). The classification report (as a dict) is appended to the
  module-level ``classification_reports`` list; ``idx`` only labels the plot.
  """
  forest = RandomForestClassifier(n_estimators = 30, max_features = "sqrt", random_state = 42)
  # y_train is flattened to 1-D before fitting.
  forest.fit(X_train, y_train.values.ravel())
  predicted = forest.predict(X_test)

  classification_reports.append(
      classification_report(y_test, predicted, output_dict= True)
  )

  plot_confusion_matrix(
      confusion_matrix(y_test, predicted),
      ["0", "1"],
      "Random Forest: {}".format(idx),
  )

def train_plot_gb(X_train, y_train, X_test, y_test, idx):
  """Fit a gradient-boosting classifier, store its report and plot its confusion matrix.

  Trains on (X_train, y_train), predicts on X_test, appends the
  classification report dict to the module-level ``classification_reports``
  list and draws the confusion matrix labelled with ``idx``.
  """
  booster = GradientBoostingClassifier(n_estimators = 40, learning_rate = 0.1, max_depth=1, random_state=0)
  # y_train is flattened to 1-D before fitting.
  booster.fit(X_train, y_train.values.ravel())
  predicted = booster.predict(X_test)

  report = classification_report(y_test, predicted, output_dict= True)
  matrix = confusion_matrix(y_test, predicted)

  classification_reports.append(report)
  plot_confusion_matrix(matrix, ["0", "1"], "Gradient Boosting: {}".format(idx))

def train_plot_bnb(X_train, y_train, X_test, y_test, idx):
  """Fit a Bernoulli naive Bayes model, store its report and plot its confusion matrix.

  Trains on (X_train, y_train), predicts on X_test, appends the
  classification report dict to the module-level ``classification_reports``
  list and draws the confusion matrix labelled with ``idx``.
  """
  naive_bayes = BernoulliNB(alpha = 6e-9, force_alpha= False)
  # y_train is flattened to 1-D before fitting.
  naive_bayes.fit(X_train, y_train.values.ravel())
  predicted = naive_bayes.predict(X_test)

  classification_reports.append(
      classification_report(y_test, predicted, output_dict= True)
  )

  plot_title = "Bernoulli NB: {}".format(idx)
  plot_confusion_matrix(confusion_matrix(y_test, predicted), ["0", "1"], plot_title)


In [None]:
# Train the three classifiers on each fingerprint type.
# The summary table reads classification_reports positionally, so both the
# file order here and the rf -> gb -> bnb call order below matter.
fingerprint_paths = [replicase_ap_fpts_output_path, replicase_tt_fpts_output_path, replicase_morgan_fpts_output_path]

# Start from an empty list so re-running this cell replaces the earlier
# results instead of appending a second batch behind them.
classification_reports.clear()

for idx, file_path in enumerate(fingerprint_paths):
  data = pd.read_csv(file_path, index_col = False)

  # Attach the activity values (aligned on the row index) and derive the
  # binary label: 1 when pChEMBL is strictly greater than 5, otherwise 0.
  data[PCHEMBL_VALUE] = std_replicase_data[PCHEMBL_VALUE]
  data[BIOACTIVITY_CLASS] = (data[PCHEMBL_VALUE] > 5).astype(int)

  # Features are all columns except the label and the value it derives from;
  # the target stays a one-column DataFrame.
  X = data.iloc[:, ~data.columns.isin([BIOACTIVITY_CLASS, PCHEMBL_VALUE])]
  Y = data.loc[:, data.columns == BIOACTIVITY_CLASS]
  X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 3)

  train_plot_rf(X_train, y_train, X_test, y_test, idx)
  train_plot_gb(X_train, y_train, X_test, y_test, idx)
  train_plot_bnb(X_train, y_train, X_test, y_test, idx)


In [None]:
# Summarise the per-class metrics of every (fingerprint, model) combination.
#
# classification_reports is filled by the training loop in this order:
# for each fingerprint file (atom-pair, topological, Morgan) the models are
# run as random forest, gradient boosting, Bernoulli NB.
feature_order = ["Atom-Pair", "Topological", "Morgan"]
model_order = ["Random Forest", "Gradient Boosting", "BernoulliNB"]

n_expected = len(feature_order) * len(model_order)
if len(classification_reports) < n_expected:
    raise ValueError(
        "Expected at least {} classification reports but found {}; "
        "run the training loop cell first.".format(n_expected, len(classification_reports))
    )

# Use the most recent batch, so a repeated training run is not summarised
# with the stale reports of an earlier one.
latest_reports = classification_reports[-n_expected:]

report_keys = [(feature, model) for feature in feature_order for model in model_order]
reordered_data = [
    {"model": model, "feature": feature, "data": report}
    for (feature, model), report in zip(report_keys, latest_reports)
]


def _summary_row(item):
    """Flatten one entry of reordered_data into a single table row."""
    row = {'Model': item["model"], 'Feature Type': item["feature"]}
    for label in ('0', '1'):
        metrics = item["data"][label]
        row['Precision ' + label] = metrics['precision']
        row['Recall ' + label] = metrics['recall']
        row['F1-Score ' + label] = metrics['f1-score']
    return row


# Build the frame once from a list of records: DataFrame.append no longer
# exists in pandas 2.x, and growing a frame row by row is quadratic.
df_reordered = pd.DataFrame(
    [_summary_row(item) for item in reordered_data],
    columns=['Model', 'Feature Type', 'Precision 0', 'Recall 0', 'F1-Score 0', 'Precision 1', 'Recall 1', 'F1-Score 1'],
)

df_reordered