In [None]:
# Load the SMILES / mechanism-of-action (MoA) table from a tab-separated text file.
import pandas as pd

df = pd.read_csv('top_20_MOAs.txt', delimiter='\t')

In [None]:
# Lookup table: mechanism-of-action (MoA) label -> integer class id (0-19).
MOA_class_dictionary = dict([
    ('EGFR inhibitor', 8),
    ('HDAC inhibitor', 16),
    ('PI3K inhibitor', 13),
    ('acetylcholine receptor agonist', 1),
    ('acetylcholine receptor antagonist', 4),
    ('adrenergic receptor agonist', 18),
    ('adrenergic receptor antagonist', 15),
    ('bacterial cell wall synthesis inhibitor', 14),
    ('benzodiazepine receptor agonist', 10),
    ('calcium channel blocker', 5),
    ('cyclooxygenase inhibitor', 6),
    ('dopamine receptor antagonist', 12),
    ('glucocorticoid receptor agonist', 9),
    ('glutamate receptor antagonist', 19),
    ('histamine receptor antagonist', 17),
    ('phosphodiesterase inhibitor', 3),
    ('serotonin receptor agonist', 7),
    ('serotonin receptor antagonist', 2),
    ('sodium channel blocker', 11),
    ('topoisomerase inhibitor', 0),
])

In [None]:
# Add the integer class column by mapping every MoA label through the lookup table.
# The MoA labels are read from the second column (position 1), as before, but the
# result is assigned by column *name*, so it always lands in 'classes' no matter
# how many columns the input file has (a hard-coded target position would
# silently overwrite another column if the file had more than two).
moa_labels = df.iloc[:, 1]
unmapped_moas = moa_labels[~moa_labels.isin(list(MOA_class_dictionary))].unique().tolist()
if unmapped_moas:
    # Same exception type a direct dictionary lookup would raise for an unknown label.
    raise KeyError(f"MoA labels missing from MOA_class_dictionary: {unmapped_moas}")
df['classes'] = moa_labels.map(MOA_class_dictionary)

In [None]:
import rdkit
import numpy as np
from rdkit import *
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

In [None]:
# Hold out a stratified 10% test set; the remainder is kept as the
# train/validation pool.
from sklearn.model_selection import train_test_split

x_train_valid, x_test, y_train_valid, y_test = train_test_split(
    df['SMILES'],
    df['classes'],
    test_size=10/100,
    stratify=df['classes'],
    shuffle=True,
    random_state=1000,
)

In [None]:
# Build 9 stratified folds over the train/validation pool and record the
# positional train/validation indices of every fold for later selection.
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=9)
# Convert the pool to arrays once; split() yields positions into these arrays.
# (The former get_n_splits() call only returned the fold count, which was
# discarded, so it has been dropped.)
smiles_pool = np.array(list(x_train_valid))
label_pool = np.array(list(y_train_valid))
train_index_list = []
valid_index_list = []
for train_index, valid_index in skf.split(smiles_pool, label_pool):
    train_index_list.append(train_index)
    valid_index_list.append(valid_index)

In [None]:
number_of_kfold = 6  # which of the 9 folds to use for validation (valid values: 0-8)

# Positional indices of the chosen fold.
fold_train_idx = train_index_list[number_of_kfold]
fold_valid_idx = valid_index_list[number_of_kfold]

# Index into array views of the pool, then convert the selections back to lists.
fold_smiles = np.array(list(x_train_valid))
fold_labels = np.array(list(y_train_valid))
x_train = list(fold_smiles[fold_train_idx])
x_valid = list(fold_smiles[fold_valid_idx])
y_train = list(fold_labels[fold_train_idx])
y_valid = list(fold_labels[fold_valid_idx])

# The held-out test split is simply materialised as plain lists.
x_test, y_test = list(x_test), list(y_test)

In [None]:
# Convert every SMILES string to RDKit's canonical form.
def _canonical_smiles(smiles_list):
    """Return the canonical SMILES for each entry of ``smiles_list``, in order.

    Args:
        smiles_list: iterable of SMILES strings.

    Returns:
        list of canonical SMILES strings, one per input entry.

    Raises:
        ValueError: if RDKit cannot parse an entry (``Chem.MolFromSmiles``
            returned ``None``); the message gives the position and the string.
    """
    canonical = []
    for position, smi in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Fail loudly: silently dropping the molecule would misalign the labels.
            raise ValueError(f"Cannot parse SMILES at position {position}: {smi!r}")
        # Second positional argument is kept exactly as in the original call.
        canonical.append(Chem.MolToSmiles(mol, True))
    return canonical


x_train = _canonical_smiles(x_train)
x_valid = _canonical_smiles(x_valid)
x_test = _canonical_smiles(x_test)

In [None]:
def smiles_to_array(smiles, radius=2, n_bits=2048):
    """Encode a SMILES string as a Morgan fingerprint bit array.

    Args:
        smiles: SMILES string of the molecule.
        radius: Morgan fingerprint radius (default 2, the previously hard-coded value).
        n_bits: length of the bit vector (default 2048, which matches the
            2048-column feature matrices the callers allocate).

    Returns:
        1-D numpy integer array of length ``n_bits`` holding the fingerprint bits.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        raise ValueError(f"Cannot parse SMILES: {smiles!r}")
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=n_bits)
    # ConvertToNumpyArray resizes the destination array to the fingerprint length.
    bits = np.zeros((0,))
    DataStructs.ConvertToNumpyArray(fingerprint, bits)
    return bits.astype(int)

In [None]:
# Featurize the training SMILES: one 2048-wide float32 row per molecule.
train_x = np.zeros((len(x_train), 2048), dtype=np.float32)
for row, smi in enumerate(x_train):
    train_x[row] = smiles_to_array(smi)

In [None]:
# Featurize the validation SMILES: one 2048-wide float32 row per molecule.
valid_x = np.zeros((len(x_valid), 2048), dtype=np.float32)
for row, smi in enumerate(x_valid):
    valid_x[row] = smiles_to_array(smi)

In [None]:
# Featurize the test SMILES: one 2048-wide float32 row per molecule.
test_x = np.zeros((len(x_test), 2048), dtype=np.float32)
for row, smi in enumerate(x_test):
    test_x[row] = smiles_to_array(smi)

In [None]:
# Turn the three label lists into integer numpy arrays.
y_train, y_valid, y_test = (
    np.array(labels).astype(int) for labels in (y_train, y_valid, y_test)
)

In [None]:
# Create balanced class weights from the training labels.
from sklearn.utils import class_weight

y_unique = np.unique(np.array(y_train))
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=y_unique,
    y=np.array(y_train),
)
# Key each weight by its actual class label. compute_class_weight returns the
# weights in the order of `classes`, so pairing with y_unique is always correct,
# whereas a positional index only matches when the labels are exactly 0..n-1.
class_weights_dict45 = {
    int(label): float(weight) for label, weight in zip(y_unique, class_weights)
}

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.metrics import classification_report

In [None]:
# Guard: the first five test labels must equal the expected values.
assert list(y_test[0:5]) == [14, 12, 6, 13, 14]

# Class-weighted random forest; report test accuracy and the per-class metrics.
therandomforest = RandomForestClassifier(random_state=0, class_weight=class_weights_dict45)
therandomforest.fit(train_x, y_train)
rf_accuracy = therandomforest.score(test_x, y_test)
rf_report = classification_report(y_test, therandomforest.predict(test_x))
print(rf_accuracy, rf_report)

In [None]:
# Guard: the first five test labels must equal the expected values.
assert list(y_test[0:5]) == [14, 12, 6, 13, 14]
from sklearn.neighbors import KNeighborsClassifier

# 1-nearest-neighbour classifier (kd-tree search); report test accuracy and per-class metrics.
theneighbor = KNeighborsClassifier(n_neighbors=1, algorithm='kd_tree')
theneighbor.fit(train_x, y_train)
knn_accuracy = theneighbor.score(test_x, y_test)
knn_report = classification_report(y_test, theneighbor.predict(test_x))
print(knn_accuracy, knn_report)

In [None]:
# Guard: the first five test labels must equal the expected values.
assert list(y_test[0:5]) == [14, 12, 6, 13, 14]
from sklearn.linear_model import LogisticRegression

# Class-weighted logistic regression; report test accuracy and per-class metrics.
thelogisticregression = LogisticRegression(random_state=0, class_weight=class_weights_dict45)
thelogisticregression.fit(train_x, y_train)
logreg_accuracy = thelogisticregression.score(test_x, y_test)
logreg_report = classification_report(y_test, thelogisticregression.predict(test_x))
print(logreg_accuracy, logreg_report)

In [None]:
# Guard: the first five test labels must equal the expected values.
assert list(y_test[0:5]) == [14, 12, 6, 13, 14]
from lightgbm import LGBMClassifier

# Class-weighted LightGBM classifier; report test accuracy and per-class metrics.
thelgbclassifier = LGBMClassifier(class_weight=class_weights_dict45)
thelgbclassifier.fit(train_x, y_train)
lgbm_accuracy = thelgbclassifier.score(test_x, y_test)
lgbm_report = classification_report(y_test, thelgbclassifier.predict(test_x))
print(lgbm_accuracy, lgbm_report)

In [None]:
from catboost import CatBoostClassifier

# Guard: the first five test labels must equal the expected values.
assert list(y_test[0:5]) == [14, 12, 6, 13, 14]

# Class-weighted CatBoost classifier, silent, with GPU training requested.
thecatboost = CatBoostClassifier(
    verbose=0,
    class_weights=class_weights_dict45,
    task_type="GPU",
)
thecatboost.fit(train_x, y_train)
catboost_accuracy = thecatboost.score(test_x, y_test)
catboost_report = classification_report(y_test, thecatboost.predict(test_x))
print(catboost_accuracy, catboost_report)

In [None]:
# Guard: the first five test labels must equal the expected values.
assert list(y_test[0:5]) == [14, 12, 6, 13, 14]
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble built on the random forest. The base model is passed
# positionally because its keyword was renamed from `base_estimator` to
# `estimator` in scikit-learn 1.2 (old name removed in 1.4); the first
# positional parameter is the base model in both old and new versions.
thebagging = BaggingClassifier(therandomforest, random_state=0)
thebagging.fit(train_x, y_train)
bagging_accuracy = thebagging.score(test_x, y_test)
bagging_report = classification_report(y_test, thebagging.predict(test_x))
print(bagging_accuracy, bagging_report)

In [None]:
# Guard: the first five test labels must equal the expected values.
assert list(y_test[0:5]) == [14, 12, 6, 13, 14]
from sklearn.ensemble import StackingClassifier

# Stack random forest, CatBoost and logistic regression; the random forest
# configuration is also used as the final estimator.
estimators = [
    ('therandomforest', therandomforest),
    ('thecatboost', thecatboost),
    ('thelogisticregression', thelogisticregression),
]
thestacking = StackingClassifier(estimators=estimators, final_estimator=therandomforest)
thestacking.fit(train_x, y_train)
stacking_accuracy = thestacking.score(test_x, y_test)
stacking_report = classification_report(y_test, thestacking.predict(test_x))
print(stacking_accuracy, stacking_report)

In [None]:
# Guard: the first five test labels must equal the expected values.
assert list(y_test[0:5]) == [14, 12, 6, 13, 14]

# Soft-voting ensemble over random forest, CatBoost and logistic regression.
voting_members = [
    ('therandomforest', therandomforest),
    ('thecatboost', thecatboost),
    ('thelogisticregression', thelogisticregression),
]
thevoting = VotingClassifier(estimators=voting_members, voting='soft', n_jobs=-1)
thevoting.fit(train_x, y_train)
voting_accuracy = thevoting.score(test_x, y_test)
voting_report = classification_report(y_test, thevoting.predict(test_x))
print(voting_accuracy, voting_report)

In [None]:
# Guard: the first five test labels must equal the expected values.
assert list(y_test[0:5]) == [14, 12, 6, 13, 14]
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over the random forest. The base model is passed positionally because
# its keyword was renamed from `base_estimator` to `estimator` in scikit-learn 1.2
# (old name removed in 1.4); the first positional parameter is the base model in
# both old and new versions. random_state is fixed so repeated runs give the same
# result, consistent with the other seeded models in this notebook.
theadaboost = AdaBoostClassifier(therandomforest, random_state=0)
theadaboost.fit(train_x, y_train)
adaboost_accuracy = theadaboost.score(test_x, y_test)
adaboost_report = classification_report(y_test, theadaboost.predict(test_x))
print(adaboost_accuracy, adaboost_report)

In [None]:
# References
# https://future-chem.com/rdkit-google-colab/#toc5
# https://www.rdkit.org/docs/index.html