In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import sklearn
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
import pickle

# read data

In [2]:
file = pd.read_csv('/Users/yanlixu/Desktop/git_code/machine_learning_prediction/molnet_bace.csv')
dataset = file[['mol', 'Class']]
train_data_x, test_data_x, train_y, test_y = train_test_split(dataset['mol'], dataset['Class'], test_size = 0.2, random_state = 1 )


In [3]:
len(train_data_x), len(test_data_x)

(1210, 303)

# calculate fingerprint

In [4]:
train_mols = [Chem.MolFromSmiles(smi) for smi in train_data_x] # RDKit Mol object
train_fps = [Chem.AllChem.GetMorganFingerprintAsBitVect(mol, 4, 2048) for mol in train_mols]
train_x = np.asarray(train_fps, dtype = float)

test_mols = [Chem.MolFromSmiles(smi) for smi in test_data_x] # RDKit Mol object
test_fps = [Chem.AllChem.GetMorganFingerprintAsBitVect(mol, 4, 2048) for mol in test_mols]
test_x = np.asarray(test_fps, dtype = float)

# Grid search

In [5]:
scoring = {'AUC':'roc_auc',
           'ACC':make_scorer(accuracy_score),
           'SEN':make_scorer(recall_score)}

xgb_param_grid = {'n_estimators':[2, 4, 10, 30, 50, 70, 100],
                  'max_depth':[1, 2, 3, 5, 6, 7, 8, 9, 10]}

svm_param_dict = {'C':[0.1, 0.5, 1, 2, 3, 4, 5],
                  'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
                  'probability':[True]}

rf_param_dict = {'n_estimators':[50, 100, 120, 150, 160, 180, 200, 500],
                 'max_depth':[10, 50, 100, 150, 200],
                 'max_features': ["auto","sqrt","log2"]}

knn_param_dict = {'n_neighbors':[5, 10, 15, 20],
                  'weights':['uniform', 'distance']}

In [6]:
#xgb_classifier = xgb.XGBClassifier(use_label_encoder=False)
xgb_classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric='auc')

xgb_gs = GridSearchCV(xgb_classifier,
                      xgb_param_grid,
                      scoring = scoring,
                      cv = 5,
                      n_jobs = 12,
                      refit = 'AUC',
                      return_train_score = True)

xgb_gs_ecfp = xgb_gs.fit(train_x, train_y)
xgb_model = xgb_gs_ecfp.best_estimator_

In [7]:
# svm
svm_classifier = SVC()
svm_gs = GridSearchCV(estimator = svm_classifier, param_grid = svm_param_dict, scoring = scoring, n_jobs = 10, cv = 10, refit = 'AUC', return_train_score = True)

svm_gs_fit = svm_gs.fit(train_x, train_y)
svm_model = svm_gs_fit.best_estimator_


In [8]:
# random forest
rf_classifier = RandomForestClassifier()
rf_gs = GridSearchCV(estimator = rf_classifier,
                     param_grid = rf_param_dict,
                     scoring = scoring,
                     n_jobs = 10,
                     cv = 5, 
                     refit = 'AUC',
                     return_train_score = True)
rf_gs_fit = rf_gs.fit(train_x, train_y)
rf_model = rf_gs_fit.best_estimator_

In [9]:
# random forest
knn_classifier = KNeighborsClassifier()
knn_gs = GridSearchCV(estimator = knn_classifier,
                     param_grid = knn_param_dict,
                     scoring = scoring,
                     n_jobs = 10,
                     cv = 5, 
                     refit = 'AUC',
                     return_train_score = True)
knn_gs_fit = knn_gs.fit(train_x, train_y)
knn_model = knn_gs_fit.best_estimator_

In [10]:
xgb_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='auc', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=7, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=50, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [11]:
svm_model

SVC(C=1, kernel='poly', probability=True)

In [12]:
rf_model

RandomForestClassifier(max_depth=200, max_features='log2', n_estimators=160)

In [13]:
knn_model

KNeighborsClassifier(weights='distance')

# cross validation

In [14]:
xgb_cv = cross_validate(xgb_model,
                        train_x,
                        train_y,
                        cv = 5,
                        n_jobs = 12,
                        scoring = scoring,
                        return_train_score = True)

xgb_cv_train_auc = np.mean(xgb_cv['train_AUC'])
xgb_cv_test_auc = np.mean(xgb_cv['test_AUC'])
xgb_cv_train_acc = np.mean(xgb_cv['train_ACC'])
xgb_cv_test_acc = np.mean(xgb_cv['test_ACC'])

xgb_cv_train_sen = np.mean(xgb_cv['train_SEN'])
xgb_cv_test_sen = np.mean(xgb_cv['test_SEN'])
xgb_cv_train_spc = (xgb_cv_train_acc * len(train_y) - xgb_cv_train_sen * train_y.sum())/(len(train_y)-train_y.sum())
xgb_cv_test_spc = (xgb_cv_test_acc * len(train_y) - xgb_cv_test_sen * train_y.sum())/(len(train_y)-train_y.sum())

In [15]:
svm_cv = cross_validate(estimator = svm_model,
                             X = train_x,
                             y = train_y,
                             scoring = scoring,
                             cv = 5,
                             n_jobs = 10,
                             return_train_score = True)

svm_cv_train_auc = np.mean(svm_cv['train_AUC'])
svm_cv_test_auc = np.mean(svm_cv['test_AUC'])
svm_cv_train_acc = np.mean(svm_cv['train_ACC'])
svm_cv_test_acc = np.mean(svm_cv['test_ACC'])

svm_cv_train_sen = np.mean(svm_cv['train_SEN'])
svm_cv_test_sen = np.mean(svm_cv['test_SEN'])
svm_cv_train_spc = (svm_cv_train_acc * len(train_y) - svm_cv_train_sen * train_y.sum())/(len(train_y)-train_y.sum())
svm_cv_test_spc = (svm_cv_test_acc * len(train_y) - svm_cv_test_sen * train_y.sum())/(len(train_y)-train_y.sum())

In [16]:
rf_cv = cross_validate(estimator = rf_model,
                             X = train_x,
                             y = train_y,
                             scoring = scoring,
                             cv = 5,
                             n_jobs = 10,
                             return_train_score = True)

rf_cv_train_auc = np.mean(rf_cv['train_AUC'])
rf_cv_test_auc = np.mean(rf_cv['test_AUC'])
rf_cv_train_acc = np.mean(rf_cv['train_ACC'])
rf_cv_test_acc = np.mean(rf_cv['test_ACC'])

rf_cv_train_sen = np.mean(rf_cv['train_SEN'])
rf_cv_test_sen = np.mean(rf_cv['test_SEN'])
rf_cv_train_spc = (rf_cv_train_acc * len(train_y) - rf_cv_train_sen * train_y.sum())/(len(train_y)-train_y.sum())
rf_cv_test_spc = (rf_cv_test_acc * len(train_y) - rf_cv_test_sen * train_y.sum())/(len(train_y)-train_y.sum())

In [17]:
knn_cv = cross_validate(estimator = knn_model,
                             X = train_x,
                             y = train_y,
                             scoring = scoring,
                             cv = 5,
                             n_jobs = 10,
                             return_train_score = True)

knn_cv_train_auc = np.mean(knn_cv['train_AUC'])
knn_cv_test_auc = np.mean(knn_cv['test_AUC'])
knn_cv_train_acc = np.mean(knn_cv['train_ACC'])
knn_cv_test_acc = np.mean(knn_cv['test_ACC'])

knn_cv_train_sen = np.mean(knn_cv['train_SEN'])
knn_cv_test_sen = np.mean(knn_cv['test_SEN'])
knn_cv_train_spc = (knn_cv_train_acc * len(train_y) - knn_cv_train_sen * train_y.sum())/(len(train_y)-train_y.sum())
knn_cv_test_spc = (knn_cv_test_acc * len(train_y) - knn_cv_test_sen * train_y.sum())/(len(train_y)-train_y.sum())

# external test

In [18]:
xgb_ext_pred_prob = xgb_model.predict_proba(test_x)
xgb_ext_pred_list = []
for i, ext_score in enumerate(xgb_ext_pred_prob):
    ext_score = ext_score[1]
    xgb_ext_pred_list.append(ext_score)

svm_ext_pred_prob = svm_model.predict_proba(test_x)
svm_ext_pred_list = []
for i, ext_score in enumerate(svm_ext_pred_prob):
    ext_score = ext_score[1]
    svm_ext_pred_list.append(ext_score)
    
rf_ext_pred_prob = rf_model.predict_proba(test_x)
rf_ext_pred_list = []
for i, ext_score in enumerate(rf_ext_pred_prob):
    ext_score = ext_score[1]
    rf_ext_pred_list.append(ext_score)
    
knn_ext_pred_prob = knn_model.predict_proba(test_x)
knn_ext_pred_list = []
for i, ext_score in enumerate(knn_ext_pred_prob):
    ext_score = ext_score[1]
    knn_ext_pred_list.append(ext_score)


ext_test_df =  pd.DataFrame({'mol':xgb_ext_pred_list,
                            'true_class':test_y.tolist(),
                            'xgb_test_prob': xgb_ext_pred_list,
                            'xgb_test_class': [int(item) for item in np.round(xgb_ext_pred_list)],
                            'svm_test_prob': svm_ext_pred_list,
                            'svm_test_class': [int(item) for item in np.round(xgb_ext_pred_list)],
                            'rf_test_prob': rf_ext_pred_list,
                            'rf_test_class': [int(item) for item in np.round(xgb_ext_pred_list)],
                            'knn_test_prob': knn_ext_pred_list,
                            'knn_test_class': [int(item) for item in np.round(xgb_ext_pred_list)]                             
                            })
ext_test_df

Unnamed: 0,mol,true_class,xgb_test_prob,xgb_test_class,svm_test_prob,svm_test_class,rf_test_prob,rf_test_class,knn_test_prob,knn_test_class
0,0.689886,1,0.689886,1,0.854470,1,0.725000,1,0.600661,1
1,0.054249,0,0.054249,0,0.130663,0,0.087500,0,0.000000,0
2,0.027988,0,0.027988,0,0.101019,0,0.081250,0,0.000000,0
3,0.937078,1,0.937078,1,0.947126,1,0.900000,1,1.000000,1
4,0.068259,0,0.068259,0,0.109701,0,0.100000,0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...
298,0.159404,0,0.159404,0,0.332690,0,0.334375,0,0.410490,0
299,0.136806,0,0.136806,0,0.308127,0,0.381250,0,0.600632,0
300,0.964712,1,0.964712,1,0.940730,1,0.975000,1,1.000000,1
301,0.010074,0,0.010074,0,0.108345,0,0.120313,0,0.000000,0


In [19]:
ext_test_df.to_csv('molnet_bace_classification_models_prediction_results.csv')

In [20]:
xgb_ext_pred_array = np.array(xgb_ext_pred_list)
xgb_ext_auc = roc_auc_score(test_y, xgb_ext_pred_list)
xgb_ext_acc = accuracy_score(test_y, np.round(xgb_ext_pred_array))
xgb_ext_sen = recall_score(test_y, np.round(xgb_ext_pred_array))
xgb_ext_spc = (xgb_ext_acc * len(test_y) - xgb_ext_sen * test_y.sum())/(len(test_y)-test_y.sum())

In [21]:
svm_ext_pred_array = np.array(svm_ext_pred_list)
svm_ext_auc = roc_auc_score(test_y, svm_ext_pred_list)
svm_ext_acc = accuracy_score(test_y, np.round(svm_ext_pred_array))
svm_ext_sen = recall_score(test_y, np.round(svm_ext_pred_array))
svm_ext_spc = (svm_ext_acc * len(test_y) - svm_ext_sen * test_y.sum())/(len(test_y)-test_y.sum())

In [22]:
rf_ext_pred_array = np.array(rf_ext_pred_list)
rf_ext_auc = roc_auc_score(test_y, rf_ext_pred_list)
rf_ext_acc = accuracy_score(test_y, np.round(rf_ext_pred_array))
rf_ext_sen = recall_score(test_y, np.round(rf_ext_pred_array))
rf_ext_spc = (rf_ext_acc * len(test_y) - rf_ext_sen * test_y.sum())/(len(test_y)-test_y.sum())

In [23]:
knn_ext_pred_array = np.array(knn_ext_pred_list)
knn_ext_auc = roc_auc_score(test_y, knn_ext_pred_list)
knn_ext_acc = accuracy_score(test_y, np.round(knn_ext_pred_array))
knn_ext_sen = recall_score(test_y, np.round(knn_ext_pred_array))
knn_ext_spc = (knn_ext_acc * len(test_y) - knn_ext_sen * test_y.sum())/(len(test_y)-test_y.sum())

# final results summary

In [24]:
xgb_perf = {'AUC':[xgb_cv_train_auc, xgb_cv_test_auc, xgb_ext_auc],
            'ACC':[xgb_cv_train_acc, xgb_cv_test_acc, xgb_ext_acc],
            'SEN':[xgb_cv_train_sen, xgb_cv_test_sen, xgb_ext_sen],
            'SPC':[xgb_cv_train_spc, xgb_cv_test_spc, xgb_ext_spc]}
xgb_perf_df = pd.DataFrame.from_dict(xgb_perf)
xgb_perf_df.index = ['train_cv','test_cv', 'test_ext']
round(xgb_perf_df, 2)

Unnamed: 0,AUC,ACC,SEN,SPC
train_cv,1.0,0.98,0.98,0.98
test_cv,0.9,0.8,0.78,0.82
test_ext,0.88,0.83,0.86,0.8


In [26]:
svm_perf = {'AUC':[svm_cv_train_auc, svm_cv_test_auc, svm_ext_auc],
            'ACC':[svm_cv_train_acc, svm_cv_test_acc, svm_ext_acc],
            'SEN':[svm_cv_train_sen, svm_cv_test_sen, svm_ext_sen],
            'SPC':[svm_cv_train_spc, svm_cv_test_spc, svm_ext_spc]}
svm_perf_df = pd.DataFrame.from_dict(svm_perf)
svm_perf_df.index = ['train_cv','test_cv', 'test_ext']
round(svm_perf_df, 2)

Unnamed: 0,AUC,ACC,SEN,SPC
train_cv,0.99,0.94,0.92,0.96
test_cv,0.9,0.81,0.74,0.86
test_ext,0.88,0.83,0.83,0.82


In [27]:
rf_perf = {'AUC':[rf_cv_train_auc, rf_cv_test_auc, rf_ext_auc],
            'ACC':[rf_cv_train_acc, rf_cv_test_acc, rf_ext_acc],
            'SEN':[rf_cv_train_sen, rf_cv_test_sen, rf_ext_sen],
            'SPC':[rf_cv_train_spc, rf_cv_test_spc, rf_ext_spc]}
rf_perf_df = pd.DataFrame.from_dict(rf_perf)
rf_perf_df.index = ['train_cv','test_cv', 'test_ext']
round(rf_perf_df, 2)

Unnamed: 0,AUC,ACC,SEN,SPC
train_cv,1.0,0.99,1.0,0.99
test_cv,0.89,0.81,0.78,0.84
test_ext,0.88,0.83,0.84,0.83


In [29]:
knn_perf = {'AUC':[knn_cv_train_auc, knn_cv_test_auc, knn_ext_auc],
            'ACC':[knn_cv_train_acc, knn_cv_test_acc, knn_ext_acc],
            'SEN':[knn_cv_train_sen, knn_cv_test_sen, knn_ext_sen],
            'SPC':[knn_cv_train_spc, knn_cv_test_spc, knn_ext_spc]}
knn_perf_df = pd.DataFrame.from_dict(knn_perf)
knn_perf_df.index = ['train_cv','test_cv', 'test_ext']
round(knn_perf_df, 2)

Unnamed: 0,AUC,ACC,SEN,SPC
train_cv,1.0,0.99,0.99,1.0
test_cv,0.87,0.8,0.8,0.8
test_ext,0.87,0.81,0.86,0.75


In [30]:
classification_res = pd.concat([xgb_perf_df, svm_perf_df, rf_perf_df, knn_perf_df], keys=['xgb', 'svm', 'rf', 'knn'])
classification_res.to_csv('molnet_bace_xgb_svm_rf_knn_classification_metrics.csv')

# save model

In [51]:
with open('/Users/yanlixu/Desktop/git_code/machine_learning_prediction/xgb_class_molnet_bace.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)
with open('/Users/yanlixu/Desktop/git_code/machine_learning_prediction/svm_class_molnet_bace.pkl', 'wb') as f:
    pickle.dump(svm_model, f)
with open('/Users/yanlixu/Desktop/git_code/machine_learning_prediction/rf_class_molnet_bace.pkl', 'wb') as f:
    pickle.dump(rf_model, f)
with open('/Users/yanlixu/Desktop/git_code/machine_learning_prediction/knn_class_molnet_bace.pkl', 'wb') as f:
    pickle.dump(knn_model, f)