In [1]:
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import sklearn
import xgboost as xgb
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# read data

In [2]:
# Directory holding the BACE csv. The original absolute path is kept as the
# default so existing runs are unaffected; set BACE_DATA_DIR to run the
# notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get('BACE_DATA_DIR',
                               '/Users/yanlixu/Desktop/git_code/machine_learning_prediction'))

file = pd.read_csv(DATA_DIR / 'molnet_bace.csv')

# Only the SMILES column ('mol') and the binary label column ('Class') are used.
dataset = file[['mol', 'Class']]

# 80/20 hold-out split with a fixed seed so the external test set is reproducible.
train_data_x, test_data_x, train_y, test_y = train_test_split(
    dataset['mol'],
    dataset['Class'],
    test_size=0.2,
    random_state=1,
)


In [3]:
# Number of molecules in the training and external-test partitions.
(train_data_x.shape[0], test_data_x.shape[0])

(1210, 303)

# Calculate Morgan fingerprints (radius 4, 2048 bits) for the train and test molecules

In [4]:
def smiles_to_fingerprint_array(smiles_list, radius=4, n_bits=2048):
    """Convert SMILES strings to Morgan bit-vector fingerprints.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings, one per molecule.
    radius : int, default 4
        Morgan fingerprint radius.
    n_bits : int, default 2048
        Length of the folded bit vector.

    Returns
    -------
    mols : list
        RDKit Mol objects, in input order.
    fps : list
        Morgan bit vectors, in input order.
    features : numpy.ndarray
        Float array built from ``fps``, used as the model input matrix.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings. Dropping the entry
        silently would misalign the features with the label vector.
    """
    mols = []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)  # RDKit Mol object, or None if unparsable
        if mol is None:
            raise ValueError(f'RDKit could not parse SMILES: {smi!r}')
        mols.append(mol)

    fps = [AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits) for mol in mols]
    return mols, fps, np.asarray(fps, dtype=float)


train_mols, train_fps, train_x = smiles_to_fingerprint_array(train_data_x)
test_mols, test_fps, test_x = smiles_to_fingerprint_array(test_data_x)

# Grid search: tune XGBoost n_estimators and max_depth with 5-fold CV, selecting by ROC AUC

In [5]:
# Metrics collected for every CV fold: ROC AUC (built-in scorer name),
# accuracy and sensitivity (recall of the positive class).
scoring = dict(
    AUC='roc_auc',
    ACC=make_scorer(accuracy_score),
    SEN=make_scorer(recall_score),
)

# Hyper-parameter grid explored by the search: number of boosted trees x tree depth.
xgb_param_grid = dict(
    n_estimators=[2, 4, 10, 30, 50, 70, 100],
    max_depth=[1, 2, 3, 5, 6, 7, 8, 9, 10],
)


In [6]:
# Base estimator; 'auc' is the metric XGBoost itself reports during training.
xgb_classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric='auc')

# 5-fold grid search over xgb_param_grid, scored with every metric in
# `scoring`; the configuration with the best AUC is refit on the data passed
# to fit().
xgb_gs = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=xgb_param_grid,
    scoring=scoring,
    cv=5,
    n_jobs=12,
    refit='AUC',
    return_train_score=True,
)

In [7]:
# Run the search on the training fingerprints; fit() returns the fitted
# search object, which is kept under a fingerprint-specific name.
xgb_gs_ecfp = xgb_gs.fit(X=train_x, y=train_y)

# Estimator refit with the best-AUC hyper-parameters.
xgb_model = xgb_gs_ecfp.best_estimator_

In [8]:
# Display the selected estimator and its hyper-parameters.
xgb_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='auc', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=7, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=50, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

# Cross-validation of the selected model on the training set (5 folds)

In [9]:
# 5-fold cross-validation of the selected model on the training set.
# NOTE(review): these are the same data and fold count the grid search used to
# pick the hyper-parameters, so the CV scores are likely optimistic - confirm
# whether a nested CV is wanted.
cv_options = dict(
    cv=5,
    n_jobs=12,
    scoring=scoring,
    return_train_score=True,
)
xgb_cv = cross_validate(xgb_model, train_x, train_y, **cv_options)

In [10]:
# Fold-averaged AUC, accuracy and sensitivity for the train and validation
# parts of each CV split.
xgb_cv_train_auc = xgb_cv['train_AUC'].mean()
xgb_cv_test_auc = xgb_cv['test_AUC'].mean()
xgb_cv_train_acc = xgb_cv['train_ACC'].mean()
xgb_cv_test_acc = xgb_cv['test_ACC'].mean()
xgb_cv_train_sen = xgb_cv['train_SEN'].mean()
xgb_cv_test_sen = xgb_cv['test_SEN'].mean()

# Specificity is back-calculated from accuracy and sensitivity:
#   TN = ACC * N - SEN * P,  SPC = TN / (N - P)
# NOTE(review): this uses fold-averaged ACC/SEN with the class counts of the
# whole training set rather than per-fold confusion matrices, so it is an
# approximation.
n_train_total = len(train_y)
n_train_pos = train_y.sum()
n_train_neg = n_train_total - n_train_pos
xgb_cv_train_spc = (xgb_cv_train_acc * n_train_total - xgb_cv_train_sen * n_train_pos) / n_train_neg
xgb_cv_test_spc = (xgb_cv_test_acc * n_train_total - xgb_cv_test_sen * n_train_pos) / n_train_neg

# One column per metric, ordered [train, test].
xgb_cross_validation = {
    'AUC': [xgb_cv_train_auc, xgb_cv_test_auc],
    'ACC': [xgb_cv_train_acc, xgb_cv_test_acc],
    'SEN': [xgb_cv_train_sen, xgb_cv_test_sen],
    'SPC': [xgb_cv_train_spc, xgb_cv_test_spc],
}

xgb_cross_validation_df = pd.DataFrame(xgb_cross_validation, index=['train', 'test'])
xgb_cross_validation_df

Unnamed: 0,AUC,ACC,SEN,SPC
train,0.998897,0.982438,0.981851,0.982929
test,0.895242,0.802479,0.77674,0.824


# External test: evaluate the selected model on the held-out 20% split

In [11]:
# Class probabilities for the held-out molecules, one row per molecule.
xgb_ext_pred_prob = xgb_model.predict_proba(test_x)

# Keep only column 1 of each row (the second class in the model's class
# order - presumably label 1, assuming 0/1 labels). Slicing the column
# replaces the per-row Python loop and still yields a list of scalars.
xgb_ext_pred_list = list(xgb_ext_pred_prob[:, 1])

In [12]:
# Held-out metrics. Probabilities are turned into hard labels by rounding.
xgb_ext_pred_array = np.array(xgb_ext_pred_list)
ext_pred_labels = np.round(xgb_ext_pred_array)

xgb_ext_auc = roc_auc_score(test_y, xgb_ext_pred_array)
xgb_ext_acc = accuracy_score(test_y, ext_pred_labels)
xgb_ext_sen = recall_score(test_y, ext_pred_labels)

# Specificity from accuracy and sensitivity: TN = ACC * N - SEN * P, SPC = TN / (N - P).
n_ext_total = len(test_y)
n_ext_pos = test_y.sum()
xgb_ext_spc = (xgb_ext_acc * n_ext_total - xgb_ext_sen * n_ext_pos) / (n_ext_total - n_ext_pos)

In [15]:
# Per-molecule view of the held-out predictions.
# NOTE(review): the 'mol' column holds the predicted probability, not the
# molecule, and 'test_class' holds the predicted class - consider renaming.
ext_pred_classes = np.round(xgb_ext_pred_list).astype(int).tolist()
xgb_ext_df = pd.DataFrame(
    {
        'mol': xgb_ext_pred_list,
        'true_class': test_y.tolist(),
        'test_class': ext_pred_classes,
    }
)
xgb_ext_df

Unnamed: 0,mol,true_class,test_class
0,0.689886,1,1
1,0.054249,0,0
2,0.027988,0,0
3,0.937078,1,1
4,0.068259,0,0
...,...,...,...
298,0.159404,0,0
299,0.136806,0,0
300,0.964712,1,1
301,0.010074,0,0


# final results summary

In [17]:
# Pair each metric's cross-validated value with its external-test value,
# ordered [cv, ext].
perf_metric_names = ('AUC', 'ACC', 'SEN', 'SPC')
perf_cv_values = (xgb_cv_test_auc, xgb_cv_test_acc, xgb_cv_test_sen, xgb_cv_test_spc)
perf_ext_values = (xgb_ext_auc, xgb_ext_acc, xgb_ext_sen, xgb_ext_spc)

xgb_perf = {
    metric: [cv_value, ext_value]
    for metric, cv_value, ext_value in zip(perf_metric_names, perf_cv_values, perf_ext_values)
}


In [18]:
# Summary table: one row for cross-validation, one for the external test set.
xgb_perf_df = pd.DataFrame(xgb_perf, index=['cv', 'ext'])
xgb_perf_df

Unnamed: 0,AUC,ACC,SEN,SPC
cv,0.895242,0.802479,0.77674,0.824
ext,0.877388,0.831683,0.864286,0.803681


In [19]:
# Output directory for the pickled model. The original absolute path is kept
# as the default; set BACE_DATA_DIR to write somewhere else without editing
# this cell.
MODEL_DIR = Path(os.environ.get('BACE_DATA_DIR',
                                '/Users/yanlixu/Desktop/git_code/machine_learning_prediction'))

# Persist the selected estimator. Only unpickle this file from a trusted
# source: loading a pickle can execute arbitrary code.
with open(MODEL_DIR / 'xgb_class_molnet_bace.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)