In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import sklearn
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score
import pickle

# read data

In [2]:
file = pd.read_csv('/Users/yanlixu/Desktop/git_code/machine_learning_prediction/molnet_bace.csv')
dataset = file[['mol', 'pIC50']]
train_data_x, test_data_x, train_y, test_y = train_test_split(dataset['mol'], dataset['pIC50'], test_size = 0.2, random_state = 1 )


In [3]:
len(train_data_x), len(test_data_x)

(1210, 303)

# calculate fingerPrint

In [4]:
train_mols = [Chem.MolFromSmiles(smi) for smi in train_data_x] # RDKit Mol object
train_fps = [Chem.AllChem.GetMorganFingerprintAsBitVect(mol, 4, 2048) for mol in train_mols]
train_x = np.asarray(train_fps, dtype = float)

test_mols = [Chem.MolFromSmiles(smi) for smi in test_data_x] # RDKit Mol object
test_fps = [Chem.AllChem.GetMorganFingerprintAsBitVect(mol, 4, 2048) for mol in test_mols]
test_x = np.asarray(test_fps, dtype = float)

# Grid search

In [5]:
# 1.参数字典
xgb_param_grid = {'n_estimators':[10, 50, 100, 150, 200],
                  'max_depth':[3, 6, 8, 10]}

# 2.性能指标字典
score_dict = {'mae':make_scorer(mean_absolute_error),
              'mse':make_scorer(mean_squared_error),             
              'mape':make_scorer(mean_absolute_percentage_error),
              'r2':make_scorer(r2_score)}

In [6]:
xgb_reg = xgb.XGBRegressor()
xgb_gs = GridSearchCV(xgb_reg,
                      xgb_param_grid,
                      scoring = score_dict,
                      cv = 5,
                      refit = 'r2',
                      return_train_score = True)

In [7]:
xgb_gs_ecfp = xgb_gs.fit(train_x, train_y)
xgb_model = xgb_gs_ecfp.best_estimator_

In [8]:
xgb_model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=50, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

# Cross Validation of XGBoost

In [9]:
xgb_cv = cross_validate(xgb_model,
                        train_x,
                        train_y,
                        cv = 5,
                        n_jobs = 10,
                        scoring = score_dict,
                        return_train_score = True)

In [10]:
xgb_cv_train_mae = np.mean(xgb_cv['train_mae'])
xgb_cv_train_mse = np.mean(xgb_cv['train_mse'])
xgb_cv_train_mape = np.mean(xgb_cv['train_mape'])
xgb_cv_train_r2 = np.mean(xgb_cv['train_r2'])

xgb_cv_test_mae = np.mean(xgb_cv['test_mae'])
xgb_cv_test_mse = np.mean(xgb_cv['test_mse'])
xgb_cv_test_mape = np.mean(xgb_cv['test_mape'])
xgb_cv_test_r2 = np.mean(xgb_cv['test_r2'])

 nm

IndentationError: unexpected indent (3240233288.py, line 11)

# external test

In [None]:
xgb_ext_pred = xgb_model.predict(test_x)

xgb_ext_df =  pd.DataFrame({'mol':test_data_x.tolist(),
                            'true_pIC50':test_y.tolist(),
                            'test_pIC50':xgb_ext_pred})
xgb_ext_df

# final results summary

In [None]:
xgb_ext_mae = mean_absolute_error(test_y, xgb_ext_pred)
xgb_ext_mse = mean_squared_error(test_y, xgb_ext_pred)
xgb_ext_mape = mean_absolute_percentage_error(test_y, xgb_ext_pred)
xgb_ext_r2 = r2_score(test_y, xgb_ext_pred)

xgb_perf = {'mae':[xgb_cv_test_mae, xgb_ext_mae],
            'mse':[xgb_cv_test_mse, xgb_ext_mse],
            'mape':[xgb_cv_test_mape, xgb_ext_mape],
            'r2':[xgb_cv_test_r2, xgb_ext_r2]}

In [None]:
xgb_perf_df = pd.DataFrame.from_dict(xgb_perf)
xgb_perf_df.index = ['cv', 'ext']
xgb_perf_df

# save model

In [None]:
with open('/Users/yanlixu/Desktop/git_code/machine_learning_prediction/xgb_reg_molnet_bace.pkl', 'wb') as file:
    pickle.dump(xgb_model, file)