In [1]:
# Bootstrap a conda installation inside the runtime so RDKit can be installed from conda.
# Download the Miniconda installer (the Python 3.7 build, release 4.8.2).
! wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh
# Make the installer executable.
! chmod +x Miniconda3-py37_4.8.2-Linux-x86_64.sh
# -b: batch mode (no prompts), -f: do not fail if the prefix already exists, -p: install prefix.
! bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local
import sys
# Let the running kernel import packages that conda installs under /usr/local.
sys.path.append('/usr/local/lib/python3.7/site-packages/')
# Install RDKit from the "rdkit" channel; -y answers the confirmation prompt.
# NOTE(review): the RDKit version is not pinned, so re-runs may resolve a different build.
! conda install -c rdkit rdkit -y

--2021-12-26 03:59:29--  https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh
Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.131.3, 104.16.130.3, 2606:4700::6810:8203, ...
Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.131.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 85055499 (81M) [application/x-sh]
Saving to: ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh’


2021-12-26 03:59:29 (238 MB/s) - ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh’ saved [85055499/85055499]

PREFIX=/usr/local
Unpacking payload ...
Collecting package metadata (current_repodata.json): - \ | / - \ done
Solving environment: / - \ | / - done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - _libgcc_mutex==0.1=main
    - asn1crypto==1.3.0=py37_0
    - ca-certificates==2020.1.1=0
    - certifi==2019.11.28=py37_0
    - cffi==1.14.0=py37h2e261b9_0
    - chardet==3.0.4=py37_1003
    - conda-package-han

In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys
import numpy as np
import sklearn
import xgboost as xgb
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score
import pickle


In [3]:
def _fingerprint_matrix(smiles):
  """Turn an iterable of SMILES strings into one fingerprint row per molecule.

  Each row is the Morgan bit vector (radius 2, 2048 bits) followed by the
  MACCS keys of the same molecule, both converted to floats.

  Raises:
    ValueError: if RDKit cannot parse one or more of the SMILES strings.
  """
  smiles = list(smiles)
  mols = [Chem.MolFromSmiles(smi) for smi in smiles]
  # MolFromSmiles returns None instead of raising; fail early with the offending
  # inputs rather than letting the fingerprint call crash on a None molecule.
  unparsed = [smi for smi, mol in zip(smiles, mols) if mol is None]
  if unparsed:
    raise ValueError('RDKit could not parse {} SMILES, e.g. {!r}'.format(len(unparsed), unparsed[:5]))
  morgan = np.asarray([AllChem.GetMorganFingerprintAsBitVect(mol, 2, 2048) for mol in mols], dtype=float)
  maccs = np.asarray([MACCSkeys.GenMACCSKeys(mol) for mol in mols], dtype=float)
  return np.concatenate([morgan, maccs], axis=1)


def _load_xy(csv_file, mol_smi, pIC50):
  """Read one CSV file and return (fingerprint matrix, target column)."""
  data = pd.read_csv(csv_file)
  return _fingerprint_matrix(data[mol_smi]), data[pIC50]


def calculate_fps(train_file, test_file, mol_smi, pIC50):
  """Build fingerprint features and targets for a train/test pair of CSV files.

  Args:
    train_file: path of the training CSV.
    test_file: path of the external-test CSV.
    mol_smi: name of the column holding the SMILES strings.
    pIC50: name of the column holding the regression target.

  Returns:
    (train_x, train_y, test_x, test_y): the x values are 2-D float arrays of
    concatenated Morgan + MACCS fingerprints, the y values are the target
    columns as read by pandas.

  Raises:
    ValueError: if any SMILES string in either file cannot be parsed.
  """
  # Both files go through the same helper so train and test features cannot drift apart.
  train_x, train_y = _load_xy(train_file, mol_smi, pIC50)
  test_x, test_y = _load_xy(test_file, mol_smi, pIC50)
  return (train_x, train_y, test_x, test_y)

In [4]:
def xgb_reg(train_x, train_y, test_x, test_y):
  """Grid-search an XGBoost regressor and summarise its performance.

  The grid covers n_estimators and max_depth with 5-fold CV, and the model
  refit on the best mean R2 is kept. That model is then re-evaluated with a
  5-fold cross_validate on the training data and scored on the external test set.

  Args:
    train_x, train_y: training features and targets.
    test_x, test_y: external-test features and targets.

  Returns:
    DataFrame with columns mae, mse, mape, r2 and rows
    'train' (mean over CV training folds), 'cv' (mean over CV validation
    folds) and 'ext' (external test set), rounded to 2 decimals.
  """
  xgb_param_dict = {'n_estimators':[10, 50, 100],'max_depth':[3, 6, 7, 8, 10]}
  # Insertion order of this dict is also the column order of the returned table.
  metric_fns = {'mae': mean_absolute_error,
                'mse': mean_squared_error,
                'mape': mean_absolute_percentage_error,
                'r2': r2_score}
  score_dict = {name: make_scorer(fn) for name, fn in metric_fns.items()}
  # verbosity=0 already silences XGBoost; the deprecated `silent` flag is no longer passed.
  # The estimator gets its own local name so it does not shadow this function.
  base_model = xgb.XGBRegressor(verbosity = 0, random_state=42)
  xgb_gs = GridSearchCV(estimator = base_model,param_grid = xgb_param_dict,scoring = score_dict,cv = 5,refit = 'r2',return_train_score = True)
  xgb_model = xgb_gs.fit(train_x, train_y).best_estimator_
  # cross validation of the selected configuration
  xgb_cv = cross_validate(xgb_model,train_x,train_y,cv = 5,n_jobs = 10,scoring = score_dict,return_train_score = True)
  # external test
  xgb_ext_pred = xgb_model.predict(test_x)

  xgb_perf = {name: [np.mean(xgb_cv['train_' + name]),
                     np.mean(xgb_cv['test_' + name]),
                     fn(test_y, xgb_ext_pred)]
              for name, fn in metric_fns.items()}

  xgb_perf_df = pd.DataFrame.from_dict(xgb_perf)
  xgb_perf_df.index = ['train','cv', 'ext']
  return round(xgb_perf_df, 2)


In [5]:
def svm_reg(train_x, train_y, test_x, test_y):
  """Grid-search a support-vector regressor and tabulate its scores.

  C, kernel and epsilon are searched with 10-fold CV, refitting on R2. The
  selected model is then cross-validated (5 folds) on the training data and
  applied to the external test set.

  Returns:
    DataFrame (columns mae, mse, mape, r2; rows train, cv, ext) rounded to
    2 decimals. 'train' and 'cv' are fold means from cross_validate, and 'ext'
    is computed on (test_x, test_y).
  """
  scorers = {
    'mse': make_scorer(mean_squared_error),
    'mae': make_scorer(mean_absolute_error),
    'mape': make_scorer(mean_absolute_percentage_error),
    'r2': make_scorer(r2_score),
  }
  grid = {
    'C': [1, 2, 3, 4, 5],
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'epsilon': [0.1, 0.5, 1.0],
  }
  search = GridSearchCV(
    estimator=svm.SVR(),
    param_grid=grid,
    scoring=scorers,
    n_jobs=10,
    cv=10,
    refit='r2',
    return_train_score=True,
  )
  search.fit(train_x, train_y)
  best_svr = search.best_estimator_

  # cross validation of the selected model
  folds = cross_validate(
    estimator=best_svr,
    X=train_x,
    y=train_y,
    scoring=scorers,
    cv=5,
    n_jobs=10,
    return_train_score=True,
  )

  # external test
  predicted = best_svr.predict(test_x)
  external = {
    'mae': mean_absolute_error(test_y, predicted),
    'mse': mean_squared_error(test_y, predicted),
    'mape': mean_absolute_percentage_error(test_y, predicted),
    'r2': r2_score(test_y, predicted),
  }

  columns = ['mae', 'mse', 'mape', 'r2']
  rows = [
    [np.mean(folds['train_' + metric]) for metric in columns],
    [np.mean(folds['test_' + metric]) for metric in columns],
    [external[metric] for metric in columns],
  ]
  summary = pd.DataFrame(rows, index=['train', 'cv', 'ext'], columns=columns)
  return summary.round(2)



In [6]:
def rf_reg(train_x, train_y, test_x, test_y):
  """Grid-search a random-forest regressor and summarise its performance.

  n_estimators, max_depth and max_features are searched with 5-fold CV,
  refitting on R2. The selected model is cross-validated (5 folds) on the
  training data and scored on the external test set.

  Args:
    train_x, train_y: training features and targets.
    test_x, test_y: external-test features and targets.

  Returns:
    DataFrame with columns mae, mse, mape, r2 and rows 'train', 'cv', 'ext',
    rounded to 2 decimals.
  """
  # max_features=1.0 (use every feature) is what "auto" meant for regressors;
  # the "auto" spelling was deprecated in scikit-learn 1.1 and later removed.
  rf_param_dict = {'n_estimators':[50, 70, 100, 150, 200],'max_depth':[10, 50, 100],'max_features': [1.0,"sqrt","log2"]}
  # Insertion order of this dict is also the column order of the returned table.
  metric_fns = {'mae': mean_absolute_error,
                'mse': mean_squared_error,
                'mape': mean_absolute_percentage_error,
                'r2': r2_score}
  score_dict = {name: make_scorer(fn) for name, fn in metric_fns.items()}
  # Seed the forest so repeated runs give the same table (xgb_reg also uses 42).
  base_model = RandomForestRegressor(random_state=42)
  rf_gs = GridSearchCV(estimator = base_model,param_grid = rf_param_dict,scoring = score_dict,n_jobs = 10,cv = 5, refit = 'r2',return_train_score = True)
  rf_best_model = rf_gs.fit(train_x, train_y).best_estimator_
  # cross validation of the selected configuration
  rf_best_cv = cross_validate(estimator = rf_best_model,X = train_x,y = train_y,scoring = score_dict,cv = 5,n_jobs = 10,return_train_score = True)
  # external test
  rf_ext_pred = rf_best_model.predict(test_x)

  rf_perf = {name: [np.mean(rf_best_cv['train_' + name]),
                    np.mean(rf_best_cv['test_' + name]),
                    fn(test_y, rf_ext_pred)]
             for name, fn in metric_fns.items()}

  rf_perf_df = pd.DataFrame.from_dict(rf_perf)
  rf_perf_df.index = ['train','cv', 'ext']
  return round(rf_perf_df, 2)




In [7]:
def knn_reg(train_x, train_y, test_x, test_y):
  """Grid-search a k-nearest-neighbours regressor and tabulate its scores.

  n_neighbors and weights are searched with 10-fold CV, refitting on R2. The
  chosen model is cross-validated (5 folds) on the training data and applied
  to the external test set.

  Returns:
    DataFrame (columns mae, mse, mape, r2; rows train, cv, ext) rounded to
    2 decimals.
  """
  scoring = dict(
    mse=make_scorer(mean_squared_error),
    mae=make_scorer(mean_absolute_error),
    mape=make_scorer(mean_absolute_percentage_error),
    r2=make_scorer(r2_score),
  )
  candidates = dict(n_neighbors=[5, 10, 15, 20], weights=['uniform', 'distance'])

  tuner = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=candidates,
    scoring=scoring,
    n_jobs=10,
    cv=10,
    refit='r2',
    return_train_score=True,
  )
  chosen = tuner.fit(train_x, train_y).best_estimator_

  # cross validation
  cv_out = cross_validate(
    estimator=chosen,
    X=train_x,
    y=train_y,
    scoring=scoring,
    cv=5,
    n_jobs=10,
    return_train_score=True,
  )

  def fold_mean(split, metric):
    # Average one metric over the folds of either the 'train' or 'test' split.
    return np.mean(cv_out[split + '_' + metric])

  # external test
  y_hat = chosen.predict(test_x)

  table = pd.DataFrame(
    {
      'mae': [fold_mean('train', 'mae'), fold_mean('test', 'mae'), mean_absolute_error(test_y, y_hat)],
      'mse': [fold_mean('train', 'mse'), fold_mean('test', 'mse'), mean_squared_error(test_y, y_hat)],
      'mape': [fold_mean('train', 'mape'), fold_mean('test', 'mape'), mean_absolute_percentage_error(test_y, y_hat)],
      'r2': [fold_mean('train', 'r2'), fold_mean('test', 'r2'), r2_score(test_y, y_hat)],
    },
    index=['train', 'cv', 'ext'],
  )
  return table.round(2)




# molnet_bace

In [None]:
# Featurise the random split of the MoleculeNet BACE data.
train_x, train_y, test_x, test_y = calculate_fps(
    train_file='molnet_bace1_train_RandomSplitter.csv',
    test_file='molnet_bace1_test_RandomSplitter.csv',
    mol_smi='mol',
    pIC50='pIC50',
)


In [None]:
# Fit and evaluate each of the four regressors on the current train/test split.
xgb_res = xgb_reg(train_x=train_x, train_y=train_y, test_x=test_x, test_y=test_y)
svm_res = svm_reg(train_x=train_x, train_y=train_y, test_x=test_x, test_y=test_y)
rf_res = rf_reg(train_x=train_x, train_y=train_y, test_x=test_x, test_y=test_y)
knn_res = knn_reg(train_x=train_x, train_y=train_y, test_x=test_x, test_y=test_y)


In [None]:
# Stack the four per-model tables; the outer index level names dataset, split and model.
# The XGBoost table is keyed '..._xgb' (it was previously mislabelled '..._rgb').
df_molnet_bace_random = pd.concat(
    [xgb_res, svm_res, rf_res, knn_res],
    keys=[
        'molnet_bace_random_xgb',
        'molnet_bace_random_svm',
        'molnet_bace_random_rf',
        'molnet_bace_random_knn',
    ],
)

In [None]:
# Run the four-model benchmark on the scaffold and Butina splits of MoleculeNet BACE.
# One loop replaces two copy-pasted pipelines; the XGBoost table is keyed '..._xgb'
# (it was previously mislabelled '..._rgb').
molnet_bace_by_split = {}
for split_name, splitter in [('scaffold', 'ScaffoldSplitter'), ('butina', 'ButinaSplitter')]:
    train_x, train_y, test_x, test_y = calculate_fps(
        'molnet_bace1_train_{}.csv'.format(splitter),
        'molnet_bace1_test_{}.csv'.format(splitter),
        'mol',
        'pIC50',
    )
    xgb_res = xgb_reg(train_x, train_y, test_x, test_y)
    svm_res = svm_reg(train_x, train_y, test_x, test_y)
    rf_res = rf_reg(train_x, train_y, test_x, test_y)
    knn_res = knn_reg(train_x, train_y, test_x, test_y)
    molnet_bace_by_split[split_name] = pd.concat(
        [xgb_res, svm_res, rf_res, knn_res],
        keys=['molnet_bace_{}_{}'.format(split_name, model) for model in ('xgb', 'svm', 'rf', 'knn')],
    )

# Keep the original variable names so the export cell can use them unchanged.
df_molnet_bace_scaffold = molnet_bace_by_split['scaffold']
df_molnet_bace_butina = molnet_bace_by_split['butina']



In [None]:
# Display the scaffold-split results table as the cell output.
df_molnet_bace_scaffold

In [None]:
# Write one worksheet per splitting strategy into a single workbook.
with pd.ExcelWriter(path='molnet_bace_sklearn_reg.xlsx') as writer:
    df_molnet_bace_random.to_excel(excel_writer=writer, sheet_name='molnet_bace_random')
    df_molnet_bace_scaffold.to_excel(excel_writer=writer, sheet_name='molnet_bace_scaffold')
    df_molnet_bace_butina.to_excel(excel_writer=writer, sheet_name='molnet_bace_butina')

# chembl_bace

In [9]:
# Benchmark the four regressors on the random split of the ChEMBL BACE1 data.
train_x, train_y, test_x, test_y = calculate_fps(
    'chembl_bace1_train_RandomSplitter.csv',
    'chembl_bace1_test_RandomSplitter.csv',
    'canonical_smiles',
    'pIC50',
)
xgb_res = xgb_reg(train_x, train_y, test_x, test_y)
svm_res = svm_reg(train_x, train_y, test_x, test_y)
rf_res = rf_reg(train_x, train_y, test_x, test_y)
knn_res = knn_reg(train_x, train_y, test_x, test_y)
# The XGBoost table is keyed '..._xgb' (it was previously mislabelled '..._rgb').
df_chembl_bace_random = pd.concat(
    [xgb_res, svm_res, rf_res, knn_res],
    keys=[
        'chembl_bace_random_xgb',
        'chembl_bace_random_svm',
        'chembl_bace_random_rf',
        'chembl_bace_random_knn',
    ],
)


In [10]:
# Save the random-split ChEMBL BACE1 results as a single-sheet workbook.
with pd.ExcelWriter(path='chembl_bace_sklearn_reg.xlsx') as writer:
    df_chembl_bace_random.to_excel(excel_writer=writer, sheet_name='chembl_bace_random')

In [None]:

# Run the four-model benchmark on the scaffold and Butina splits of ChEMBL BACE1.
# One loop replaces two copy-pasted pipelines; the XGBoost table is keyed '..._xgb'
# (it was previously mislabelled '..._rgb').
chembl_bace_by_split = {}
for split_name, splitter in [('scaffold', 'ScaffoldSplitter'), ('butina', 'ButinaSplitter')]:
    train_x, train_y, test_x, test_y = calculate_fps(
        'chembl_bace1_train_{}.csv'.format(splitter),
        'chembl_bace1_test_{}.csv'.format(splitter),
        'canonical_smiles',
        'pIC50',
    )
    xgb_res = xgb_reg(train_x, train_y, test_x, test_y)
    svm_res = svm_reg(train_x, train_y, test_x, test_y)
    rf_res = rf_reg(train_x, train_y, test_x, test_y)
    knn_res = knn_reg(train_x, train_y, test_x, test_y)
    chembl_bace_by_split[split_name] = pd.concat(
        [xgb_res, svm_res, rf_res, knn_res],
        keys=['chembl_bace_{}_{}'.format(split_name, model) for model in ('xgb', 'svm', 'rf', 'knn')],
    )

# Keep the original variable names so later cells can use them unchanged.
df_chembl_bace_scaffold = chembl_bace_by_split['scaffold']
df_chembl_bace_butina = chembl_bace_by_split['butina']


In [None]:
# Write the ChEMBL BACE1 results, one worksheet per splitting strategy.
# The ChEMBL frames are exported here; the previous version wrote the
# MoleculeNet (df_molnet_bace_*) frames under the ChEMBL sheet names.
with pd.ExcelWriter('chembl_bace_sklearn_reg.xlsx') as writer:
    df_chembl_bace_random.to_excel(writer, sheet_name='chembl_bace_random')
    df_chembl_bace_scaffold.to_excel(writer, sheet_name='chembl_bace_scaffold')
    df_chembl_bace_butina.to_excel(writer, sheet_name='chembl_bace_butina')

# chembl_cdk2

In [11]:
# Benchmark the four regressors on the random split of the ChEMBL CDK2 data.
train_x, train_y, test_x, test_y = calculate_fps(
    'chembl_cdk2_train_RandomSplitter.csv',
    'chembl_cdk2_test_RandomSplitter.csv',
    'canonical_smiles',
    'pIC50',
)
xgb_res = xgb_reg(train_x, train_y, test_x, test_y)
svm_res = svm_reg(train_x, train_y, test_x, test_y)
rf_res = rf_reg(train_x, train_y, test_x, test_y)
knn_res = knn_reg(train_x, train_y, test_x, test_y)
# The XGBoost table is keyed '..._xgb' (it was previously mislabelled '..._rgb').
df_chembl_cdk2_random = pd.concat(
    [xgb_res, svm_res, rf_res, knn_res],
    keys=[
        'chembl_cdk2_random_xgb',
        'chembl_cdk2_random_svm',
        'chembl_cdk2_random_rf',
        'chembl_cdk2_random_knn',
    ],
)


In [12]:
# Save the random-split ChEMBL CDK2 results as a single-sheet workbook.
with pd.ExcelWriter(path='chembl_cdk2_sklearn_reg.xlsx') as writer:
    df_chembl_cdk2_random.to_excel(excel_writer=writer, sheet_name='chembl_cdk2_random')

# chembl ACH

In [13]:
# Benchmark the four regressors on the random split of the ChEMBL ACH data.
train_x, train_y, test_x, test_y = calculate_fps(
    'chembl_ach_train_RandomSplitter.csv',
    'chembl_ach_test_RandomSplitter.csv',
    'canonical_smiles',
    'pIC50',
)
xgb_res = xgb_reg(train_x, train_y, test_x, test_y)
svm_res = svm_reg(train_x, train_y, test_x, test_y)
rf_res = rf_reg(train_x, train_y, test_x, test_y)
knn_res = knn_reg(train_x, train_y, test_x, test_y)
# The XGBoost table is keyed '..._xgb' (it was previously mislabelled '..._rgb').
df_chembl_ach_random = pd.concat(
    [xgb_res, svm_res, rf_res, knn_res],
    keys=[
        'chembl_ach_random_xgb',
        'chembl_ach_random_svm',
        'chembl_ach_random_rf',
        'chembl_ach_random_knn',
    ],
)


In [15]:
# Save the random-split ChEMBL ACH results as a single-sheet workbook.
with pd.ExcelWriter(path='chembl_ach_sklearn_reg.xlsx') as writer:
  df_chembl_ach_random.to_excel(excel_writer=writer, sheet_name='chembl_ach_random')