# Import the datasets

In [1]:
import pandas as pd
import numpy as np
import csv
import os
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
import optuna
from optuna.samplers import TPESampler, GridSampler, RandomSampler
import math
import timeit

In [2]:
os.getcwd()

'/Users/seanhong/Desktop/Curriculum/Merck Project/Code'

In [3]:
df_list = []
dmat_list = []
# load training datsets
for n in range(1, 16):
    x_name = 'QSAR_{}_train_x'.format(n)
    y_name = 'QSAR_{}_train_y'.format(n)
    x = pd.read_csv("Data/qsar_train/"+x_name+".csv")
    y = pd.read_csv("Data/qsar_train/"+y_name+".csv")
    df = pd.concat([y, x], axis=1)
    df_list.append(df)
    # DMatrix
    dmat = xgb.DMatrix(x, label=y)
    dmat_list.append(dmat)

In [4]:
num_datasets = len(df_list)
print("Number of datasets:", num_datasets)

Number of datasets: 15


In [5]:
size_list = []
for i in range(num_datasets):
    size_list.append(df_list[i].shape)
size_list

[(37241, 9178),
 (8716, 5556),
 (6148, 5026),
 (1815, 4187),
 (3212, 5752),
 (37388, 8624),
 (1569, 4373),
 (9965, 5593),
 (5351, 4602),
 (11151, 5463),
 (6399, 4732),
 (8651, 4992),
 (6105, 5526),
 (4165, 5713),
 (5059, 5283)]

# Random Forest (default)

In [40]:
# Define the parameter search space
dval = {
    'n_estimators': 100, # "num.trees": 500 / "n_estimators": 100
    'criterion': 'mse',
    'bootstrap': True, # replace
    'max_samples': None, # sample.fraction
    'max_features': 'sqrt', ### "mtry": "sqrt" / "max_features": "auto" ###
    'min_samples_leaf': 1, # "min.node.size" default: 5 / "min_samples_leaf": 1
    'n_jobs': -1}
# Fit the model to each dataset
def_result = pd.DataFrame(columns=['rf_rmse'])
mlist = []
mod = RandomForestRegressor(**dval)
for i in range(num_datasets):
    num_col = df_list[i].shape[1]
    yvec = df_list[i].iloc[:,0:1].values.ravel()
    rmse = cross_val_score(mod, df_list[i].iloc[:,1:num_col], yvec, scoring="neg_root_mean_squared_error", 
                           cv=KFold(n_splits=3, shuffle=True, random_state=123), n_jobs=-1)
    mlist.append(abs(rmse).mean())
def_result['rf_rmse'] = mlist

In [41]:
def_result

Unnamed: 0,rf_rmse
0,0.390391
1,0.678829
2,0.72105
3,0.349083
4,0.909799
5,0.642994
6,21.283284
7,0.664494
8,0.611572
9,0.733696


In [42]:
def_result["rf_rmse"].mean()

1.9651716576727236

In [110]:
# min_samples_leaf = 0.1
start = timeit.default_timer()
dval = {
    'n_estimators': 100, # "num.trees": 500 / "n_estimators": 100
    'criterion': 'mse',
    'bootstrap': True, # replace
    'max_samples': None, # sample.fraction
    'max_features': 'sqrt', ### "mtry": "sqrt" / "max_features": "auto" ###
    'min_samples_leaf': 0.1, # "min.node.size" default: 5 / "min_samples_leaf": 1
    'n_jobs': -1}
# Fit the model to each dataset
mlist = []
mod = RandomForestRegressor(**dval)
for i in range(num_datasets):
    num_col = df_list[i].shape[1]
    yvec = df_list[i].iloc[:,0:1].values.ravel()
    rmse = cross_val_score(mod, df_list[i].iloc[:,1:num_col], yvec, scoring="neg_root_mean_squared_error", 
                           cv=KFold(n_splits=3, shuffle=True, random_state=123), n_jobs=-1)
    mlist.append(abs(rmse).mean())
sum(mlist)/len(mlist)
stop = timeit.default_timer()
print("Mean of RMSE with min_samples_leaf = 0.1: ", sum(mlist)/len(mlist), "\n")
print("Running Time: ", stop - start)

Mean of RMSE with min_samples_leaf = 0.1:  2.9241151756005825 

Running Time:  103.33977243001573


# XGBoost (default)

In [54]:
# Define the parameter search space
dval = {
    "booster": "gbtree",
    "lambda": 1,
    "alpha": 0,
    "eta": 0.3,
    "subsample": 1,
    "max_depth": 6,
    "min_child_weight": 1,
    "colsample_bytree": 1,
    "colsample_bylevel": 1
}
n_boost = 10
# Fit the model to each dataset
mlist = []
for i in range(num_datasets):
    bst = xgb.cv(dval, dmat_list[i], num_boost_round=n_boost, nfold=3, metrics='rmse', seed=123, shuffle=True)
    mlist.append(bst.iloc[len(bst.index)-1, 2])
def_result["xgb_rmse"] = mlist

In [55]:
def_result

Unnamed: 0,rf_rmse,xgb_rmse
0,0.390391,0.445107
1,0.678829,0.73333
2,0.72105,0.759668
3,0.349083,0.414424
4,0.909799,0.969474
5,0.642994,0.697357
6,21.283284,22.101774
7,0.664494,0.743745
8,0.611572,0.678416
9,0.733696,0.86624


In [56]:
def_result['xgb_rmse'].mean()

2.073823422222222

In [57]:
def_result.to_csv("qsar_results/def_result.csv", index=False)

# Tune One Hyperparameter

## - Random Forest

### 1. n_estimators (number of trees)

In [58]:
param_list = [1, *range(10,101,10), *range(100,1001,100)]
def objective(trial):
    mlist = []
    param = {
        "n_estimators": trial.suggest_categorical("n_estimators", param_list),
        'criterion': 'mse',
        'bootstrap': True, # replace
        'max_samples': None, # sample.fraction
        'max_features': 'sqrt', ### "mtry": "sqrt" / "max_features": "auto" ###
        'min_samples_leaf': 1, # "min.node.size" default: 5 / "min_samples_leaf": 1
        'n_jobs': -1
    }
    # fit rf to each dataset    
    for i in range(num_datasets):
        ncol = df_list[i].shape[1]
        yvec = df_list[i].iloc[:,0:1].values.ravel()
        mod = RandomForestRegressor(**param)
        rmse = cross_val_score(mod, df_list[i].iloc[:,1:ncol], yvec, 
                               scoring="neg_root_mean_squared_error", 
                               cv=KFold(n_splits=3, shuffle=True, random_state=123),
                               n_jobs=-1)
        mlist.append(abs(rmse).mean())
    rmse_mean = sum(mlist)/len(mlist)   
    return rmse_mean
# define grid points
search_space = {'n_estimators': param_list}
print("Number of points: {}".format(len(param_list)))

Number of points: 21


In [59]:
study = optuna.create_study(direction='minimize', sampler=GridSampler(search_space))
study.optimize(objective, n_trials=len(param_list))
res1 = study.trials_dataframe()

[I 2020-08-11 18:51:20,282] Finished trial#0 with value: 1.9503143724498593 with parameters: {'n_estimators': 800}. Best is trial#0 with value: 1.9503143724498593.
[I 2020-08-11 19:03:04,039] Finished trial#1 with value: 1.9547921185272101 with parameters: {'n_estimators': 500}. Best is trial#0 with value: 1.9503143724498593.
[I 2020-08-11 19:05:41,197] Finished trial#2 with value: 1.9759510402587002 with parameters: {'n_estimators': 60}. Best is trial#0 with value: 1.9503143724498593.
[I 2020-08-11 19:27:42,071] Finished trial#3 with value: 1.9545900552183393 with parameters: {'n_estimators': 1000}. Best is trial#0 with value: 1.9503143724498593.
[I 2020-08-11 19:33:18,504] Finished trial#4 with value: 1.9557694073180076 with parameters: {'n_estimators': 200}. Best is trial#0 with value: 1.9503143724498593.
[I 2020-08-11 19:41:01,334] Finished trial#5 with value: 1.9534099268653788 with parameters: {'n_estimators': 300}. Best is trial#0 with value: 1.9503143724498593.
[I 2020-08-11 19

In [60]:
res1

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_n_estimators,system_attrs_grid_id,system_attrs_search_space,state
0,0,1.950314,2020-08-11 18:33:20.494682,2020-08-11 18:51:20.281746,00:17:59.787064,800,18,"{'n_estimators': [1, 10, 20, 30, 40, 50, 60, 7...",COMPLETE
1,1,1.954792,2020-08-11 18:51:20.284709,2020-08-11 19:03:04.037341,00:11:43.752632,500,15,"{'n_estimators': [1, 10, 20, 30, 40, 50, 60, 7...",COMPLETE
2,2,1.975951,2020-08-11 19:03:04.044936,2020-08-11 19:05:41.196917,00:02:37.151981,60,6,"{'n_estimators': [1, 10, 20, 30, 40, 50, 60, 7...",COMPLETE
3,3,1.95459,2020-08-11 19:05:41.202983,2020-08-11 19:27:42.070493,00:22:00.867510,1000,20,"{'n_estimators': [1, 10, 20, 30, 40, 50, 60, 7...",COMPLETE
4,4,1.955769,2020-08-11 19:27:42.074589,2020-08-11 19:33:18.504107,00:05:36.429518,200,12,"{'n_estimators': [1, 10, 20, 30, 40, 50, 60, 7...",COMPLETE
5,5,1.95341,2020-08-11 19:33:18.509981,2020-08-11 19:41:01.333420,00:07:42.823439,300,13,"{'n_estimators': [1, 10, 20, 30, 40, 50, 60, 7...",COMPLETE
6,6,1.954252,2020-08-11 19:41:01.338681,2020-08-11 19:50:35.290079,00:09:33.951398,400,14,"{'n_estimators': [1, 10, 20, 30, 40, 50, 60, 7...",COMPLETE
7,7,1.975986,2020-08-11 19:50:35.295438,2020-08-11 19:52:56.887697,00:02:21.592259,40,4,"{'n_estimators': [1, 10, 20, 30, 40, 50, 60, 7...",COMPLETE
8,8,2.076617,2020-08-11 19:52:56.895917,2020-08-11 19:54:35.158264,00:01:38.262347,10,1,"{'n_estimators': [1, 10, 20, 30, 40, 50, 60, 7...",COMPLETE
9,9,1.969064,2020-08-11 19:54:35.164077,2020-08-11 19:57:52.816557,00:03:17.652480,80,8,"{'n_estimators': [1, 10, 20, 30, 40, 50, 60, 7...",COMPLETE


In [61]:
idx_min = res1["value"].idxmin
res1.iloc[idx_min,:]

number                                                                       0
value                                                                  1.95031
datetime_start                                      2020-08-11 18:33:20.494682
datetime_complete                                   2020-08-11 18:51:20.281746
duration                                                0 days 00:17:59.787064
params_n_estimators                                                        800
system_attrs_grid_id                                                        18
system_attrs_search_space    {'n_estimators': [1, 10, 20, 30, 40, 50, 60, 7...
state                                                                 COMPLETE
Name: 0, dtype: object

In [62]:
# Save the result
res1.to_csv("qsar_results/rf_n_estimators.csv", index=False)

### 2. bootstrap

In [63]:
param_list = [True, False]
def objective(trial):
    mlist = []
    param = {
        "n_estimators": 100,
        'criterion': 'mse',
        'bootstrap': trial.suggest_categorical('bootstrap', param_list), # replace
        'max_features': 'sqrt', ### "mtry": "sqrt" / "max_features": "auto" ###
        'min_samples_leaf': 1, # "min.node.size" default: 5 / "min_samples_leaf": 1
        'n_jobs': -1
    }
    if param["bootstrap"] == True:
        param["max_samples"] = None
    # fit rf to each dataset    
    for i in range(num_datasets):
        ncol = df_list[i].shape[1]
        yvec = df_list[i].iloc[:,0:1].values.ravel()
        mod = RandomForestRegressor(**param)
        rmse = cross_val_score(mod, df_list[i].iloc[:,1:ncol], yvec, 
                               scoring="neg_root_mean_squared_error", 
                               cv=KFold(n_splits=3, shuffle=True, random_state=123),
                               n_jobs=-1)
        mlist.append(abs(rmse).mean())
    rmse_mean = sum(mlist)/len(mlist)   
    return rmse_mean
# calculate grid points
search_space = {'bootstrap': param_list}
print("Number of points: {}".format(len(param_list)))

Number of points: 2


In [64]:
study = optuna.create_study(direction='minimize', sampler=GridSampler(search_space))
study.optimize(objective, n_trials=len(param_list))
res2 = study.trials_dataframe()

[I 2020-08-11 21:12:05,198] Finished trial#0 with value: 1.9694209049027698 with parameters: {'bootstrap': True}. Best is trial#0 with value: 1.9694209049027698.
[I 2020-08-11 21:16:56,459] Finished trial#1 with value: 1.918351706415024 with parameters: {'bootstrap': False}. Best is trial#1 with value: 1.918351706415024.


In [65]:
# Save the result
res2.to_csv("qsar_results/rf_bootstrap.csv", index=False)

### 3. max_samples

In [66]:
param_list = [i/10 for i in range(1,10)]
param_list.append(0.99)
def objective(trial):
    mlist = []
    param = {
        "n_estimators": 100,
        'criterion': 'mse',
        'bootstrap': True, # replace
        'max_samples': trial.suggest_categorical("max_samples", param_list),
        'max_features': "sqrt", ### "mtry": "sqrt" --better / "max_features": "auto" ###
        'min_samples_leaf': 1, # "min.node.size" default: 5 / "min_samples_leaf": 1
        'n_jobs': -1
    }
    # fit rf to each dataset    
    for i in range(num_datasets):
        ncol = df_list[i].shape[1]
        yvec = df_list[i].iloc[:,0:1].values.ravel()
        mod = RandomForestRegressor(**param)
        rmse = cross_val_score(mod, df_list[i].iloc[:,1:ncol], yvec, 
                               scoring="neg_root_mean_squared_error",
                               cv=KFold(n_splits=3, shuffle=True, random_state=123),
                               n_jobs=-1)
        mlist.append(abs(rmse).mean())
    rmse_mean = sum(mlist)/len(mlist)   
    return rmse_mean
# define grid points
search_space = {'max_samples': param_list}
print("Number of points: {}".format(len(param_list)))

Number of points: 10


In [67]:
study = optuna.create_study(direction='minimize', sampler=GridSampler(search_space))
study.optimize(objective, n_trials=len(param_list))
res3 = study.trials_dataframe()

[I 2020-08-11 21:19:10,226] Finished trial#0 with value: 2.1120198174300224 with parameters: {'max_samples': 0.3}. Best is trial#0 with value: 2.1120198174300224.
[I 2020-08-11 21:21:48,464] Finished trial#1 with value: 2.0401366045348284 with parameters: {'max_samples': 0.5}. Best is trial#1 with value: 2.0401366045348284.
[I 2020-08-11 21:24:15,582] Finished trial#2 with value: 2.0740466315445754 with parameters: {'max_samples': 0.4}. Best is trial#1 with value: 2.0401366045348284.
[I 2020-08-11 21:26:20,495] Finished trial#3 with value: 2.1938561724148244 with parameters: {'max_samples': 0.2}. Best is trial#1 with value: 2.0401366045348284.
[I 2020-08-11 21:29:21,832] Finished trial#4 with value: 2.0203023447184956 with parameters: {'max_samples': 0.6}. Best is trial#4 with value: 2.0203023447184956.
[I 2020-08-11 21:32:51,773] Finished trial#5 with value: 1.965659094132609 with parameters: {'max_samples': 0.99}. Best is trial#5 with value: 1.965659094132609.
[I 2020-08-11 21:35:53,

In [68]:
# Save the result
res3.to_csv("qsar_results/rf_max_samples.csv", index=False)

### 4. max_features

In [104]:
### This is very time-consuming!!!!!!!!!!!!!
param_list = ["sqrt", "auto", "log2"]
def objective(trial):
    mlist = []
    param = {
        "n_estimators": 100,
        'criterion': 'mse',
        'bootstrap': True, # replace
        'max_samples': None,
        'max_features': trial.suggest_categorical('max_features', ["sqrt", "auto", "log2"]), ### "mtry": "sqrt" --better / "max_features": "auto" ###
        'min_samples_leaf': 1, # "min.node.size" default: 5 / "min_samples_leaf": 1
        'n_jobs': -1
    }
    # fit rf to each dataset    
    for i in range(num_datasets):
        ncol = df_list[i].shape[1]
        yvec = df_list[i].iloc[:,0:1].values.ravel()
        mod = RandomForestRegressor(**param)
        rmse = cross_val_score(mod, df_list[i].iloc[:,1:ncol], yvec, 
                               scoring="neg_root_mean_squared_error",
                               cv=KFold(n_splits=3, shuffle=True, random_state=123),
                               n_jobs=-1)
        mlist.append(abs(rmse).mean())
    rmse_mean = sum(mlist)/len(mlist)   
    return rmse_mean
# define grid points
search_space = {'max_features': param_list}
print("Number of points: {}".format(len(param_list)))

Number of points: 4


In [105]:
study = optuna.create_study(direction='minimize', sampler=GridSampler(search_space))
study.optimize(objective, n_trials=len(param_list))
res4 = study.trials_dataframe()



TypeError: '<' not supported between instances of 'float' and 'str'

In [71]:
# Save the result
res4.to_csv("qsar_results/rf_max_features.csv", index=False)

In [107]:
### This is very time-consuming!!!!!!!!!!!!!
param_list = [i/10 for i in range(1,10)]
def objective(trial):
    mlist = []
    param = {
        "n_estimators": 100,
        'criterion': 'mse',
        'bootstrap': True,
        'max_samples': None,
        'max_features': trial.suggest_discrete_uniform('max_features', 0.1, 0.9, 0.1),
        'min_samples_leaf': 1,
        'n_jobs': -1
    }
    # fit rf to each dataset    
    for i in range(num_datasets):
        ncol = df_list[i].shape[1]
        yvec = df_list[i].iloc[:,0:1].values.ravel()
        mod = RandomForestRegressor(**param)
        rmse = cross_val_score(mod, df_list[i].iloc[:,1:ncol], yvec, 
                               scoring="neg_root_mean_squared_error",
                               cv=KFold(n_splits=3, shuffle=True, random_state=123),
                               n_jobs=-1)
        mlist.append(abs(rmse).mean())
    rmse_mean = sum(mlist)/len(mlist)   
    return rmse_mean
# define grid points
search_space = {'max_features': param_list}
print("Number of points: {}".format(len(param_list)))

Number of points: 9


In [108]:
study = optuna.create_study(direction='minimize', sampler=GridSampler(search_space))
study.optimize(objective, n_trials=len(param_list))
res4 = study.trials_dataframe()

[I 2020-08-21 19:41:18,309] Finished trial#0 with value: 1.9313815606895195 with parameters: {'max_features': 0.4}. Best is trial#0 with value: 1.9313815606895195.
[I 2020-08-21 20:11:17,838] Finished trial#1 with value: 1.9288357582547493 with parameters: {'max_features': 0.5}. Best is trial#1 with value: 1.9288357582547493.
[I 2020-08-21 20:55:14,528] Finished trial#2 with value: 1.941190273991497 with parameters: {'max_features': 0.8}. Best is trial#1 with value: 1.9288357582547493.
[I 2020-08-21 21:15:17,556] Finished trial#3 with value: 1.9305536825453473 with parameters: {'max_features': 0.3}. Best is trial#1 with value: 1.9288357582547493.
[I 2020-08-21 21:25:23,690] Finished trial#4 with value: 1.9329404304290554 with parameters: {'max_features': 0.1}. Best is trial#1 with value: 1.9288357582547493.
[I 2020-08-21 21:40:42,162] Finished trial#5 with value: 1.9368850931587898 with parameters: {'max_features': 0.2}. Best is trial#1 with value: 1.9288357582547493.
[I 2020-08-21 22:

In [109]:
# Save the result
res4.to_csv("qsar_results/rf_max_features123.csv", index=False)

### 5. min_samples_leaf

In [72]:
param_list = [i/10 for i in range(1, 6)]
param_list.append(1)
def objective(trial):
    mlist = []
    param = {
        "n_estimators": 100,
        'criterion': 'mse',
        'bootstrap': True, # replace
        'max_samples': None,
        'max_features': "sqrt", ### "mtry": "sqrt" --better / "max_features": "auto" ###
        'min_samples_leaf': trial.suggest_categorical("min_samples_leaf", param_list), # "min.node.size" default: 5 / "min_samples_leaf": 1
        'n_jobs': -1
    }
    # fit rf to each dataset    
    for i in range(num_datasets):
        ncol = df_list[i].shape[1]
        yvec = df_list[i].iloc[:,0:1].values.ravel()
        mod = RandomForestRegressor(**param)
        rmse = cross_val_score(mod, df_list[i].iloc[:,1:ncol], yvec, 
                               scoring="neg_root_mean_squared_error",
                               cv=KFold(n_splits=3, shuffle=True, random_state=123),
                               n_jobs=-1)
        mlist.append(abs(rmse).mean())
    rmse_mean = sum(mlist)/len(mlist)   
    return rmse_mean
# define grid points
search_space = {'min_samples_leaf': param_list}
print("Number of points: {}".format(len(param_list)))

Number of points: 6


In [73]:
study = optuna.create_study(direction='minimize', sampler=GridSampler(search_space))
study.optimize(objective, n_trials=len(param_list))
res5 = study.trials_dataframe()

[I 2020-08-11 22:46:26,331] Finished trial#0 with value: 1.9635276150742966 with parameters: {'min_samples_leaf': 1}. Best is trial#0 with value: 1.9635276150742966.
[I 2020-08-11 22:48:07,565] Finished trial#1 with value: 3.1798516707891533 with parameters: {'min_samples_leaf': 0.2}. Best is trial#0 with value: 1.9635276150742966.
[I 2020-08-11 22:49:46,008] Finished trial#2 with value: 2.929823568260731 with parameters: {'min_samples_leaf': 0.1}. Best is trial#0 with value: 1.9635276150742966.
[I 2020-08-11 22:51:27,397] Finished trial#3 with value: 3.3842635665485243 with parameters: {'min_samples_leaf': 0.4}. Best is trial#0 with value: 1.9635276150742966.
[I 2020-08-11 22:53:14,798] Finished trial#4 with value: 3.2902037163262454 with parameters: {'min_samples_leaf': 0.3}. Best is trial#0 with value: 1.9635276150742966.
[I 2020-08-11 22:54:45,742] Finished trial#5 with value: 3.3840314069266477 with parameters: {'min_samples_leaf': 0.5}. Best is trial#0 with value: 1.9635276150742

In [74]:
# Save the result
res5.to_csv("qsar_results/rf_min_samples_leaf.csv", index=False)

## - XGBoost

### 1. num_boost_round

In [75]:
param_list = [1, *range(10,101,10), *range(100,1001,100)]
def objective(trial):
    mlist = []
    param = {
        "booster": "gbtree",
        "lambda": 1,
        "alpha": 0,
        "eta": 0.3,
        "subsample": 1,
        "max_depth": 6,
        "min_child_weight": 1,
        "colsample_bytree": 1,
        "colsample_bylevel": 1
    }
    n_boost = trial.suggest_categorical("num_boost_round", param_list)
    # run xgboost.cv for each dataset
    for i in range(num_datasets):
        bst = xgb.cv(param, dmat_list[i], num_boost_round=n_boost, nfold=3, metrics='rmse', seed=123, shuffle=True)
        mlist.append(bst.iloc[len(bst.index)-1, 2])
    rmse_mean = sum(mlist)/len(mlist)
    return rmse_mean
search_space = {"num_boost_round": param_list}
print("Number of points: {}".format(len(param_list)))

Number of points: 21


In [76]:
study = optuna.create_study(direction='minimize', sampler=GridSampler(search_space))
study.optimize(objective, n_trials=len(param_list))
res1 = study.trials_dataframe()

[I 2020-08-12 01:41:23,206] Finished trial#0 with value: 5.957756755555555 with parameters: {'num_boost_round': 1}. Best is trial#0 with value: 5.957756755555555.
[I 2020-08-12 02:35:03,132] Finished trial#1 with value: 1.9467973111111114 with parameters: {'num_boost_round': 80}. Best is trial#1 with value: 1.9467973111111114.


KeyboardInterrupt: 

In [None]:
res1.to_csv("qsar_results/xgb_num_boost_round.csv", index=False)

### 2. lambda

In [17]:
param_list = [2**(-9) * 2 ** i for i in range(len(range(-9,11,1)))]
param_list.append(0)
def objective(trial):
    mlist = []
    param = {
        "booster": "gbtree",
        "lambda": trial.suggest_categorical("lambda", param_list),
        "alpha": 0,
        "eta": 0.3,
        "subsample": 1,
        "max_depth": 6,
        "min_child_weight": 1,
        "colsample_bytree": 1,
        "colsample_bylevel": 1
    }
    n_boost = 10
    # run xgboost.cv for each dataset
    for i in range(num_datasets):
        bst = xgb.cv(param, dmat_list[i], num_boost_round=n_boost, nfold=3, metrics='rmse', seed=123, shuffle=True)
        mlist.append(bst.iloc[len(bst.index)-1, 2])
    rmse_mean = sum(mlist)/len(mlist)
    return rmse_mean
search_space = {"lambda": param_list}
print("Number of points: {}".format(len(param_list)))

Number of points: 21


In [18]:
study = optuna.create_study(direction='minimize', sampler=GridSampler(search_space))
study.optimize(objective, n_trials=len(param_list))
res2 = study.trials_dataframe()

[I 2020-08-11 01:58:25,865] Finished trial#0 with value: 2.099082177777778 with parameters: {'lambda': 4.0}. Best is trial#0 with value: 2.099082177777778.
[I 2020-08-11 02:04:57,173] Finished trial#1 with value: 2.0886802666666666 with parameters: {'lambda': 0.00390625}. Best is trial#1 with value: 2.0886802666666666.
[I 2020-08-11 02:11:31,294] Finished trial#2 with value: 2.0693317555555555 with parameters: {'lambda': 0.25}. Best is trial#2 with value: 2.0693317555555555.
[I 2020-08-11 02:17:44,991] Finished trial#3 with value: 2.7531260222222214 with parameters: {'lambda': 512.0}. Best is trial#2 with value: 2.0693317555555555.
[I 2020-08-11 02:24:01,918] Finished trial#4 with value: 2.190316088888889 with parameters: {'lambda': 32.0}. Best is trial#2 with value: 2.0693317555555555.
[I 2020-08-11 02:30:32,419] Finished trial#5 with value: 2.0684098 with parameters: {'lambda': 0.125}. Best is trial#5 with value: 2.0684098.
[I 2020-08-11 02:36:54,598] Finished trial#6 with value: 2.1

In [19]:
res2.to_csv("qsar_results/xgb_lambda.csv", index=False)

### 3. alpha

In [20]:
param_list = [2**(-9) * 2 ** i for i in range(len(range(-9,11,1)))]
param_list.append(0)
def objective(trial):
    mlist = []
    param = {
        "booster": "gbtree",
        "lambda": 1,
        "alpha": trial.suggest_categorical("alpha", param_list),
        "eta": 0.3,
        "subsample": 1,
        "max_depth": 6,
        "min_child_weight": 1,
        "colsample_bytree": 1,
        "colsample_bylevel": 1
    }
    n_boost = 10
    # run xgboost.cv for each dataset
    for i in range(num_datasets):
        bst = xgb.cv(param, dmat_list[i], num_boost_round=n_boost, nfold=3, metrics='rmse', seed=123, shuffle=True)
        mlist.append(bst.iloc[len(bst.index)-1, 2])
    rmse_mean = sum(mlist)/len(mlist)
    return rmse_mean
search_space = {"alpha": param_list}
print("Number of points: {}".format(len(param_list)))

Number of points: 21


In [21]:
study = optuna.create_study(direction='minimize', sampler=GridSampler(search_space))
study.optimize(objective, n_trials=len(param_list))
res3 = study.trials_dataframe()

[I 2020-08-11 04:11:17,194] Finished trial#0 with value: 2.178944466666666 with parameters: {'alpha': 64.0}. Best is trial#0 with value: 2.178944466666666.
[I 2020-08-11 04:17:31,111] Finished trial#1 with value: 2.0648716 with parameters: {'alpha': 0.5}. Best is trial#1 with value: 2.0648716.
[I 2020-08-11 04:23:21,542] Finished trial#2 with value: 2.0969006 with parameters: {'alpha': 8.0}. Best is trial#1 with value: 2.0648716.
[I 2020-08-11 04:29:18,814] Finished trial#3 with value: 2.235954977777778 with parameters: {'alpha': 128.0}. Best is trial#1 with value: 2.0648716.
[I 2020-08-11 04:35:00,374] Finished trial#4 with value: 2.074257 with parameters: {'alpha': 0.001953125}. Best is trial#1 with value: 2.0648716.
[I 2020-08-11 04:40:56,961] Finished trial#5 with value: 2.075487088888889 with parameters: {'alpha': 0.03125}. Best is trial#1 with value: 2.0648716.
[I 2020-08-11 04:46:44,781] Finished trial#6 with value: 2.400961555555555 with parameters: {'alpha': 512.0}. Best is tr

In [22]:
res3.to_csv("qsar_results/xgb_alpha.csv", index=False)

### 4. eta

In [23]:
param_list = [i/10 for i in range(0,11)]
def objective(trial):
    mlist = []
    param = {
        "booster": "gbtree",
        "lambda": 1,
        "alpha": 0,
        "eta": trial.suggest_categorical("eta", param_list),
        "subsample": 1,
        "max_depth": 6,
        "min_child_weight": 1,
        "colsample_bytree": 1,
        "colsample_bylevel": 1
    }
    n_boost = 10
    # run xgboost.cv for each dataset
    for i in range(num_datasets):
        bst = xgb.cv(param, dmat_list[i], num_boost_round=n_boost, nfold=3, metrics='rmse', seed=123, shuffle=True)
        mlist.append(bst.iloc[len(bst.index)-1, 2])
    rmse_mean = sum(mlist)/len(mlist)
    return rmse_mean
search_space = {"eta": param_list}
print("Number of points: {}".format(len(param_list)))

Number of points: 11


In [24]:
study = optuna.create_study(direction='minimize', sampler=GridSampler(search_space))
study.optimize(objective, n_trials=len(param_list))
res4 = study.trials_dataframe()

[I 2020-08-11 06:17:41,239] Finished trial#0 with value: 2.191839355555555 with parameters: {'eta': 0.7}. Best is trial#0 with value: 2.191839355555555.
[I 2020-08-11 06:23:33,656] Finished trial#1 with value: 3.609728866666667 with parameters: {'eta': 0.1}. Best is trial#0 with value: 2.191839355555555.
[I 2020-08-11 06:29:30,338] Finished trial#2 with value: 2.0866513111111114 with parameters: {'eta': 0.6}. Best is trial#2 with value: 2.0866513111111114.
[I 2020-08-11 06:35:28,826] Finished trial#3 with value: 2.416866711111111 with parameters: {'eta': 1.0}. Best is trial#2 with value: 2.0866513111111114.
[I 2020-08-11 06:41:23,002] Finished trial#4 with value: 8.066236022222222 with parameters: {'eta': 0.0}. Best is trial#2 with value: 2.0866513111111114.
[I 2020-08-11 06:48:16,338] Finished trial#5 with value: 2.3170490222222226 with parameters: {'eta': 0.9}. Best is trial#2 with value: 2.0866513111111114.
[I 2020-08-11 06:54:17,484] Finished trial#6 with value: 2.261756288888889 w

In [25]:
res4.to_csv("qsar_results/xgb_eta.csv", index=False)

### 5. subsample

In [26]:
param_list = [i/10 for i in range(1,11)]
def objective(trial):
    mlist = []
    param = {
        "booster": "gbtree",
        "lambda": 1,
        "alpha": 0,
        "eta": 0.3,
        "subsample": trial.suggest_categorical("subsample", param_list),
        "max_depth": 6,
        "min_child_weight": 1,
        "colsample_bytree": 1,
        "colsample_bylevel": 1
    }
    n_boost = 10
    # run xgboost.cv for each dataset
    for i in range(num_datasets):
        bst = xgb.cv(param, dmat_list[i], num_boost_round=n_boost, nfold=3, metrics='rmse', seed=123, shuffle=True)
        mlist.append(bst.iloc[len(bst.index)-1, 2])
    rmse_mean = sum(mlist)/len(mlist)
    return rmse_mean
search_space = {"subsample": param_list}
print("Number of points: {}".format(len(param_list)))

Number of points: 10


In [27]:
study = optuna.create_study(direction='minimize', sampler=GridSampler(search_space))
study.optimize(objective, n_trials=len(param_list))
res5 = study.trials_dataframe()

[I 2020-08-11 07:24:13,184] Finished trial#0 with value: 2.1579280444444446 with parameters: {'subsample': 0.6}. Best is trial#0 with value: 2.1579280444444446.
[I 2020-08-11 07:30:33,642] Finished trial#1 with value: 2.073823422222222 with parameters: {'subsample': 1.0}. Best is trial#1 with value: 2.073823422222222.
[I 2020-08-11 07:36:28,379] Finished trial#2 with value: 2.1112895333333332 with parameters: {'subsample': 0.7}. Best is trial#1 with value: 2.073823422222222.
[I 2020-08-11 07:41:17,180] Finished trial#3 with value: 2.634213933333333 with parameters: {'subsample': 0.1}. Best is trial#1 with value: 2.073823422222222.
[I 2020-08-11 07:46:37,768] Finished trial#4 with value: 2.324989577777778 with parameters: {'subsample': 0.3}. Best is trial#1 with value: 2.073823422222222.
[I 2020-08-11 07:52:39,343] Finished trial#5 with value: 2.091571288888889 with parameters: {'subsample': 0.9}. Best is trial#1 with value: 2.073823422222222.
[I 2020-08-11 07:58:19,480] Finished trial#

In [28]:
res5.to_csv("qsar_results/xgb_subsample.csv", index=False)

### 6. max_depth

In [29]:
param_list = [*range(1,16)]
def objective(trial):
    mlist = []
    param = {
        "booster": "gbtree",
        "lambda": 1,
        "alpha": 0,
        "eta": 0.3,
        "subsample": 1,
        "max_depth": trial.suggest_categorical("max_depth", param_list),
        "min_child_weight": 1,
        "colsample_bytree": 1,
        "colsample_bylevel": 1
    }
    n_boost = 10
    # run xgboost.cv for each dataset
    for i in range(num_datasets):
        bst = xgb.cv(param, dmat_list[i], num_boost_round=n_boost, nfold=3, metrics='rmse', seed=123, shuffle=True)
        mlist.append(bst.iloc[len(bst.index)-1, 2])
    rmse_mean = sum(mlist)/len(mlist)
    return rmse_mean
search_space = {"max_depth": param_list}
print("Number of points: {}".format(len(param_list)))

Number of points: 15


In [30]:
study = optuna.create_study(direction='minimize', sampler=GridSampler(search_space))
study.optimize(objective, n_trials=len(param_list))
res6 = study.trials_dataframe()

[I 2020-08-11 08:23:22,406] Finished trial#0 with value: 2.074392155555556 with parameters: {'max_depth': 11}. Best is trial#0 with value: 2.074392155555556.
[I 2020-08-11 08:30:06,595] Finished trial#1 with value: 2.073823422222222 with parameters: {'max_depth': 6}. Best is trial#1 with value: 2.073823422222222.
[I 2020-08-11 08:37:34,053] Finished trial#2 with value: 2.0570772444444447 with parameters: {'max_depth': 10}. Best is trial#2 with value: 2.0570772444444447.
[I 2020-08-11 08:41:25,356] Finished trial#3 with value: 2.289188977777778 with parameters: {'max_depth': 3}. Best is trial#2 with value: 2.0570772444444447.
[I 2020-08-11 08:48:04,216] Finished trial#4 with value: 2.0582831555555554 with parameters: {'max_depth': 8}. Best is trial#2 with value: 2.0570772444444447.
[I 2020-08-11 08:53:48,607] Finished trial#5 with value: 2.050471711111111 with parameters: {'max_depth': 7}. Best is trial#5 with value: 2.050471711111111.
[I 2020-08-11 09:03:03,432] Finished trial#6 with v

In [31]:
res6.to_csv("qsar_results/xgb_max_depth.csv", index=False)

### 7. min_child_weight

In [32]:
param_list = [2 ** i for i in range(0,8,1)]
def objective(trial):
    mlist = []
    param = {
        "booster": "gbtree",
        "lambda": 1,
        "alpha": 0,
        "eta": 0.3,
        "subsample": 1,
        "max_depth": 6,
        "min_child_weight": trial.suggest_categorical("min_child_weight", param_list),
        "colsample_bytree": 1,
        "colsample_bylevel": 1
    }
    n_boost = 10
    # run xgboost.cv for each dataset
    for i in range(num_datasets):
        bst = xgb.cv(param, dmat_list[i], num_boost_round=n_boost, nfold=3, metrics='rmse', seed=123, shuffle=True)
        mlist.append(bst.iloc[len(bst.index)-1, 2])
    rmse_mean = sum(mlist)/len(mlist)
    return rmse_mean
search_space = {"min_child_weight": param_list}
print("Number of points: {}".format(len(param_list)))

Number of points: 8


In [33]:
study = optuna.create_study(direction='minimize', sampler=GridSampler(search_space))
study.optimize(objective, n_trials=len(param_list))
res7 = study.trials_dataframe()

[I 2020-08-11 10:00:02,728] Finished trial#0 with value: 2.167804688888889 with parameters: {'min_child_weight': 64}. Best is trial#0 with value: 2.167804688888889.
[I 2020-08-11 10:06:25,709] Finished trial#1 with value: 2.0599597111111114 with parameters: {'min_child_weight': 4}. Best is trial#1 with value: 2.0599597111111114.
[I 2020-08-11 10:12:19,553] Finished trial#2 with value: 2.0824315777777778 with parameters: {'min_child_weight': 16}. Best is trial#1 with value: 2.0599597111111114.
[I 2020-08-11 10:18:19,596] Finished trial#3 with value: 2.073823422222222 with parameters: {'min_child_weight': 1}. Best is trial#1 with value: 2.0599597111111114.
[I 2020-08-11 10:24:13,568] Finished trial#4 with value: 2.0727983111111112 with parameters: {'min_child_weight': 2}. Best is trial#1 with value: 2.0599597111111114.
[I 2020-08-11 10:30:58,542] Finished trial#5 with value: 2.0855710666666667 with parameters: {'min_child_weight': 8}. Best is trial#1 with value: 2.0599597111111114.
[I 20

In [34]:
res7.to_csv("qsar_results/xgb_min_child_weight.csv", index=False)

### 8. colsample_bytree

In [6]:
param_list = [i/10 for i in range(1,11)]
def objective(trial):
    mlist = []
    param = {
        "booster": "gbtree",
        "lambda": 1,
        "alpha": 0,
        "eta": 0.3,
        "subsample": 1,
        "max_depth": 6,
        "min_child_weight": 1,
        "colsample_bytree": trial.suggest_categorical("colsample_bytree", param_list),
        "colsample_bylevel": 1
    }
    n_boost = 10
    # run xgboost.cv for each dataset
    for i in range(num_datasets):
        bst = xgb.cv(param, dmat_list[i], num_boost_round=n_boost, nfold=3, metrics='rmse', seed=123, shuffle=True)
        mlist.append(bst.iloc[len(bst.index)-1, 2])
    rmse_mean = sum(mlist)/len(mlist)
    return rmse_mean
search_space = {"colsample_bytree": param_list}
print("Number of points: {}".format(len(param_list)))

Number of points: 10


In [7]:
study = optuna.create_study(direction='minimize', sampler=GridSampler(search_space))
study.optimize(objective, n_trials=len(param_list))
res8 = study.trials_dataframe()

[I 2020-08-11 12:58:26,171] Finished trial#0 with value: 2.096077688888889 with parameters: {'colsample_bytree': 0.5}. Best is trial#0 with value: 2.096077688888889.
[I 2020-08-11 13:04:00,380] Finished trial#1 with value: 2.118594355555556 with parameters: {'colsample_bytree': 0.7}. Best is trial#0 with value: 2.096077688888889.
[I 2020-08-11 13:09:46,001] Finished trial#2 with value: 2.0585763333333333 with parameters: {'colsample_bytree': 0.8}. Best is trial#2 with value: 2.0585763333333333.
[I 2020-08-11 13:12:28,948] Finished trial#3 with value: 2.126945777777778 with parameters: {'colsample_bytree': 0.3}. Best is trial#2 with value: 2.0585763333333333.
[I 2020-08-11 13:16:43,787] Finished trial#4 with value: 2.101339888888889 with parameters: {'colsample_bytree': 0.6}. Best is trial#2 with value: 2.0585763333333333.
[I 2020-08-11 13:23:33,371] Finished trial#5 with value: 2.073823422222222 with parameters: {'colsample_bytree': 1.0}. Best is trial#2 with value: 2.0585763333333333.

In [8]:
res8.to_csv("qsar_results/xgb_colsample_bytree.csv", index=False)

### 9. colsample_bylevel

In [9]:
param_list = [i/10 for i in range(1,11)]
def objective(trial):
    mlist = []
    param = {
        "booster": "gbtree",
        "lambda": 1,
        "alpha": 0,
        "eta": 0.3,
        "subsample": 1,
        "max_depth": 6,
        "min_child_weight": 1,
        "colsample_bytree": 1,
        "colsample_bylevel": trial.suggest_categorical("colsample_bylevel", param_list)
    }
    n_boost = 10
    # run xgboost.cv for each dataset
    for i in range(num_datasets):
        bst = xgb.cv(param, dmat_list[i], num_boost_round=n_boost, nfold=3, metrics='rmse', seed=123, shuffle=True)
        mlist.append(bst.iloc[len(bst.index)-1, 2])
    rmse_mean = sum(mlist)/len(mlist)
    return rmse_mean
search_space = {"colsample_bylevel": param_list}
print("Number of points: {}".format(len(param_list)))

Number of points: 10


In [10]:
study = optuna.create_study(direction='minimize', sampler=GridSampler(search_space))
study.optimize(objective, n_trials=len(param_list))
res9 = study.trials_dataframe()

[I 2020-08-11 13:45:08,411] Finished trial#0 with value: 2.076728977777778 with parameters: {'colsample_bylevel': 0.9}. Best is trial#0 with value: 2.076728977777778.
[I 2020-08-11 13:49:48,625] Finished trial#1 with value: 2.113456466666667 with parameters: {'colsample_bylevel': 0.3}. Best is trial#0 with value: 2.076728977777778.
[I 2020-08-11 13:55:56,508] Finished trial#2 with value: 2.0778051333333334 with parameters: {'colsample_bylevel': 0.6}. Best is trial#0 with value: 2.076728977777778.
[I 2020-08-11 14:01:48,200] Finished trial#3 with value: 2.0773363333333332 with parameters: {'colsample_bylevel': 0.5}. Best is trial#0 with value: 2.076728977777778.
[I 2020-08-11 14:04:28,137] Finished trial#4 with value: 2.162110377777778 with parameters: {'colsample_bylevel': 0.1}. Best is trial#0 with value: 2.076728977777778.
[I 2020-08-11 14:08:29,229] Finished trial#5 with value: 2.0989231777777775 with parameters: {'colsample_bylevel': 0.2}. Best is trial#0 with value: 2.076728977777

In [11]:
res9.to_csv("qsar_results/xgb_colsample_bylevel.csv", index=False)

# Grid Search

## GS - Random Forest

In [97]:
nest = [10, 100, 300, 500, 700, 1000]
boot = [True, False]
maxsam = [0.1, 0.5, 0.99]
maxfeat = ["auto", "sqrt"]
minsam = [0.1, 0.3, 0.5]
def objective(trial):
    mlist = []
    param = {
        "n_estimators": trial.suggest_categorical("n_estimators", nest),
        "bootstrap": trial.suggest_categorical("bootstrap", boot),
        "max_features": trial.suggest_categorical("max_features", maxfeat),
        "min_samples_leaf": trial.suggest_categorical("min_samples_leaf", minsam),
        "n_jobs": -1}    
    if param["bootstrap"] == True:
        param["max_samples"] = trial.suggest_categorical("max_samples", maxsam)
    # fit rf to each dataset    
    for i in range(num_datasets):
        ncol = df_list[i].shape[1]
        yvec = df_list[i].iloc[:,0:1].values.ravel()
        mod = RandomForestRegressor(**param)
        rmse = cross_val_score(mod, df_list[i].iloc[:,1:ncol], yvec, 
                               scoring="neg_root_mean_squared_error", 
                               cv=KFold(n_splits=3, shuffle=True, random_state=123), 
                               n_jobs=-1)
        mlist.append(abs(rmse).mean())
    rmse_mean = sum(mlist)/len(mlist)   
    return rmse_mean
# Count how many points in the search space
search_space = {
    "n_estimators": nest,
    "bootstrap": boot,
    "max_features": maxfeat,
    "min_samples_leaf": minsam,
    "max_samples": maxsam
}
# calculate number of points in the search space
num_points= len(nest) * len(maxfeat) * len(minsam) * (1 + len(maxsam))
print("Number of points in the search space: {}".format(num_points))

Number of points in the search space: 144


In [98]:
study = optuna.create_study(direction="minimize", sampler=GridSampler(search_space))
study.optimize(objective, n_trials=num_points)
rf_gs_result = study.trials_dataframe()

[I 2020-08-19 02:12:12,959] Finished trial#0 with value: 3.3023958628410326 with parameters: {'n_estimators': 700, 'bootstrap': True, 'max_features': 'sqrt', 'min_samples_leaf': 0.3, 'max_samples': 0.99}. Best is trial#0 with value: 3.3023958628410326.
[I 2020-08-19 02:14:00,734] Finished trial#1 with value: 2.752100567190573 with parameters: {'n_estimators': 300, 'bootstrap': False, 'max_features': 'sqrt', 'min_samples_leaf': 0.1}. Best is trial#1 with value: 2.752100567190573.
[I 2020-08-19 02:15:45,743] Finished trial#2 with value: 3.383802726825465 with parameters: {'n_estimators': 700, 'bootstrap': False, 'max_features': 'sqrt', 'min_samples_leaf': 0.5}. Best is trial#1 with value: 2.752100567190573.
[I 2020-08-19 02:18:41,383] Finished trial#3 with value: 2.902746391328847 with parameters: {'n_estimators': 100, 'bootstrap': True, 'max_features': 'auto', 'min_samples_leaf': 0.1, 'max_samples': 0.5}. Best is trial#1 with value: 2.752100567190573.
[I 2020-08-19 02:20:33,144] Finishe

[I 2020-08-19 08:28:11,042] Finished trial#65 with value: 2.628981106866613 with parameters: {'n_estimators': 500, 'bootstrap': False, 'max_features': 'auto', 'min_samples_leaf': 0.1}. Best is trial#56 with value: 2.5751457430620905.
[I 2020-08-19 08:29:52,914] Finished trial#66 with value: 3.165463774956901 with parameters: {'n_estimators': 300, 'bootstrap': False, 'max_features': 'sqrt', 'min_samples_leaf': 0.3}. Best is trial#56 with value: 2.5751457430620905.
[I 2020-08-19 08:34:19,725] Finished trial#67 with value: 3.0629105400765315 with parameters: {'n_estimators': 300, 'bootstrap': True, 'max_features': 'auto', 'min_samples_leaf': 0.3, 'max_samples': 0.99}. Best is trial#56 with value: 2.5751457430620905.
[I 2020-08-19 08:37:37,473] Finished trial#68 with value: 3.375290098769382 with parameters: {'n_estimators': 300, 'bootstrap': False, 'max_features': 'auto', 'min_samples_leaf': 0.5}. Best is trial#56 with value: 2.5751457430620905.
[I 2020-08-19 09:13:52,186] Finished trial#

[I 2020-08-19 13:36:48,533] Finished trial#131 with value: 3.3848792162815835 with parameters: {'n_estimators': 10, 'bootstrap': True, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'max_samples': 0.1}. Best is trial#56 with value: 2.5751457430620905.
[I 2020-08-19 13:38:26,038] Finished trial#132 with value: 2.7861778322132187 with parameters: {'n_estimators': 10, 'bootstrap': False, 'max_features': 'sqrt', 'min_samples_leaf': 0.1}. Best is trial#56 with value: 2.5751457430620905.
[I 2020-08-19 13:40:02,458] Finished trial#133 with value: 3.3752900987693826 with parameters: {'n_estimators': 10, 'bootstrap': False, 'max_features': 'auto', 'min_samples_leaf': 0.5}. Best is trial#56 with value: 2.5751457430620905.
[I 2020-08-19 13:48:40,612] Finished trial#134 with value: 3.061614526077297 with parameters: {'n_estimators': 700, 'bootstrap': True, 'max_features': 'auto', 'min_samples_leaf': 0.3, 'max_samples': 0.99}. Best is trial#56 with value: 2.5751457430620905.
[I 2020-08-19 13:50:2

In [99]:
rf_gs_result.to_csv("qsar_results/rf_gs_result.csv", index=False)

## GS - XGBoost

In [101]:
eta_list = [i/10 for i in range(1,10)]
maxdep_list = [6, 10, 15]
min_list = [1, 16, 64, 128]

def objective(trial):
    mlist = []
    param = {
        "booster": "gbtree",
        "lambda": 1,
        "alpha": 0,
        "eta": trial.suggest_discrete_uniform("eta", 0.1, 0.9, 0.1),
        "subsample": 1,
        "max_depth": trial.suggest_categorical("max_depth", maxdep_list),
        "min_child_weight": trial.suggest_categorical("min_child_weight", min_list),
        "colsample_bytree": 1,
        "colsample_bylevel": 1
    }
    # run xgboost.cv for each dataset
    for i in range(num_datasets):
        bst = xgb.cv(param, dmat_list[i], num_boost_round=10, nfold=3, metrics='rmse', seed=123, shuffle=True)
        mlist.append(bst.iloc[len(bst.index)-1, 2])
    rmse_mean = sum(mlist)/len(mlist)
    return rmse_mean
# determine points in the search space
search_space = {
    "eta": eta_list,
    "max_depth": maxdep_list,
    "min_child_weight": min_list
}
# calculate number of points in the search space
num_points = 1
keys = search_space.keys()
for k in keys:
    num_points = num_points * len(search_space[k])
print("Number of points in the search space: {}".format(num_points))

Number of points in the search space: 108


In [102]:
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.GridSampler(search_space))
study.optimize(objective, n_trials=num_points)
xgb_gs_result = study.trials_dataframe()

[I 2020-08-19 20:47:00,078] Finished trial#0 with value: 2.2764940000000005 with parameters: {'eta': 0.9, 'max_depth': 6, 'min_child_weight': 64}. Best is trial#0 with value: 2.2764940000000005.
[I 2020-08-19 20:57:36,955] Finished trial#1 with value: 2.1286220444444446 with parameters: {'eta': 0.6, 'max_depth': 15, 'min_child_weight': 16}. Best is trial#1 with value: 2.1286220444444446.
[I 2020-08-19 21:06:30,987] Finished trial#2 with value: 3.760398644444444 with parameters: {'eta': 0.1, 'max_depth': 15, 'min_child_weight': 128}. Best is trial#1 with value: 2.1286220444444446.
[I 2020-08-19 21:17:06,043] Finished trial#3 with value: 2.283761288888889 with parameters: {'eta': 0.2, 'max_depth': 15, 'min_child_weight': 1}. Best is trial#1 with value: 2.1286220444444446.
[I 2020-08-19 21:23:44,911] Finished trial#4 with value: 2.167804688888889 with parameters: {'eta': 0.3, 'max_depth': 6, 'min_child_weight': 64}. Best is trial#1 with value: 2.1286220444444446.
[I 2020-08-19 22:05:27,74

[I 2020-08-20 07:43:13,110] Finished trial#82 with value: 2.223945644444444 with parameters: {'eta': 0.6, 'max_depth': 10, 'min_child_weight': 1}. Best is trial#62 with value: 2.0367932.
[I 2020-08-20 07:50:21,210] Finished trial#83 with value: 3.5870043333333332 with parameters: {'eta': 0.1, 'max_depth': 10, 'min_child_weight': 16}. Best is trial#62 with value: 2.0367932.
[I 2020-08-20 07:59:20,306] Finished trial#84 with value: 2.0939470666666664 with parameters: {'eta': 0.5, 'max_depth': 15, 'min_child_weight': 16}. Best is trial#62 with value: 2.0367932.
[I 2020-08-20 08:05:15,393] Finished trial#85 with value: 2.3231407777777777 with parameters: {'eta': 0.9, 'max_depth': 6, 'min_child_weight': 16}. Best is trial#62 with value: 2.0367932.
[I 2020-08-20 08:14:48,288] Finished trial#86 with value: 2.2853036888888885 with parameters: {'eta': 0.9, 'max_depth': 15, 'min_child_weight': 128}. Best is trial#62 with value: 2.0367932.
[I 2020-08-20 08:23:51,615] Finished trial#87 with value:

In [103]:
xgb_gs_result.to_csv("qsar_results/xgb_gs_result.csv", index=False)

# Bayesian Optimization

## BO - Random Forest

In [33]:
nest = [10, *range(100,1001,100)]
boot = [True, False]
maxsam = [i/10 for i in range(1,10)]
maxsam.append(0.99)
maxfeat = [i/10 for i in range(1,10)]
maxfeat.append("auto")
maxfeat.append("sqrt")
minsam = [0.1, 0.2, 0.3, 0.4, 0.5]
def objective(trial):
    mlist = []
    param = {
        "n_estimators": trial.suggest_categorical("n_estimators", nest),
        "bootstrap": trial.suggest_categorical("bootstrap", boot),
        "max_features": trial.suggest_categorical("max_features", maxfeat),
        "min_samples_leaf": trial.suggest_categorical("min_samples_leaf", minsam),
        "n_jobs": -1}    
    if param["bootstrap"] == True:
        param["max_samples"] = trial.suggest_categorical("max_samples", maxsam)
    # fit rf to each dataset    
    for i in range(num_datasets):
        ncol = df_list[i].shape[1]
        yvec = df_list[i].iloc[:,0:1].values.ravel()
        mod = RandomForestRegressor(**param)
        rmse = cross_val_score(mod, df_list[i].iloc[:,1:ncol], yvec, 
                               scoring="neg_root_mean_squared_error", 
                               cv=KFold(n_splits=3, shuffle=True, random_state=123), 
                               n_jobs=-1)
        mlist.append(abs(rmse).mean())
    rmse_mean = sum(mlist)/len(mlist)   
    return rmse_mean
# count how many point in the grid
search_space = {
    "n_estimators": nest,
    "bootstrap": boot,
    "max_features": maxfeat,
    "min_samples_leaf": minsam,
    "max_samples": maxsam
}
# calculate number of points in the search space
num_points= len(nest) * len(maxfeat) * len(minsam) * (1 + len(maxsam))
print("Number of points in the search space: {}".format(num_points))

Number of points in the search space: 6655


In [34]:
study = optuna.create_study(direction='minimize', sampler=TPESampler())
study.optimize(objective, n_trials=50)
rf_bo_result = study.trials_dataframe()

[I 2020-08-14 18:25:09,665] Finished trial#0 with value: 3.384145183257039 with parameters: {'n_estimators': 500, 'bootstrap': True, 'max_features': 0.6, 'min_samples_leaf': 0.5, 'max_samples': 0.9}. Best is trial#0 with value: 3.384145183257039.
[I 2020-08-14 18:27:12,417] Finished trial#1 with value: 3.0673866154558795 with parameters: {'n_estimators': 300, 'bootstrap': True, 'max_features': 0.2, 'min_samples_leaf': 0.2, 'max_samples': 0.6}. Best is trial#1 with value: 3.0673866154558795.
[I 2020-08-14 18:29:32,694] Finished trial#2 with value: 2.7846095932787756 with parameters: {'n_estimators': 100, 'bootstrap': False, 'max_features': 0.2, 'min_samples_leaf': 0.2}. Best is trial#2 with value: 2.7846095932787756.
[I 2020-08-14 18:31:22,004] Finished trial#3 with value: 3.38439419855127 with parameters: {'n_estimators': 500, 'bootstrap': True, 'max_features': 'auto', 'min_samples_leaf': 0.2, 'max_samples': 0.3}. Best is trial#2 with value: 2.7846095932787756.
[I 2020-08-14 18:33:07,7

In [35]:
rf_bo_result.to_csv("qsar_results/rf_bo_result.csv", index=False)

## BO - XGBoost

In [63]:
greek_list = [2**(-9) * 2 ** i for i in range(len(range(-9,11,1)))]
greek_list.append(0)
eta_list = [i/10 for i in range(1, 11)]
subsam_list = [i/10 for i in range(1,11)]
maxdep_list = [*range(1,16)]
min_list = [2 ** i for i in range(0,8,1)]
colsam_list = [i/10 for i in range(1,11)]
nboost_list = [1, *range(10,110,10)]

def objective(trial):
    mlist = []
    param = {
        "booster": "gbtree",
        "lambda": trial.suggest_categorical("lambda", greek_list),
        "alpha": trial.suggest_categorical("alpha", greek_list),
        "eta": trial.suggest_discrete_uniform("eta", 0.1, 1, 0.1),
        "subsample": trial.suggest_discrete_uniform("subsample", 0.1, 1, 0.1),
        "max_depth": trial.suggest_int("max_depth", 1, 15),
        "min_child_weight": trial.suggest_categorical("min_child_weight", min_list),
        "colsample_bytree": trial.suggest_discrete_uniform("colsample_bytree", 0.1, 1, 0.1),
        "colsample_bylevel": trial.suggest_discrete_uniform('colsample_bylevel', 0.1, 1, 0.1)
    }
    n_boost = trial.suggest_categorical("num_boost_round", nboost_list)
    # run xgboost.cv for each dataset
    for i in range(num_datasets):
        bst = xgb.cv(param, dmat_list[i], num_boost_round=n_boost, nfold=3, metrics='rmse', seed=123, shuffle=True)
        mlist.append(bst.iloc[len(bst.index)-1, 2])
    rmse_mean = sum(mlist)/len(mlist)
    return rmse_mean
# determine points in the search space
search_space = {
    "lambda": greek_list,
    "alpha": greek_list,
    "eta": eta_list,
    "subsample": subsam_list,
    "max_depth": maxdep_list,
    "min_child_weight": min_list,
    "colsample_bytree": colsam_list,
    "colsample_bylevel": colsam_list,
    "num_boost_round": nboost_list
}
# calculate number of points in the search space
num_points = 1
keys = search_space.keys()
for k in keys:
    num_points = num_points * len(search_space[k])
print("Number of points in the search space: {}".format(num_points))

Number of points in the search space: 5821200000


In [64]:
study = optuna.create_study(direction='minimize', sampler=TPESampler())
study.optimize(objective, n_trials=50)
xgb_bo_result = study.trials_dataframe()

[I 2020-08-17 00:33:36,348] Finished trial#0 with value: 3.2217637777777783 with parameters: {'lambda': 0.0625, 'alpha': 0.25, 'eta': 0.8, 'subsample': 0.4, 'max_depth': 9, 'min_child_weight': 4, 'colsample_bytree': 0.9, 'colsample_bylevel': 0.8, 'num_boost_round': 1}. Best is trial#0 with value: 3.2217637777777783.
[I 2020-08-17 00:38:55,304] Finished trial#1 with value: 2.490325777777777 with parameters: {'lambda': 512.0, 'alpha': 32.0, 'eta': 0.4, 'subsample': 0.2, 'max_depth': 2, 'min_child_weight': 32, 'colsample_bytree': 0.5, 'colsample_bylevel': 0.2, 'num_boost_round': 80}. Best is trial#1 with value: 2.490325777777777.
[I 2020-08-17 00:43:59,750] Finished trial#2 with value: 2.322697888888889 with parameters: {'lambda': 4.0, 'alpha': 0.001953125, 'eta': 0.8, 'subsample': 0.7000000000000001, 'max_depth': 5, 'min_child_weight': 16, 'colsample_bytree': 0.30000000000000004, 'colsample_bylevel': 0.5, 'num_boost_round': 40}. Best is trial#2 with value: 2.322697888888889.
[I 2020-08-1

[I 2020-08-17 23:00:13,204] Finished trial#48 with value: 1.9633064222222218 with parameters: {'lambda': 16.0, 'alpha': 0.25, 'eta': 0.2, 'subsample': 0.6, 'max_depth': 13, 'min_child_weight': 32, 'colsample_bytree': 0.7000000000000001, 'colsample_bylevel': 1.0, 'num_boost_round': 100}. Best is trial#38 with value: 1.9090083555555555.
[I 2020-08-18 00:00:21,059] Finished trial#49 with value: 1.9779249333333333 with parameters: {'lambda': 1.0, 'alpha': 0.25, 'eta': 0.2, 'subsample': 0.6, 'max_depth': 13, 'min_child_weight': 32, 'colsample_bytree': 0.7000000000000001, 'colsample_bylevel': 1.0, 'num_boost_round': 100}. Best is trial#38 with value: 1.9090083555555555.


In [65]:
xgb_bo_result.to_csv("qsar_results/xgb_bo_result3.csv", index=False)

# Random Search

## RS - Ramdon Forest

In [36]:
nest = [10, *range(100,1001,100)]
boot = [True, False]
maxsam = [i/10 for i in range(1,10)]
maxsam.append(0.99)
maxfeat = [i/10 for i in range(1,10)]
maxfeat.append("auto")
maxfeat.append("sqrt")
minsam = [0.1, 0.2, 0.3, 0.4, 0.5]
def objective(trial):
    mlist = []
    param = {
        "n_estimators": trial.suggest_categorical("n_estimators", nest),
        "bootstrap": trial.suggest_categorical("bootstrap", boot),
        "max_features": trial.suggest_categorical("max_features", maxfeat),
        "min_samples_leaf": trial.suggest_categorical("min_samples_leaf", minsam),
        "n_jobs": -1}    
    if param["bootstrap"] == True:
        param["max_samples"] = trial.suggest_categorical("max_samples", maxsam)
    # fit rf to each dataset    
    for i in range(num_datasets):
        ncol = df_list[i].shape[1]
        yvec = df_list[i].iloc[:,0:1].values.ravel()
        mod = RandomForestRegressor(**param)
        rmse = cross_val_score(mod, df_list[i].iloc[:,1:ncol], yvec, 
                               scoring="neg_root_mean_squared_error", 
                               cv=KFold(n_splits=3, shuffle=True, random_state=123), 
                               n_jobs=-1)
        mlist.append(abs(rmse).mean())
    rmse_mean = sum(mlist)/len(mlist)   
    return rmse_mean
# count how many point in the grid
search_space = {
    "n_estimators": nest,
    "bootstrap": boot,
    "max_features": maxfeat,
    "min_samples_leaf": minsam,
    "max_samples": maxsam
}
# calculate number of points in the search space
num_points= len(nest) * len(maxfeat) * len(minsam) * (1 + len(maxsam))
print("Number of points in the search space: {}".format(num_points))

Number of points in the search space: 6655


In [37]:
study = optuna.create_study(direction='minimize', sampler=RandomSampler(seed=123))
study.optimize(objective, n_trials=100)
rf_rs_result = study.trials_dataframe()

[I 2020-08-15 03:18:03,852] Finished trial#0 with value: 3.384412111477476 with parameters: {'n_estimators': 200, 'bootstrap': True, 'max_features': 0.3, 'min_samples_leaf': 0.2, 'max_samples': 0.4}. Best is trial#0 with value: 3.384412111477476.
[I 2020-08-15 03:49:53,627] Finished trial#1 with value: 2.951359544609858 with parameters: {'n_estimators': 1000, 'bootstrap': False, 'max_features': 'auto', 'min_samples_leaf': 0.2}. Best is trial#1 with value: 2.951359544609858.
[I 2020-08-15 03:51:53,451] Finished trial#2 with value: 2.6289811068666133 with parameters: {'n_estimators': 10, 'bootstrap': False, 'max_features': 'auto', 'min_samples_leaf': 0.1}. Best is trial#2 with value: 2.6289811068666133.
[I 2020-08-15 03:53:47,280] Finished trial#3 with value: 3.1124542257459136 with parameters: {'n_estimators': 10, 'bootstrap': False, 'max_features': 'auto', 'min_samples_leaf': 0.4}. Best is trial#2 with value: 2.6289811068666133.
[I 2020-08-15 03:55:40,840] Finished trial#4 with value: 

[I 2020-08-15 09:26:09,553] Finished trial#68 with value: 3.0825463605826666 with parameters: {'n_estimators': 100, 'bootstrap': False, 'max_features': 0.8, 'min_samples_leaf': 0.4}. Best is trial#62 with value: 2.4958330223611167.
[I 2020-08-15 09:29:49,992] Finished trial#69 with value: 3.050076852572498 with parameters: {'n_estimators': 600, 'bootstrap': True, 'max_features': 0.4, 'min_samples_leaf': 0.2, 'max_samples': 0.6}. Best is trial#62 with value: 2.4958330223611167.
[I 2020-08-15 09:42:50,243] Finished trial#70 with value: 2.767073847036059 with parameters: {'n_estimators': 900, 'bootstrap': False, 'max_features': 0.4, 'min_samples_leaf': 0.2}. Best is trial#62 with value: 2.4958330223611167.
[I 2020-08-15 09:45:50,292] Finished trial#71 with value: 3.038611074532905 with parameters: {'n_estimators': 100, 'bootstrap': False, 'max_features': 0.7, 'min_samples_leaf': 0.3}. Best is trial#62 with value: 2.4958330223611167.
[I 2020-08-15 09:48:01,435] Finished trial#72 with value

In [38]:
rf_rs_result.to_csv("qsar_results/rf_rs_result.csv", index=False)

## RS - XGBoost

In [66]:
greek_list = [2**(-9) * 2 ** i for i in range(len(range(-9,11,1)))]
greek_list.append(0)
eta_list = [i/10 for i in range(1, 11)]
subsam_list = [i/10 for i in range(1,11)]
maxdep_list = [*range(1,16)]
min_list = [2 ** i for i in range(0,8,1)]
colsam_list = [i/10 for i in range(1,11)]
nboost_list = [1, *range(10,110,10)]

def objective(trial):
    mlist = []
    param = {
        "booster": "gbtree",
        "lambda": trial.suggest_categorical("lambda", greek_list),
        "alpha": trial.suggest_categorical("alpha", greek_list),
        "eta": trial.suggest_discrete_uniform("eta", 0.1, 1, 0.1),
        "subsample": trial.suggest_discrete_uniform("subsample", 0.1, 1, 0.1),
        "max_depth": trial.suggest_int("max_depth", 1, 15),
        "min_child_weight": trial.suggest_categorical("min_child_weight", min_list),
        "colsample_bytree": trial.suggest_discrete_uniform("colsample_bytree", 0.1, 1, 0.1),
        "colsample_bylevel": trial.suggest_discrete_uniform('colsample_bylevel', 0.1, 1, 0.1)
    }
    n_boost = trial.suggest_categorical("num_boost_round", nboost_list)
    # run xgboost.cv for each dataset
    for i in range(num_datasets):
        bst = xgb.cv(param, dmat_list[i], num_boost_round=n_boost, nfold=3, metrics='rmse', seed=123, shuffle=True)
        mlist.append(bst.iloc[len(bst.index)-1, 2])
    rmse_mean = sum(mlist)/len(mlist)
    return rmse_mean
# determine points in the search space
search_space = {
    "lambda": greek_list,
    "alpha": greek_list,
    "eta": eta_list,
    "subsample": subsam_list,
    "max_depth": maxdep_list,
    "min_child_weight": min_list,
    "colsample_bytree": colsam_list,
    "colsample_bylevel": colsam_list,
    "num_boost_round": nboost_list
}
# calculate number of points in the search space
num_points = 1
keys = search_space.keys()
for k in keys:
    num_points = num_points * len(search_space[k])
print("Number of points in the search space: {}".format(num_points))

Number of points in the search space: 5821200000


In [67]:
study = optuna.create_study(direction='minimize', sampler=RandomSampler(seed=123))
study.optimize(objective, n_trials=50)
xgb_rs_result = study.trials_dataframe()

[I 2020-08-18 02:25:55,300] Finished trial#0 with value: 6.158900000000001 with parameters: {'lambda': 16.0, 'alpha': 0.0078125, 'eta': 0.30000000000000004, 'subsample': 0.6, 'max_depth': 4, 'min_child_weight': 4, 'colsample_bytree': 0.5, 'colsample_bylevel': 1.0, 'num_boost_round': 1}. Best is trial#0 with value: 6.158900000000001.
[I 2020-08-18 02:27:13,791] Finished trial#1 with value: 6.190845622222222 with parameters: {'lambda': 256.0, 'alpha': 64.0, 'eta': 0.4, 'subsample': 0.4, 'max_depth': 10, 'min_child_weight': 8, 'colsample_bytree': 0.30000000000000004, 'colsample_bylevel': 0.7000000000000001, 'num_boost_round': 1}. Best is trial#0 with value: 6.158900000000001.
[I 2020-08-18 02:28:31,332] Finished trial#2 with value: 6.8879734444444445 with parameters: {'lambda': 0.03125, 'alpha': 256.0, 'eta': 0.2, 'subsample': 0.2, 'max_depth': 8, 'min_child_weight': 4, 'colsample_bytree': 0.6, 'colsample_bylevel': 0.7000000000000001, 'num_boost_round': 1}. Best is trial#0 with value: 6.1

[I 2020-08-18 21:08:57,304] Finished trial#48 with value: 2.103173111111111 with parameters: {'lambda': 0.25, 'alpha': 0.03125, 'eta': 0.4, 'subsample': 0.9, 'max_depth': 14, 'min_child_weight': 4, 'colsample_bytree': 0.5, 'colsample_bylevel': 0.6, 'num_boost_round': 40}. Best is trial#22 with value: 1.9517619111111109.
[I 2020-08-18 21:16:17,990] Finished trial#49 with value: 2.072056622222222 with parameters: {'lambda': 256.0, 'alpha': 0.00390625, 'eta': 0.2, 'subsample': 0.4, 'max_depth': 13, 'min_child_weight': 4, 'colsample_bytree': 0.1, 'colsample_bylevel': 0.7000000000000001, 'num_boost_round': 100}. Best is trial#22 with value: 1.9517619111111109.


In [68]:
xgb_rs_result.to_csv("qsar_results/xgb_rs_result.csv", index=False)

# Test

In [13]:
# Define the parameter search space
start = timeit.default_timer()
dval = {
    'n_estimators': 500, # "num.trees": 500 / "n_estimators": 100
    'criterion': 'mse',
    'bootstrap': True, # replace
    'max_samples': None, # sample.fraction
    'max_features': 'auto', ### "mtry": "sqrt" / "max_features": "auto" ###
    'min_samples_leaf': 0.1, # "min.node.size" default: 5 / "min_samples_leaf": 1
    'n_jobs': -1}
# Fit the model to each dataset
def_result = pd.DataFrame(columns=['rf_rmse'])
mlist = []
mod = RandomForestRegressor(**dval)
for i in range(num_datasets):
    num_col = df_list[i].shape[1]
    yvec = df_list[i].iloc[:,0:1].values.ravel()
    rmse = cross_val_score(mod, df_list[i].iloc[:,1:num_col], yvec, scoring="neg_root_mean_squared_error", 
                           cv=KFold(n_splits=3, shuffle=True, random_state=123), n_jobs=-1)
    mlist.append(abs(rmse).mean())
mlist
stop = timeit.default_timer()

In [14]:
print("Time: ", stop-start)

Time:  967.6611760390006


In [15]:
sum(mlist)/len(mlist)

2.5749039184711418

In [95]:
maxfeat = ["auto", "sqrt"]
def objective(trial):
    mlist = []
    param = {
        "n_estimators": 100,
        "bootstrap": True,
        "max_features": trial.suggest_categorical("max_features", maxfeat),
        "min_samples_leaf": 0.1,
        "n_jobs": -1}    
    if param["bootstrap"] == True:
        param["max_samples"] = 0.9999
    # fit rf to each dataset    
    for i in range(num_datasets):
        ncol = df_list[i].shape[1]
        yvec = df_list[i].iloc[:,0:1].values.ravel()
        mod = RandomForestRegressor(**param)
        rmse = cross_val_score(mod, df_list[i].iloc[:,1:ncol], yvec, 
                               scoring="neg_root_mean_squared_error", 
                               cv=KFold(n_splits=3, shuffle=True, random_state=123), 
                               n_jobs=-1)
        mlist.append(abs(rmse).mean())
    rmse_mean = sum(mlist)/len(mlist)   
    return rmse_mean
# Count how many points in the search space
search_space = {
    "max_features": maxfeat
}

In [96]:
study = optuna.create_study(direction='minimize', sampler=GridSampler(search_space))
study.optimize(objective, n_trials=len(maxfeat))
test_maxfeat = study.trials_dataframe()



KeyboardInterrupt: 

## Test xgb

In [39]:
# Define the parameter search space
start = timeit.default_timer()
# Define the parameter search space
dval = {
    "booster": "gbtree",
    "lambda": 1,
    "alpha": 0,
    "eta": 0.3,
    "subsample": 1,
    "max_depth": 6,
    "min_child_weight": 1,
    "colsample_bytree": 1,
    "colsample_bylevel": 1
}
n_boost = 100
# Fit the model to each dataset
mlist = []
for i in range(num_datasets):
    bst = xgb.cv(dval, dmat_list[i], num_boost_round=n_boost, nfold=3, metrics='rmse', seed=123, shuffle=True)
    mlist.append(bst.iloc[len(bst.index)-1, 2])
stop = timeit.default_timer()

In [40]:
print("Time: ", stop-start)

Time:  408.46654528299405


In [41]:
sum(mlist)/len(mlist)

2.073823422222222

In [48]:
# Define the parameter search space
start = timeit.default_timer()
# Define the parameter search space
dval = {
    "n_estimators": 100,
    "booster": "gbtree",
    "lambda": 1,
    "alpha": 0,
    "eta": 0.3,
    "subsample": 1,
    "max_depth": 6,
    "min_child_weight": 1,
    "colsample_bytree": 1,
    "colsample_bylevel": 1,
    "n_jobs": -1
}
# Fit the model to each dataset
mlist = []
mod = xgb.XGBRegressor(**dval)
for i in range(num_datasets):
    num_col = df_list[i].shape[1]
    yvec = df_list[i].iloc[:,0:1].values.ravel()
    rmse = cross_val_score(mod, df_list[i].iloc[:,1:num_col], yvec, scoring="neg_root_mean_squared_error", 
                           cv=KFold(n_splits=3, shuffle=True, random_state=123), n_jobs=-1)
    mlist.append(abs(rmse).mean())
mlist
stop = timeit.default_timer()

KeyboardInterrupt: 

In [None]:
print("Time: ", stop-start)

In [None]:
sum(mlist)/len(mlist)