In [1]:
import numpy as np
import sklearn
import xgboost as xgb
import json

from sklearn.model_selection import GridSearchCV 
from data_generation import get_data

## Define list of sample sizes

In [2]:
# Sample sizes N for which the hyperparameter search is run.
sample_sizes = [250, 500, 1_000, 2_500, 5_000, 10_000]
# Results container: one entry per N, filled in by the cross-validation loop.
opt_params_dict_dict = dict()

## Use cross-validation to find optimal hyperparameters of XGBoost for each sample size

In [3]:
# Base estimators; the tuned hyperparameters come from the grid search below.
#   xgb_model_g: regressor with squared-error objective (fitted on y_data).
#   xgb_model_m: binary classifier with logistic objective (fitted on d_data).
# An earlier XGBRegressor('reg:logistic') assignment to xgb_model_m was
# overwritten on the very next line and never used, so it has been dropped.
xgb_model_g = xgb.XGBRegressor(objective='reg:squarederror', tree_method='hist')
xgb_model_m = xgb.XGBClassifier(objective='binary:logistic', tree_method='hist')

# Full grid: 6 * 5 * 3 * 3 * 3 = 810 candidates, each scored with 5-fold CV.
param_grid = {
    'n_estimators': [5, 10, 25, 50, 75, 100],
    'max_depth': [2, 3, 4, 5, 6],
    'subsample': [0.6, 0.8, 1.0],
    'learning_rate': [0.1, 0.2, 0.3],
    'reg_lambda': [0.1, 1, 10],
}

# Regressor is selected by (negated) RMSE, classifier by (negated) log loss.
# A previous 'neg_brier_score' search object for the classifier was likewise
# overwritten before use and has been removed.
grid_search_g = GridSearchCV(estimator=xgb_model_g, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error')
grid_search_m = GridSearchCV(estimator=xgb_model_m, param_grid=param_grid, cv=5, scoring='neg_log_loss')

In [4]:
%%time
for N in sample_sizes:
    np.random.seed(seed=123)
    y_data, d_data, x_data = get_data(N)
    opt_params_dict = {}
    
    for d in [0, 1]:
        grid_search_g.fit(X=x_data[d_data==d], y=y_data[d_data==d])
        opt_params_dict[f'g{d}'] = grid_search_g.best_params_
   
    grid_search_m.fit(X=x_data, y=d_data)
    opt_params_dict['m'] = grid_search_m.best_params_
    
    opt_params_dict_dict[N] = opt_params_dict  
    print(f'Cross-validation done for N={N}')

Cross-validation done for N=250
Cross-validation done for N=500
Cross-validation done for N=1000
Cross-validation done for N=2500
Cross-validation done for N=5000
Cross-validation done for N=10000
CPU times: total: 4h 19min 36s
Wall time: 1h 7min 9s


In [5]:
# Persist the tuned hyperparameters for all sample sizes.
# Note: JSON object keys are strings, so the integer sample sizes used as
# keys are written as "250", "500", ... and must be looked up as strings
# (or converted back) after loading.
serialized_params = json.dumps(opt_params_dict_dict)
with open('opt_params_xgboost.json', 'w') as json_file:
    json_file.write(serialized_params)