In [11]:
import numpy as np
import xgboost as xgb
import pickle
from sklearn.model_selection import GridSearchCV
from data_generation import get_data

def xgb_cv(y_data, d_data, x_data, cv=5):
    """Grid-search XGBoost hyperparameters for two regression targets.

    The same hyperparameter grid is searched twice on the features
    ``x_data``: once with ``y_data`` as the target and once with ``d_data``
    as the target. Candidates are scored by negative mean squared error.

    Parameters
    ----------
    y_data
        Target passed as ``y`` to the first search (result key ``'l'``).
    d_data
        Target passed as ``y`` to the second search (result key ``'m'``).
    x_data
        Features passed as ``X`` to both searches.
    cv
        Cross-validation setting forwarded to ``GridSearchCV`` (default 5).

    Returns
    -------
    dict
        ``{'l': ..., 'm': ...}`` where each value is the ``best_params_``
        of the corresponding fitted grid search.
    """
    search_space = {
        'n_estimators': [50, 100, 150],
        'max_depth': [2],
        'subsample': [0.5],
        'learning_rate': [0.05, 0.1, 0.15],
        'reg_lambda': [0.01, 0.1, 1],
    }

    # Both searches share one configuration; GridSearchCV is built afresh
    # for each target from these keyword arguments.
    search_kwargs = dict(
        estimator=xgb.XGBRegressor(objective='reg:squarederror', seed=0),
        param_grid=search_space,
        cv=cv,
        n_jobs=1,
        scoring='neg_mean_squared_error',
        verbose=1,
    )

    best_params = {}
    # Fit order matters for the verbose log: 'l' (y_data) first, then 'm' (d_data).
    for key, target in (('l', y_data), ('m', d_data)):
        search = GridSearchCV(**search_kwargs)
        search.fit(X=x_data, y=target)
        best_params[key] = search.best_params_

    return best_params

# Monte Carlo tuning run: for every sample size, draw n_MC data sets with
# get_data and record the cross-validated XGBoost parameters chosen by xgb_cv
# for each draw. The result is a nested dict: opt_params_xgb[N][j] -> params.
sample_sizes = [500, 1000, 2000]
n_MC = 1000
opt_params_xgb = {}

for N in sample_sizes:
    # A fresh generator with the same seed is created for every sample size.
    rng = np.random.default_rng(seed=19)
    opt_params_xgb_N = {}
    
    for j in range(n_MC): 
        y_data, d_data, x_data = get_data(N, rng)
        opt_params_xgb_N[j] = xgb_cv(y_data, d_data, x_data)

    opt_params_xgb[N] = opt_params_xgb_N

    # Checkpoint after every completed sample size: the full run is very long,
    # and writing only at the very end loses all finished work if the cell is
    # interrupted or the kernel dies part-way through.
    with open('opt_params_xgb.pkl', 'wb') as pickle_file:
        pickle.dump(opt_params_xgb, pickle_file)

    print(f'Cross-validation done for N={N}')

# Final write of the complete dictionary (also covers an empty sample_sizes).
with open('opt_params_xgb.pkl', 'wb') as pickle_file:
    pickle.dump(opt_params_xgb, pickle_file)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits


KeyboardInterrupt: 

In [13]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from scipy.stats import norm

# Load the saved results from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('results_ate_xgb.pkl', 'rb') as pickle_file:
    results_dict_ = pickle.load(pickle_file)

# Trimmed copy of the loaded results, keyed the same way as results_dict_
# (presumably by sample size N — confirm against the script that wrote the file).
results_dict = {}

for N, results in results_dict_.items():
    # Each entry must unpack into exactly four items; the fourth is discarded.
    ate_estimates, sigma_estimates, CIs, _ = results
    # Keep columns 0 and 1 of the ATE estimates, column 0 of the sigma
    # estimates, and the confidence intervals unchanged.
    results_dict[N] = [ate_estimates[:, [0, 1]], sigma_estimates[:, 0], CIs]

In [14]:
# Display the trimmed results (ATE estimates, sigma estimates, CIs) per key.
results_dict

{500: [array([[ -1.85595418,  -0.53213012],
         [-71.13137732,  -1.35152796],
         [ -6.29632537,  -1.41342141]]),
  array([-4.77395901e-18, -3.55271368e-18,  1.77635684e-18]),
  array([[-0.53213012, -0.53213012],
         [-1.35152796, -1.35152796],
         [-1.41342141, -1.41342141]])],
 1000: [array([[ -0.06561836,  -1.70124888],
         [-14.01691359,  -1.291115  ],
         [ -0.14503378,  -1.70143959]]),
  array([3.21964677e-18, 1.33226763e-18, 1.35447209e-17]),
  array([[-1.70124888, -1.70124888],
         [-1.291115  , -1.291115  ],
         [-1.70143959, -1.70143959]])],
 2000: [array([[-3.39128240e+00, -9.88838169e-01],
         [-4.85154502e+01, -2.70036029e+00],
         [ 7.40243833e-04, -1.08968218e+00]]),
  array([-5.05151476e-18,  2.44249065e-18, -2.88657986e-18]),
  array([[-0.98883817, -0.98883817],
         [-2.70036029, -2.70036029],
         [-1.08968218, -1.08968218]])]}