In [None]:
import json

import numpy as np
import sklearn
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

from data_generation import m_0, g_0, get_data
from dml_algorithm import dml_ate

## Load tuned hyperparameters of XGBoost for each sample size

**TODO:** Decide whether this comparison should be run for several (or all) sample sizes rather than a single one.

In [None]:
# Previously tuned XGBoost hyperparameters, read from disk. The top-level keys
# are strings (e.g. '1000', '500') that are looked up when `settings` is built.
with open('opt_params_xgboost.json', 'r') as params_file:
    opt_params_dict_dict = json.loads(params_file.read())

We consider a sample size of $N=800$. 

No cross-fitting ($K=0$) means that the ML models are fitted on the entire data set. The optimal hyperparameters for training on a set of size $N=800$ have already been determined in a $5$-fold cross-validation on a data set of size $N=1000$.

Similarly, the optimal hyperparameters for performing cross-fitting with $K=2$ folds, i.e. training on a data set of size $N=400$, have been determined in a $5$-fold cross-validation on a data set of size $N=500$.

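We will perform a $5$-fold cross-validation on a data set of size $N=800$ to determine optimal hyperparameters for cross-fitting with $K=5$ folds, i.e. training on a data set of size $N=640$.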

In [None]:
# Sample size used for tuning and for every Monte Carlo replication below.
N = 800

# Number of cross-fitting folds K -> hyperparameter dict for that setting.
# K = 5 starts out as a None placeholder and is filled in by the tuning cell.
settings = dict.fromkeys((0, 2, 5))
settings[0] = opt_params_dict_dict['1000']
settings[2] = opt_params_dict_dict['500']

In [None]:
# Base learners to be tuned: a regressor for the outcome models g and a
# classifier for the model m (fitted on the treatment indicator below).
xgb_model_g = xgb.XGBRegressor(objective='reg:squarederror')
xgb_model_m = xgb.XGBClassifier(objective='binary:logistic')

# Hyperparameter grid searched exhaustively for both learners.
param_grid = {
    'n_estimators': [5, 10, 25, 50, 75, 100, 150, 200],
    'max_depth': [2, 3, 4, 5, 6],
    'subsample': [0.6, 0.8, 1.0],
    'learning_rate': [0.01, 0.03, 0.06, 0.1, 0.2, 0.3],
    'reg_lambda': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
}

# Cross-validation configuration shared by both searches, kept in one place so
# the two searches cannot drift apart. Only the estimator and the scoring rule
# differ between them.
# NOTE: GridSearchCV must be imported from sklearn.model_selection in the
# import cell; a bare `import sklearn` does not bring it into scope.
grid_search_options = dict(param_grid=param_grid, cv=5, n_jobs=-1)

grid_search_g = GridSearchCV(estimator=xgb_model_g, scoring='neg_mean_squared_error',
                             **grid_search_options)
grid_search_m = GridSearchCV(estimator=xgb_model_m, scoring='neg_brier_score',
                             **grid_search_options)

In [None]:
# Tune the hyperparameters for the K = 5 setting on one freshly drawn sample.
np.random.seed(123)

y_data, d_data, x_data = get_data(N)

opt_params_dict = {}

# One outcome model per treatment arm: tune on the observations with
# d_data == d only and store the result under 'g0' / 'g1'.
for d in [0, 1]:
    arm_mask = d_data == d
    opt_params_dict[f'g{d}'] = grid_search_g.fit(X=x_data[arm_mask],
                                                 y=y_data[arm_mask]).best_params_

# The classifier for the treatment indicator is tuned on the full sample.
opt_params_dict['m'] = grid_search_m.fit(X=x_data, y=d_data).best_params_

# Replace the None placeholder for K = 5.
settings[5] = opt_params_dict

## Infeasible method-of-moments estimator

In [None]:
def mm_ate(y_data, d_data, x_data):
    """Infeasible method-of-moments estimate of the average treatment effect.

    Averages, over all observations,
        g_0(1, x) - g_0(0, x)
        + d * (y - g_0(1, x)) / m_0(x)
        - (1 - d) * (y - g_0(0, x)) / (1 - m_0(x)),
    where g_0 and m_0 are imported from `data_generation` (presumably the true
    nuisance functions of the data-generating process, hence "infeasible").

    Parameters
    ----------
    y_data : outcomes.
    d_data : treatment indicators (used as 0/1 weights in the formula).
    x_data : covariates, passed unchanged to g_0 and m_0.

    Returns
    -------
    The mean of the per-observation terms, as returned by np.mean.
    """
    # Evaluate each nuisance function once and reuse the result
    # (assumes g_0 and m_0 are deterministic -- TODO confirm).
    g1 = g_0(1, x_data)
    g0 = g_0(0, x_data)
    m = m_0(x_data)

    return np.mean(g1 - g0 + d_data*(y_data-g1)/m
                   - (1-d_data)*(y_data-g0)/(1-m))

## DML estimator without cross-fitting

## MC simulation

In [None]:
# Monte Carlo study: for every replication draw a new sample of size N and
# compute the infeasible estimator plus one DML estimate per entry of `settings`.
np.random.seed(100)
n_MC = 5000

# Fail fast with a clear message instead of crashing inside the long loop:
# the K = 5 entry of `settings` is None until the tuning cell has been run.
untuned = [K for K, params in settings.items() if params is None]
if untuned:
    raise ValueError(f'No hyperparameters available for K in {untuned}; '
                     'run the hyperparameter tuning cell first.')

# Column 0 holds the infeasible estimator; columns 1, 2, ... hold the DML
# estimates in the insertion order of `settings`. The width is derived from
# `settings` so that adding or removing a setting cannot silently misalign it.
ate_estimates = np.empty((n_MC, 1 + len(settings)))

for j in range(n_MC):
    y_data, d_data, x_data = get_data(int(N))
    ate_estimates[j, 0] = mm_ate(y_data, d_data, x_data)

    # `params` (not `opt_params_dict`) so the dict built in the tuning cell is
    # not rebound as a side effect of this loop.
    for col, (K, params) in enumerate(settings.items(), start=1):
        # Fresh, unfitted learners for every replication and setting.
        model_g = []
        for key in ('g0', 'g1'):
            outcome_model = xgb.XGBRegressor(objective='reg:squarederror')
            outcome_model.set_params(**params[key])
            model_g.append(outcome_model)

        model_m = xgb.XGBClassifier(objective='binary:logistic')
        model_m.set_params(**params['m'])

        ate_estimates[j, col] = dml_ate(K, y_data, d_data, x_data, model_g, model_m,
                                        classical=False, inference=False)

# Persist all estimates (shape: n_MC x (1 + number of settings)).
np.save('results.npy', ate_estimates)