In [1]:
import numpy as np
import pickle
import xgboost as xgb
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from scipy.stats import norm
from data_generation import m_0, g_0, get_data
from dml_algorithm import mm_ate, dml_ate

In [2]:
# Single NumPy Generator with a fixed seed so every run draws the same random numbers.
rng = np.random.default_rng(78)

In [25]:
# Draw N observations from the project's data generator using the shared rng.
# The three returned arrays are unpacked as y (outcome), d (treatment) and
# x (covariates) -- naming as used by this notebook; see data_generation.get_data.
N = 16000
y_data, d_data, x_data = get_data(N, rng)

# Degree-2 polynomial expansion of the covariates, without the constant column.
poly_features = PolynomialFeatures(degree=2, include_bias=False)
x_quad_data = poly_features.fit_transform(x_data)

# One joint 80/20 split so the rows of all four arrays stay aligned;
# random_state pins the shuffle.
(
    y_train, y_test,
    d_train, d_test,
    x_train, x_test,
    x_quad_train, x_quad_test,
) = train_test_split(
    y_data, d_data, x_data, x_quad_data,
    test_size=0.2, random_state=42,
)

In [26]:
# Hyperparameters shared by all three XGBoost random-forest learners.
# Kept in one place so the outcome and propensity models cannot drift apart
# when a value is tuned; only the objective differs per learner.
rf_params = dict(
    max_depth=6,
    subsample=0.8,
    colsample_bynode=0.7,
    n_estimators=1000,
    reg_lambda=1,
    reg_alpha=0.01,
    seed=0,
)

# Outcome regressors: two separate (unfitted) instances with identical settings,
# one per treatment arm.
model_g0 = xgb.XGBRFRegressor(objective='reg:squarederror', **rf_params)
model_g1 = xgb.XGBRFRegressor(objective='reg:squarederror', **rf_params)

# Propensity classifier for the binary treatment indicator.
model_m = xgb.XGBRFClassifier(objective='binary:logistic', **rf_params)

# Probability-calibrated wrapper around the propensity classifier, using
# scikit-learn's default calibration settings.
model_m_cal = CalibratedClassifierCV(estimator=model_m)

In [27]:
%%time
model_m.fit(x_train, d_train)
print(model_m.predict_proba(x_test)[:20,1])
#model_m_cal.fit(x_train, d_train)
#print(model_m_cal.predict_proba(x_test)[:20,1])
#model_m.fit(x_quad_train, d_train)
#print(model_m.predict_proba(x_quad_test)[:20,1])
print(m_0(x_test[:20]))

[0.6566465  0.807134   0.2769158  0.373093   0.8089837  0.4922978
 0.40978426 0.32919958 0.7093497  0.36903524 0.32982656 0.63408315
 0.22529715 0.17325783 0.2903441  0.3003086  0.23247832 0.48005807
 0.63094133 0.7413213 ]
[0.73149452 0.9054747  0.11005632 0.43683751 0.97059911 0.26402973
 0.54020435 0.34799277 0.79084278 0.49491993 0.25195059 0.6823857
 0.08928813 0.07509516 0.11885165 0.27592196 0.04746392 0.46054575
 0.7226271  0.73064065]
CPU times: total: 16.5 s
Wall time: 6.28 s


In [28]:
%%time
model_g0.fit(x_train[d_train==0], y_train[d_train==0])
print(model_g0.predict(x_test[:20]))
#model_g0.fit(x_quad_train[d_train==0], y_train[d_train==0])
#print(model_g0.predict(x_quad_test[:20]))
print(g_0(0, x_test[:20]))

[ 6.559175   8.591359   3.3236167  3.3709116 10.527134   8.588054
  7.0843935  5.160612   7.4970536  7.830377   3.084512   3.2302258
  2.2744155  5.286232   3.505466   1.9165751  8.355016   4.2793565
  9.306521   0.8866389]
[ 4.3589141   6.43442523  5.87816428  4.59430456 12.03883368  9.18358536
  6.5379819   5.5638805   8.50248119  5.0546025   3.49583151  2.33095542
  3.77047377  6.4102437   4.50997687  2.67670973 11.62362354  3.74797071
  9.53968513 -0.50503129]
CPU times: total: 15 s
Wall time: 4.9 s


In [29]:
%%time
model_g1.fit(x_train[d_train==1], y_train[d_train==1])
print(model_g1.predict(x_test[:20]))
#model_g1.fit(x_quad_train[d_train==1], y_train[d_train==1])
#print(model_g1.predict(x_quad_test[:20]))
print(g_0(1, x_test[:20]))

[ 4.8402057  7.228008   5.49326    4.388094  11.018394   5.48511
  4.886592   6.282353   8.044767   4.487993   2.001659   3.8317893
  3.2955058  4.632959   2.9888992  3.6181827  5.7510757  2.7553613
  6.5679703  3.058676 ]
[ 5.43651305  6.59414325 10.34204766  5.88382421 11.98983307  8.18305605
  5.9179107   6.36856577  9.95417768  2.01753647  3.35019767  3.42894201
  5.64900032  6.20620451  4.35357938  5.39095046 10.61935613  3.65919755
  8.38697261  2.35270802]
CPU times: total: 14.8 s
Wall time: 4.22 s


In [59]:
%%time
# Outcome learners as a list: position 0 holds model_g0, position 1 holds model_g1.
model_g = [model_g0, model_g1]
# Run the project's DML ATE routine on the FULL simulated sample (not the
# train split), passing the outcome learners and the propensity learner.
# The bare call is the cell's last expression so its return value is displayed;
# the recorded output is a tuple of (3-element array, scalar, 2-element array).
# NOTE(review): presumably point estimates, a dispersion measure and a
# confidence interval -- confirm against dml_algorithm.dml_ate.
dml_ate(y_data, d_data, x_data, model_g, model_m)

CPU times: total: 2min 37s
Wall time: 43.8 s


(array([-0.27403875, -0.4516584 , -0.59899353]),
 4.783035130337566,
 array([-0.48366066, -0.06441685]))