In [1]:
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from scipy.stats import norm
from data_generation import m_0, g_0, get_data
from dml_algorithm import mm_ate, dml_ate

In [71]:
# NumPy Generator that is handed to get_data() in the data cell below; the
# fixed seed makes the generator's random stream repeatable.
rng = np.random.default_rng(52)

In [None]:
# Candidate random-forest hyperparameter values, keyed by the scikit-learn
# constructor argument they belong to.
# NOTE(review): this cell has no execution count and no other cell visible in
# this notebook reads `param_grid`. The tuned values printed from
# 'opt_params_rf.pkl' further down include min_samples_leaf of 6 and 8, which
# are not in this grid -- presumably a different grid produced them; confirm.
param_grid = dict(
    n_estimators=[50, 100, 250, 500, 750, 1000, 1500],
    max_features=['sqrt', None],
    max_depth=[2, 3, 5, 10, 25, 50, 75, 100, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

In [72]:
# Simulate the data set. get_data() comes from the project's data_generation
# module; presumably N is the number of observations and the three returned
# arrays are outcome y, treatment indicator d and covariates x (d is later
# compared against 0/1 and used as the classifier target) -- TODO confirm.
N = 1000
y_data, d_data, x_data = get_data(N, rng)

# Degree-2 polynomial expansion of the covariates without a constant column.
# In the cells visible here the x_quad_* arrays are only referenced from
# commented-out alternatives.
poly_features = PolynomialFeatures(degree=2, include_bias=False)
x_quad_data = poly_features.fit_transform(x_data)

# One call splits all four arrays, so the same rows end up in the held-out
# 20% of every array; random_state pins the shuffle.
(
    y_train, y_test,
    d_train, d_test,
    x_train, x_test,
    x_quad_train, x_quad_test,
) = train_test_split(
    y_data,
    d_data,
    x_data,
    x_quad_data,
    test_size=0.2,
    random_state=42,
)

In [78]:
# Learners used by the cells below:
#   model_g0 -- regressor later fit on the rows with d == 0
#   model_g1 -- regressor later fit on the rows with d == 1
#   model_m  -- classifier later fit with d as its target
# NOTE(review): the hyperparameter values are hard-coded here; where they were
# obtained is not shown in this notebook -- confirm against the tuning results.
model_g0 = RandomForestRegressor(
    n_estimators=1000,
    max_features=7,  # integer = absolute feature count per split; assumes x has >= 7 columns -- TODO confirm
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)
model_g1 = RandomForestRegressor(
    n_estimators=3000,
    max_features=None,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)
model_m = RandomForestClassifier(
    n_estimators=500,
    criterion='log_loss',
    max_features=None,
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=2,
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)
# Calibration wrapper around model_m; in the cells visible here it is only
# used by a commented-out alternative.
model_m_cal = CalibratedClassifierCV(estimator=model_m)

In [79]:
%%time
# Fit the treatment classifier on the training covariates, then print two rows
# of numbers for the first 20 test observations:
#   1. column 1 of predict_proba (the d == 1 class, assuming d is coded 0/1)
#   2. m_0 from data_generation evaluated on the same rows -- presumably the
#      true propensity score used to simulate d; TODO confirm
model_m.fit(x_train, d_train)
print(model_m.predict_proba(x_test)[:20,1])
# Disabled alternatives: the calibrated classifier, and the quadratic features.
#model_m_cal.fit(x_train, d_train)
#print(model_m_cal.predict_proba(x_test)[:20,1])
#model_m.fit(x_quad_train, d_train)
#print(model_m.predict_proba(x_quad_test)[:20,1])
print(m_0(x_test[:20]))

[0.79725611 0.19794241 0.72364294 0.10786439 0.19233962 0.06937673
 0.37745344 0.34720019 0.35534233 0.55712368 0.38523661 0.65564241
 0.44207671 0.8740306  0.59157068 0.23672224 0.27446384 0.29161449
 0.17923891 0.95982061]
[0.80183283 0.31578077 0.93521705 0.11438564 0.11887484 0.02775527
 0.70874655 0.3218635  0.22443098 0.53061413 0.29491439 0.6279986
 0.72483571 0.92664436 0.65926708 0.09909001 0.18709786 0.11430445
 0.26947625 0.92591083]
CPU times: total: 9.66 s
Wall time: 3.42 s


In [80]:
%%time
# Fit the d == 0 outcome regressor on the training rows with d == 0 only, then
# print its predictions for the first 20 test rows followed by g_0(0, x) on the
# same rows. g_0 comes from data_generation -- presumably the true outcome
# function with the treatment argument set to 0; TODO confirm.
model_g0.fit(x_train[d_train==0], y_train[d_train==0])
print(model_g0.predict(x_test[:20]))
# Disabled alternative: same fit on the quadratic feature expansion.
#model_g0.fit(x_quad_train[d_train==0], y_train[d_train==0])
#print(model_g0.predict(x_quad_test[:20]))
print(g_0(0, x_test[:20]))

[4.14186313 0.16686231 1.15574408 4.06131284 4.54840574 2.7854677
 1.42812493 4.43484332 4.09689307 5.13020626 7.20812882 9.07780883
 1.08751528 4.64984098 4.02795264 4.33150031 5.96622452 6.19479607
 3.1980504  7.06568572]
[ 3.84506293 -1.24326249 -0.45863705  3.61281905  4.80426886  4.27924561
 -0.07285784  4.01833526  5.09961717  3.24182035  8.473771    9.84352192
 -1.65098076  3.47460378  1.81637801  2.88803741  6.24923843  7.01017636
  4.98867164  9.07313702]
CPU times: total: 6.09 s
Wall time: 2.62 s


In [81]:
%%time
# Fit the d == 1 outcome regressor on the training rows with d == 1 only, then
# print its predictions for the first 20 test rows followed by g_0(1, x) on the
# same rows. g_0 comes from data_generation -- presumably the true outcome
# function with the treatment argument set to 1; TODO confirm.
model_g1.fit(x_train[d_train==1], y_train[d_train==1])
print(model_g1.predict(x_test[:20]))
# Disabled alternative: same fit on the quadratic feature expansion.
#model_g1.fit(x_quad_train[d_train==1], y_train[d_train==1])
#print(model_g1.predict(x_quad_test[:20]))
print(g_0(1, x_test[:20]))

[4.28121007 2.68105406 3.43556799 2.27687406 3.42003553 3.15311604
 3.22819238 6.02844515 4.049979   2.19190771 5.80536054 6.47352303
 2.00865454 3.52678276 2.51892547 4.39021861 5.12010114 4.75760288
 4.09620485 8.50525225]
[ 3.07326319 -0.16413246  1.94438713  3.83527339  5.66194741  5.64682301
  2.76119456  4.96268592  3.99945626  3.35898226  7.32967574  8.78848658
 -2.00690975  3.14170146  2.39733893  4.21944944  5.95996794  6.9728264
  6.03134291  9.37462294]
CPU times: total: 19.2 s
Wall time: 7.53 s


In [82]:
%%time
# Run the project's dml_ate() on the full data set (not the train/test split),
# passing the two outcome regressors as a list [d == 0 model, d == 1 model] and
# the treatment classifier. The recorded run took about 1 min 21 s wall time.
# The call is the last expression, so its return value is displayed: a 3-tuple
# (length-3 array, scalar, length-2 array).
# NOTE(review): the meaning of those three items and whether dml_ate refits or
# clones the already-fitted estimator instances is not visible here -- confirm
# in dml_algorithm.
model_g = [model_g0, model_g1]
dml_ate(y_data, d_data, x_data, model_g, model_m)

CPU times: total: 3min 2s
Wall time: 1min 21s


(array([ 0.17023191, -0.31031668,  0.20763143]),
 7.451553299279383,
 array([-0.29161167,  0.63207548]))

Hyperparameters

In [50]:
# Load previously stored random-forest hyperparameters from a pickle file in
# the current working directory.
# SECURITY: pickle.load can execute arbitrary code while unpickling, so only
# load this file if it was produced by a trusted run of this project.
with open('opt_params_rf.pkl', 'rb') as pickle_file:
    rf_params_dict_dict = pickle.load(pickle_file)

In [51]:
# Peek at the nesting of the loaded object: outer key (250 here -- presumably a
# sample size; TODO confirm) -> element 0 -> model name ('m') -> dict of
# hyperparameters. The displayed keys are the stored hyperparameter names.
rf_params_dict_dict[250][0]['m'].keys()

dict_keys(['n_estimators', 'min_samples_split', 'min_samples_leaf', 'max_features', 'max_depth', 'bootstrap'])

In [76]:
# For every outer entry, take element 0 and print each model name next to its
# stored 'min_samples_leaf' value.
# NOTE(review): the outer key is not printed, so the rows can only be matched
# to their outer entry through the dict's iteration order.
for rf_params_dict in rf_params_dict_dict.values():
    for name, params in rf_params_dict[0].items():
        print(name, params['min_samples_leaf'])

g0 2
g1 2
m 2
g0 1
g1 2
m 2
g0 1
g1 2
m 2
g0 1
g1 2
m 8
g0 2
g1 2
m 6
g0 2
g1 2
m 8
g0 2
g1 1
m 8
