# Testing DoubleML Package

In [54]:
import numpy as np
import import_ipynb
from cleaning import *
from doubleml import DoubleMLData

In [69]:
# Keep only rows with a non-null outcome. The explicit .copy() makes `target`
# an independent frame, so the .loc recodes below write to it directly rather
# than to a filtered selection (which pandas flags with SettingWithCopyWarning).
target = target[target['ANNMILES_sum'].notnull()].copy()

# Recode the treatment column: LIF_CYC code 10 -> 1, code 2 -> 0.
# Rows holding any other LIF_CYC code are left unchanged.
target.loc[target['LIF_CYC'] == 10, 'LIF_CYC'] = 1
target.loc[target['LIF_CYC'] == 2, 'LIF_CYC'] = 0

# Declare the column roles for DoubleML: outcome (y), treatment (d) and the
# covariates (x) used as controls.
dml_target = DoubleMLData(target,
                            y_col = 'ANNMILES_sum',
                            d_cols = 'LIF_CYC',
                            x_cols = ['HHSIZE', 'HOMEOWN', 'HH_HISP', 
                                      'DRVRCNT', 'HH_RACE', 'TRAVDAY',
                                      'URBAN', 'WRKCOUNT'])

In [70]:
# Simulated data with a known coefficient on d, sized to match the real sample.
# Seed NumPy's global RNG first so the simulated draws are identical each time
# this cell runs (same seed value the notebook uses before fitting).
np.random.seed(20020107)

obs = len(target)
vars = 3   # number of simulated covariates (name kept for later cells; note it shadows the builtin `vars`)
theta = 3  # coefficient on d in the outcome equation

# One coefficient of 5 per covariate, so the confounding term follows `vars`
# instead of a hard-coded three-column slice.
sim_beta = np.full(vars, 5)

X = np.random.normal(size = (obs, vars))
sim_confounding = np.dot(X, sim_beta)
# Both d and y depend on the same linear function of X, plus independent noise.
d = sim_confounding + np.random.standard_normal(size = (obs, ))
y = theta * d + sim_confounding + np.random.standard_normal(size = (obs,))

In [73]:
# Print the DoubleMLData summary to check the outcome, treatment and covariate
# columns and the number of observations before fitting.
print(dml_target)


------------------ Data summary      ------------------
Outcome variable: ANNMILES_sum
Treatment variable(s): ['LIF_CYC']
Covariates: ['HHSIZE', 'HOMEOWN', 'HH_HISP', 'DRVRCNT', 'HH_RACE', 'TRAVDAY', 'URBAN', 'WRKCOUNT']
Instrument variable(s): None
No. Observations: 1473

------------------ DataFrame info    ------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1473 entries, 0 to 1493
Columns: 36 entries, HOUSEID to ANNMILES_sum
dtypes: float64(4), int64(32)
memory usage: 425.8 KB



In [76]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV

# Learners for the real data: a random-forest template (depth 5, log2 feature
# subsampling, as many trees as there are observations), cloned once per
# nuisance learner so ml_l and ml_m are separate unfitted estimators.
learner = RandomForestRegressor(max_depth = 5, max_features = 'log2', n_estimators = obs)
ml_l_target, ml_m_target = clone(learner), clone(learner)

# Learners for the simulated data: a LassoCV template, again cloned once per
# nuisance learner. `learner` stays bound to this LassoCV template afterwards.
learner = LassoCV()
ml_l_sim, ml_m_sim = clone(learner), clone(learner)

In [77]:
from doubleml import DoubleMLPLR

# Seed NumPy's global RNG immediately before fitting.
np.random.seed(20020107)

# Partially linear regression model on the real data using the two
# random-forest learners; fit it and print the resulting summary.
obj_dml_plr_target = DoubleMLPLR(
    dml_target,
    ml_l = ml_l_target,
    ml_m = ml_m_target,
)
obj_dml_plr_target.fit()
print(obj_dml_plr_target)


------------------ Data summary      ------------------
Outcome variable: ANNMILES_sum
Treatment variable(s): ['LIF_CYC']
Covariates: ['HHSIZE', 'HOMEOWN', 'HH_HISP', 'DRVRCNT', 'HH_RACE', 'TRAVDAY', 'URBAN', 'WRKCOUNT']
Instrument variable(s): None
No. Observations: 1473

------------------ Score & algorithm ------------------
Score function: partialling out
DML algorithm: dml2

------------------ Machine learner   ------------------
Learner ml_l: RandomForestRegressor(max_depth=5, max_features='log2', n_estimators=1473)
Learner ml_m: RandomForestRegressor(max_depth=5, max_features='log2', n_estimators=1473)
Out-of-sample Performance:
Learner ml_l RMSE: [[38181.94926839]]
Learner ml_m RMSE: [[0.34714015]]

------------------ Resampling        ------------------
No. folds: 5
No. repeated sample splits: 1
Apply cross-fitting: True

------------------ Fit summary       ------------------
                coef      std err         t     P>|t|       2.5 %       97.5 %
LIF_CYC  1891.003548 