In [1]:
import numpy as np
import pickle
from sklearn.linear_model import ElasticNetCV, LogisticRegressionCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, StratifiedKFold
from scipy.stats import norm
from data_generation import m_0, g_0, get_data
from dml_algorithm import mm_ate, dml_ate

In [2]:
# Single seeded generator for the notebook so the simulated data are reproducible.
rng = np.random.default_rng(123)

In [85]:
# Simulate N observations: outcome y, treatment d, covariates x.
N = 16000
y_data, d_data, x_data = get_data(N, rng)

# Degree-2 polynomial expansion of the covariates (no constant column).
poly_features = PolynomialFeatures(degree=2, include_bias=False)
x_quad_data = poly_features.fit_transform(x_data)

# One shared 80/20 split so all four arrays stay row-aligned.
(
    y_train, y_test,
    d_train, d_test,
    x_train, x_test,
    x_quad_train, x_quad_test,
) = train_test_split(
    y_data, d_data, x_data, x_quad_data,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Sanity check on the size of the degree-2 feature matrix.
# NOTE(review): the recorded output below shows 1000 rows, which does not match
# N = 16000 in the data cell; it appears to come from an earlier run. Re-run to refresh.
x_quad_data.shape

(1000, 65)

In [4]:
# Elastic-net mixing weights searched by the outcome regressions (1 = pure lasso).
l1_ratio = [
    0.1, 0.5, 0.7,
    0.9, 0.95, 0.99,
    1,
]
# Explicit penalty grid. NOTE(review): the estimators built in this notebook use
# n_alphas=10 instead, so this list looks unused in the visible cells - confirm.
alphas = [
    0.01, 0.1,
    1, 10,
]

In [86]:
# Outcome regressions: one independently configured elastic net per treatment arm.
model_g0, model_g1 = (
    ElasticNetCV(l1_ratio=l1_ratio, n_alphas=10, n_jobs=-1) for _ in range(2)
)

# Propensity model: elastic-net logistic regression; C and l1_ratio are chosen by
# cross-validation on the Brier score.
model_m = LogisticRegressionCV(
    Cs=10,
    l1_ratios=[0, 0.1, 0.2, 0.4, 0.6, 0.8, 0.9, 1],
    penalty='elasticnet',
    solver='saga',
    max_iter=1000,
    random_state=42,
    scoring='neg_brier_score',
    n_jobs=-1,
)

In [88]:
%%time
model_m.fit(x_train, d_train)
print(model_m.C_, model_m.l1_ratio_)
print(model_m.predict_proba(x_test)[:20,1])
print(m_0(x_test[:20]))

[166.81005372] [0.1]
[0.51801127 0.34451408 0.23774443 0.3965201  0.20522767 0.59813138
 0.1663324  0.8087117  0.08070318 0.72326821 0.27258559 0.34066054
 0.64229072 0.11878766 0.3275425  0.63916234 0.77247125 0.62622426
 0.89183367 0.04722666]
[0.50285193 0.35140023 0.22827839 0.39088247 0.1903588  0.57576861
 0.16815063 0.80190247 0.07420542 0.70247495 0.24530227 0.32220461
 0.61745687 0.11435028 0.34021387 0.61174971 0.76068643 0.64385032
 0.89528198 0.04153242]
CPU times: total: 20.2 s
Wall time: 5.37 s


In [114]:
# Disabled experiment: the cell body is wrapped in a string literal so it does not execute.
# It fits the propensity model on the degree-2 features; the recorded output below
# reports about 1 min 15 s wall time for that run.
"""
%%time
model_m.fit(x_quad_train, d_train)
print(model_m.C_, model_m.l1_ratio_)
print(model_m.predict_proba(x_quad_test)[:20,1])
print(m_0(x_test[:20]))
"""



[0.1] [0.7]
[0.03693531 0.18404655 0.69069903 0.41196625 0.69500165 0.33635643
 0.56763541 0.40901834 0.23107324 0.76968027 0.60815894 0.08349949
 0.24164982 0.89614622 0.35362007 0.40155253 0.43111714 0.18631722
 0.32110142 0.02323991]
[0.02989884 0.18599423 0.77968065 0.31913693 0.70306652 0.32809337
 0.59935589 0.58673348 0.24429792 0.67730265 0.64347342 0.06659785
 0.33693052 0.90120211 0.24828482 0.53668259 0.53383136 0.15935715
 0.79567006 0.02883646]
CPU times: total: 4min 32s
Wall time: 1min 15s


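For the logistic regression (propensity model), we do not include the second-order interaction terms, for two reasons: the form of the true propensity model does not call for them, and including them caused convergence issues. The resulting non-converged model fit poorly, whereas the model without interaction terms performed much better.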

In [13]:
%%time
model_g0.fit(x_train[d_train==0], y_train[d_train==0])
print(model_g0.alpha_, model_g0.l1_ratio_)
print(model_g0.predict(x_test[:20]))
print(g_0(0, x_test[:20]))

0.01 1.0
[7.79611278 7.75556974 5.3981438  4.39992203 7.10907533 2.76910759
 6.60256898 8.10212959 9.06439175 8.44250402 1.87969543 4.137215
 8.09215973 5.39006711 2.83039121 4.88957231 8.16204359 7.64343596
 3.59009818 7.8417214 ]
[8.28868829 7.6576894  6.25762424 3.82230003 8.26865423 2.06880283
 5.84620354 7.46459345 8.35423969 9.16554275 1.24079134 3.55899384
 7.87033118 4.86879995 2.46126933 4.74410786 7.94596517 7.63496295
 3.63729464 7.70693168]
CPU times: total: 266 ms
Wall time: 210 ms


In [56]:
%%time
model_g0.fit(x_quad_train[d_train==0], y_train[d_train==0])
print(model_g0.alpha_, model_g0.l1_ratio_)
print(model_g0.predict(x_quad_test[:20]))
print(g_0(0, x_test[:20]))

0.01024397620047402 1.0
[ 3.43870397  4.19615514 11.99784276  6.19868267  3.33612414 -0.28176152
  5.53985115  7.65071547  4.7185664   4.68145262  2.32741754  5.75711863
  4.0086806   7.21889631  2.16627682  1.10704232  4.75624982  6.99512764
  9.1618331   5.56914458]
[ 2.94288336  4.20702213 13.10026865  6.2328962   3.36664239  0.16046224
  5.54439129  7.37532109  4.37365257  4.64917016  2.00162939  5.99628968
  3.96666034  6.78312805  2.547261    1.3708171   4.97297045  6.87264344
  8.95137013  5.76194912]
CPU times: total: 422 ms
Wall time: 283 ms


In [22]:
%%time
model_g1.fit(x_train[d_train==1], y_train[d_train==1])
print(model_g1.alpha_, model_g1.l1_ratio_)
print(model_g1.predict(x_test[:20]))
print(g_0(1, x_test[:20]))

0.01 1.0
[ 7.46836835  3.90260928  3.55124551  6.75512096  3.39525159  4.80759195
  3.96131246  4.64873477  9.13780734  8.4576383   8.81098379  6.91774514
 11.22763516  6.84722476  7.314588    2.08498333  1.58133188  1.51819034
  6.08496573  3.55961318]
[ 7.18389654  3.15322182  2.6110236   6.20069697  4.33203052  4.76444308
  3.05270852  4.12555511  8.41837173  7.79268185  8.34195778  7.45097563
 10.71046431  5.78457246  7.8986493   2.760118    3.49642226 -0.27622715
  5.04923621  3.75324317]
CPU times: total: 219 ms
Wall time: 195 ms


In [57]:
%%time
model_g1.fit(x_quad_train[d_train==1], y_train[d_train==1])
print(model_g1.alpha_, model_g1.l1_ratio_)
print(model_g1.predict(x_quad_test[:20]))
print(g_0(1, x_test[:20]))

0.014652663416801704 1.0
[ 2.66551914  5.1881822  11.91105079  6.4885178   3.49350538  0.92269358
  4.70375675  6.90453287  3.71901553  5.21279217  2.31088464  5.97456111
  4.31397898  2.60177833  3.44384716  1.67182968  4.55291284  4.4864982
  9.38984427  5.87774926]
[ 2.2316313   5.44833314 13.64564184  6.60775482  3.5885824   1.35962177
  5.35666457  7.03479682  3.43867968  5.19398957  2.0348218   6.62276814
  4.15265674  2.19601101  3.71967268  2.01389438  4.95593371  4.83222406
  9.37266755  6.29032504]
CPU times: total: 359 ms
Wall time: 270 ms


In [25]:
def dml_ate(y_data, d_data, x_data, x_quad_data, model_g, model_m, K=5, classical=True, inference=True, alpha=0.05):
    """Cross-fitted doubly robust (DML) estimate of the average treatment effect.

    The sample is split into K folds stratified on the treatment. For each fold
    the nuisance models are fitted on the remaining folds and evaluated on the
    held-out fold; the fold-wise means of the doubly robust scores are then
    averaged.

    NOTE(review): this definition shadows the ``dml_ate`` imported from
    ``dml_algorithm`` in the notebook's import cell.

    Parameters
    ----------
    y_data, d_data, x_data, x_quad_data : arrays indexable by integer index arrays
        Outcome, treatment (treated as 0/1), covariates used by the propensity
        model, and features used by the outcome regressions. All are indexed
        with the same row indices.
    model_g : sequence of two estimators with ``fit`` / ``predict``
        ``model_g[d]`` is fitted on the training rows with treatment ``d``.
    model_m : estimator with ``fit`` / ``predict_proba``
        Column 1 of ``predict_proba`` is used as the propensity score.
    K : int
        Number of cross-fitting folds.
    classical : bool
        If True, also return the plain regression and IPW estimates.
    inference : bool
        If True, also return the score standard deviation and a confidence interval.
    alpha : float
        The confidence interval has nominal level ``1 - alpha``.

    Returns
    -------
    Depending on the flags:
    ``classical`` and ``inference``: ``(array([theta, reg, ipw]), sigma_hat, CI)``;
    ``classical`` only: ``array([theta, reg, ipw])``;
    ``inference`` only: ``(theta, sigma_hat, CI)``;
    neither: ``theta``.

    Notes
    -----
    The passed models are fitted in place (``fit`` is called on them directly).
    """
    # Generate random partition of data for cross-fitting
    N = len(y_data)
    skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

    # Per-fold accumulators
    theta_0_check_list = []
    if classical:
        reg_check_list, ipw_check_list = [], []
    if inference:
        scores_list = []

    for (train_indices, eval_indices) in skf.split(X=x_data, y=d_data):
        y_train, d_train, x_train, x_quad_train = y_data[train_indices], d_data[train_indices], x_data[train_indices], x_quad_data[train_indices]
        y_eval, d_eval, x_eval, x_quad_eval = y_data[eval_indices], d_data[eval_indices], x_data[eval_indices], x_quad_data[eval_indices]

        # Estimate outcome regression functions g_0(d), one per treatment arm
        g_0_hat = []
        for d in [0, 1]:
            arm = d_train == d
            model_g[d].fit(X=x_quad_train[arm], y=y_train[arm])
            g_0_hat.append(model_g[d].predict(x_quad_eval))

        # Estimate propensity score m_0
        model_m.fit(X=x_train, y=d_train)
        m_0_hat = model_m.predict_proba(x_eval)[:, 1]

        # Doubly robust score for every held-out observation
        scores = g_0_hat[1] - g_0_hat[0] + d_eval*(y_eval-g_0_hat[1])/m_0_hat - (1-d_eval)*(y_eval-g_0_hat[0])/(1-m_0_hat)
        theta_0_check_list.append(np.mean(scores))

        # For variance estimation
        if inference:
            scores_list.append(scores)

        # For regression & IPW estimators
        if classical:
            reg_check_list.append(np.mean(g_0_hat[1] - g_0_hat[0]))
            ipw_check_list.append(np.mean(d_eval*y_eval/m_0_hat - (1-d_eval)*y_eval/(1-m_0_hat)))

    # Compute final estimator as the average of the fold-wise estimates
    theta_0_hat = np.mean(theta_0_check_list)
    if classical:
        reg_hat, ipw_hat = np.mean(reg_check_list), np.mean(ipw_check_list)

    # Inference: estimate variance and construct confidence interval
    if inference:
        # Folds may differ in size, so the per-fold score arrays cannot be stacked
        # into a rectangular array; pool them into one flat vector instead.
        all_scores = np.concatenate(scores_list)
        sigma_hat = np.sqrt(np.mean((all_scores - theta_0_hat)**2))
        quantile = norm.ppf(1-alpha/2)
        half_width = quantile*sigma_hat/np.sqrt(N)
        CI = np.array([theta_0_hat-half_width, theta_0_hat+half_width])

    # Return results
    if classical:
        estimates = np.array([theta_0_hat, reg_hat, ipw_hat])
        if inference:
            return estimates, sigma_hat, CI
        return estimates
    if inference:
        return theta_0_hat, sigma_hat, CI
    return theta_0_hat

In [53]:
%%time
model_g = [model_g0, model_g1]
dml_ate(y_data, d_data, x_data, x_quad_data, model_g, model_m)

CPU times: total: 8.69 s
Wall time: 3.6 s


(array([0.62729263, 0.56263539, 0.25905684]),
 2.760932977028119,
 array([0.45617138, 0.79841388]))