In [1]:
import numpy as np
import pickle
from sklearn.linear_model import ElasticNetCV, LogisticRegressionCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, StratifiedKFold
from scipy.stats import norm
from data_generation import m_0, g_0, get_data
from dml_algorithm import mm_ate, dml_ate

In [2]:
# Seeded generator (seed 123) so the simulated data set is reproducible.
rng = np.random.default_rng(123)

In [36]:
N = 1000
y_data, d_data, x_data = get_data(N, rng)

# Degree-2 polynomial expansion of the covariates, without a constant column.
poly_features = PolynomialFeatures(include_bias=False, degree=2)
x_quad_data = poly_features.fit_transform(x_data)

# A single joint 80/20 split so that all four arrays are partitioned with the
# same row assignment.
(
    y_train, y_test,
    d_train, d_test,
    x_train, x_test,
    x_quad_train, x_quad_test,
) = train_test_split(
    y_data, d_data, x_data, x_quad_data,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Dimensions (rows, expanded features) of the quadratic design matrix.
np.shape(x_quad_data)

(1000, 65)

In [5]:
# Candidate elastic-net mixing weights handed to the ElasticNetCV models.
l1_ratio = [
    0.1, 0.5, 0.7,
    0.9, 0.95, 0.99,
    1,
]
# NOTE(review): `alphas` is not referenced by any cell visible here - confirm
# whether it is still needed.
alphas = [
    0.01, 0.1, 1, 10,
]

In [34]:
# Two separate but identically configured outcome regressors, one per
# treatment arm (g0: d == 0, g1: d == 1).
model_g0, model_g1 = (
    ElasticNetCV(l1_ratio=l1_ratio, n_alphas=10, max_iter=5000, n_jobs=-1)
    for _ in range(2)
)

# Propensity model: elastic-net penalised logistic regression, with the
# penalty strength and mixing weight chosen by cross-validated Brier score.
model_m = LogisticRegressionCV(
    penalty='elasticnet',
    solver='saga',
    l1_ratios=[0, 0.1, 0.2, 0.4, 0.6, 0.8, 0.9, 1],
    Cs=10,
    scoring='neg_brier_score',
    max_iter=1000,
    random_state=42,
    n_jobs=-1,
)

In [40]:
%%time
model_m.fit(x_train, d_train)
print(model_m.C_, model_m.l1_ratio_)
print(model_m.predict_proba(x_test)[:20,1])
print(m_0(x_test[:20]))

[0.35938137] [0.4]
[0.65530287 0.34426363 0.78521342 0.42528583 0.61254706 0.16327408
 0.11498294 0.7869402  0.09314605 0.12055676 0.24243482 0.55597915
 0.346746   0.16667187 0.79156349 0.10929253 0.65779104 0.1372743
 0.29954441 0.12651258]
[0.67398685 0.27666477 0.79479683 0.47609634 0.59113221 0.13442404
 0.07547721 0.75637009 0.07581332 0.10944503 0.21667395 0.4856324
 0.24384477 0.15223573 0.81350598 0.07733635 0.59520922 0.14626221
 0.28698479 0.10549508]
CPU times: total: 2.56 s
Wall time: 899 ms


In [114]:
"""
%%time
model_m.fit(x_quad_train, d_train)
print(model_m.C_, model_m.l1_ratio_)
print(model_m.predict_proba(x_quad_test)[:20,1])
print(m_0(x_test[:20]))
"""



[0.1] [0.7]
[0.03693531 0.18404655 0.69069903 0.41196625 0.69500165 0.33635643
 0.56763541 0.40901834 0.23107324 0.76968027 0.60815894 0.08349949
 0.24164982 0.89614622 0.35362007 0.40155253 0.43111714 0.18631722
 0.32110142 0.02323991]
[0.02989884 0.18599423 0.77968065 0.31913693 0.70306652 0.32809337
 0.59935589 0.58673348 0.24429792 0.67730265 0.64347342 0.06659785
 0.33693052 0.90120211 0.24828482 0.53668259 0.53383136 0.15935715
 0.79567006 0.02883646]
CPU times: total: 4min 32s
Wall time: 1min 15s


For the logistic regression (propensity score model), we do not include the second-order interaction terms. This matches the true model, and fitting with the interaction terms ran into convergence issues: the non-converged model was still very bad, whereas the model without interaction terms was much better.

In [8]:
%%time
model_g0.fit(x_train[d_train==0], y_train[d_train==0])
print(model_g0.alpha_, model_g0.l1_ratio_)
print(model_g0.predict(x_test[:20]))
print(g_0(0, x_test[:20]))

0.025849035195606464 1.0
[ 1.03364465  8.01197109  5.65525088  5.01021965  9.30271151  5.25332556
 -0.29075156  9.0914986   4.67652837  5.97021037  7.43185531  1.59118454
  3.84284257  4.76637962  4.37386203  5.55270384 11.2162964   4.65450948
  8.0948331   4.03303047]
[ 1.96422327  9.38049098  4.81121536  4.13197859  9.07879928  5.48112904
  1.64736247  9.0274919   3.77133351  6.56564005  8.27090455 -0.11310208
  2.83579848  3.93886505  3.37506486  4.75874102 10.68547325  4.37337197
  8.47965583  2.97486236]
CPU times: total: 234 ms
Wall time: 272 ms


In [37]:
%%time
model_g0.fit(x_quad_train[d_train==0], y_train[d_train==0])
print(model_g0.alpha_, model_g0.l1_ratio_)
print(model_g0.predict(x_quad_test[:20]))
print(g_0(0, x_test[:20]))

0.011058396761094985 1.0
[ 2.85169028  7.55173744 -1.30857376  3.58920301  0.09110727 10.97848495
  4.90955892  3.94447434  8.55901768  8.63218063  6.8549028   7.02917165
  2.20027856  5.82673646  0.51688255 10.18889384  4.41675355  6.16231684
  7.20989856  0.80145022]
[ 2.9069639   7.76035032 -1.12517216  3.65768366 -1.2483757  10.88963269
  4.44757963  3.74039443  8.49520391  8.45428855  7.07058588  7.12166139
  2.41714607  5.752593    0.94168628 10.4485586   4.77793121  5.90215175
  6.91013412  1.34034997]
CPU times: total: 453 ms
Wall time: 273 ms


In [23]:
%%time
model_g1.fit(x_train[d_train==1], y_train[d_train==1])
print(model_g1.alpha_, model_g1.l1_ratio_)
print(model_g1.predict(x_test[:20]))
print(g_0(1, x_test[:20]))

0.01 0.1
[ 5.16540176  7.78540839  3.79566213  4.06821275  4.4594141   8.05288531
  1.79014551  5.86656702  8.64134184  8.14282038  3.78477313 -0.57869564
  0.39006149  2.2482847   5.91850482  5.75824314  4.1980805   9.57597934
 11.24850319  3.74846144]
[ 6.24751738  5.98585963  4.83808863  3.04843426  4.21928007 10.01507928
  1.04538262  4.55878404  7.12833787  6.87695721  1.94768289 -2.0836684
 -0.59349964  1.4324121   4.4307337   6.38487366  3.44725359 10.02198399
 11.5393095   3.08063252]
CPU times: total: 125 ms
Wall time: 267 ms


In [38]:
%%time
model_g1.fit(x_quad_train[d_train==1], y_train[d_train==1])
print(model_g1.alpha_, model_g1.l1_ratio_)
print(model_g1.predict(x_quad_test[:20]))
print(g_0(1, x_test[:20]))

0.01978755694136917 1.0
[1.12354448 8.91932664 1.33544538 4.23823897 3.38980467 8.80389061
 3.71949256 3.41790846 9.48183047 8.25795606 7.81233819 7.18241624
 1.35377837 6.94168989 2.50659482 9.82551106 6.10591686 7.10986202
 5.71284254 2.02107083]
[ 1.10463491  9.3327495   0.68046758  4.65615573  1.4740322   9.51278878
  3.42169578  3.34434293  9.16902013  8.10162345  8.21442261  7.20844959
  1.16212478  6.93920743  2.83287328 11.00949914  6.24141967  7.1067113
  5.81822982  1.84928451]
CPU times: total: 344 ms
Wall time: 263 ms


In [25]:
def dml_ate(y_data, d_data, x_data, x_quad_data, model_g, model_m, K=5, classical=True, inference=True, alpha=0.05):
    """Estimate the average treatment effect by cross-fitted double machine learning.

    The sample is split into ``K`` folds stratified on the treatment. For each
    fold the nuisance models are fitted on the other folds and evaluated on the
    held-out fold, where the doubly robust score is averaged. The final
    estimate is the mean of the ``K`` fold-level averages.

    Note: defining this function in the notebook replaces the ``dml_ate``
    imported from ``dml_algorithm`` in the import cell.

    Parameters
    ----------
    y_data : ndarray
        Outcomes, one entry per observation.
    d_data : ndarray
        Binary treatment indicator (0/1), aligned with ``y_data``.
    x_data : ndarray
        Covariates used for the propensity model ``model_m``.
    x_quad_data : ndarray
        Covariates used for the outcome models ``model_g`` (same row order).
    model_g : sequence of two estimators
        ``model_g[d]`` is fitted on the observations with treatment ``d`` and
        must provide ``fit`` and ``predict``. The estimators are fitted in place.
    model_m : estimator
        Classifier providing ``fit`` and ``predict_proba``; column 1 of the
        predicted probabilities is used as the propensity score. Fitted in place.
    K : int, default 5
        Number of cross-fitting folds.
    classical : bool, default True
        Also return the plain regression and inverse-probability-weighting
        estimates built from the same nuisance fits.
    inference : bool, default True
        Also return the standard deviation of the scores and a confidence
        interval.
    alpha : float, default 0.05
        The confidence interval has nominal level ``1 - alpha``.

    Returns
    -------
    estimate : float or ndarray
        ``theta_0_hat``, or ``array([theta_0_hat, reg_hat, ipw_hat])`` when
        ``classical`` is true.
    sigma_hat : float
        Only when ``inference`` is true.
    CI : ndarray of shape (2,)
        Only when ``inference`` is true.
    """
    N = len(y_data)
    # Stratifying on the treatment keeps the treated/control proportions
    # similar across folds; the split itself is fixed by random_state.
    skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

    theta_0_check_list = []
    reg_check_list, ipw_check_list = [], []
    scores_list = []

    for train_indices, eval_indices in skf.split(X=x_data, y=d_data):
        y_train, d_train = y_data[train_indices], d_data[train_indices]
        x_train, x_quad_train = x_data[train_indices], x_quad_data[train_indices]
        y_eval, d_eval = y_data[eval_indices], d_data[eval_indices]
        x_eval, x_quad_eval = x_data[eval_indices], x_quad_data[eval_indices]

        # Outcome regressions g_0(d, .): one model per treatment arm, each
        # fitted only on the training rows of that arm.
        g_0_hat = []
        for d in (0, 1):
            arm = d_train == d
            model_g[d].fit(X=x_quad_train[arm], y=y_train[arm])
            g_0_hat.append(model_g[d].predict(x_quad_eval))

        # Propensity score m_0 = P(D = 1 | X)
        model_m.fit(X=x_train, y=d_train)
        m_0_hat = model_m.predict_proba(x_eval)[:, 1]

        # Doubly robust score on the held-out fold
        scores = (
            g_0_hat[1] - g_0_hat[0]
            + d_eval*(y_eval-g_0_hat[1])/m_0_hat
            - (1-d_eval)*(y_eval-g_0_hat[0])/(1-m_0_hat)
        )
        theta_0_check_list.append(np.mean(scores))
        scores_list.append(scores)

        # Regression and IPW estimators from the same nuisance fits
        if classical:
            reg_check_list.append(np.mean(g_0_hat[1] - g_0_hat[0]))
            ipw_check_list.append(np.mean(d_eval*y_eval/m_0_hat - (1-d_eval)*y_eval/(1-m_0_hat)))

    # Final estimators: average of the fold-level estimates
    theta_0_hat = np.mean(theta_0_check_list)
    if classical:
        reg_hat, ipw_hat = np.mean(reg_check_list), np.mean(ipw_check_list)

    # Inference: estimate variance and construct confidence interval
    if inference:
        # Pool the per-fold scores into one flat vector. Folds may differ in
        # length when N is not a multiple of K, so stacking them with np.array
        # would build a ragged array and fail.
        all_scores = np.concatenate(scores_list)
        sigma_hat = np.sqrt(np.mean((all_scores - theta_0_hat)**2))
        quantile = norm.ppf(1-alpha/2)
        half_width = quantile*sigma_hat/np.sqrt(N)
        CI = np.array([theta_0_hat-half_width, theta_0_hat+half_width])

    # Return results
    estimate = np.array([theta_0_hat, reg_hat, ipw_hat]) if classical else theta_0_hat
    if inference:
        return estimate, sigma_hat, CI
    return estimate

In [53]:
%%time
model_g = [model_g0, model_g1]
dml_ate(y_data, d_data, x_data, x_quad_data, model_g, model_m)

CPU times: total: 8.69 s
Wall time: 3.6 s


(array([0.62729263, 0.56263539, 0.25905684]),
 2.760932977028119,
 array([0.45617138, 0.79841388]))