In [7]:
import pickle

import numpy as np
from scipy.stats import norm
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

from data_generation import get_data
from data_generation import g_0, m_0
from dml_algorithm import mm_ate, dml_ate


# Read the tuned ElasticNet hyperparameters back from disk.
# NOTE: unpickling can execute arbitrary code, so only load a file you produced yourself.
with open('opt_params_eln.pkl', 'rb') as fh:
    opt_params_eln = pickle.load(fh)

In [8]:
# Get ElasticNet models from hyperparameters
def get_models(eln_params_dict):
    """Create unfitted estimators configured from a hyperparameter mapping.

    Parameters
    ----------
    eln_params_dict : mapping
        Must contain the keys 'g0', 'g1' and 'm'. Each value is a dict of
        keyword arguments that is forwarded to ``set_params`` of the
        corresponding estimator.

    Returns
    -------
    tuple
        ``(model_g, model_m)`` where ``model_g`` is the list
        ``[ElasticNet for 'g0', ElasticNet for 'g1']`` and ``model_m`` is a
        LogisticRegression with elastic-net penalty configured from 'm'.
    """
    # set_params returns the estimator itself, so construction and
    # configuration can be chained inside the comprehension.
    outcome_models = [
        ElasticNet(max_iter=10000).set_params(**eln_params_dict[f'g{arm}'])
        for arm in (0, 1)
    ]

    classifier = LogisticRegression(
        penalty='elasticnet',
        solver='saga',
        max_iter=50000,
        random_state=42,
    )
    classifier.set_params(**eln_params_dict['m'])

    return outcome_models, classifier

In [15]:
N = 250
rng = np.random.default_rng(seed=123)

# Draw 1376 datasets in sequence from the seeded generator. Every draw
# overwrites the previous one, so only the last dataset (j == 1375) survives
# the loop; the earlier draws just advance the generator state.
for j in range(1376):
    y_data, d_data, x_data = get_data(N, rng)

# Degree-2 polynomial expansion (no bias column) followed by standardization
poly_features = PolynomialFeatures(degree=2, include_bias=False)
x_data_quad = poly_features.fit_transform(x_data)
scaler = StandardScaler()
x_data_quad_stand = scaler.fit_transform(x_data_quad)

print(mm_ate(y_data, d_data, x_data))

# j still holds 1375 here, i.e. the hyperparameters stored for this dataset index
model_g, model_m = get_models(opt_params_eln[N][j])
print(dml_ate(y_data, d_data, [x_data, x_data_quad_stand], model_g, model_m))

0.5102734718726487
(array([2.78610765e+01, 1.77168079e-01, 2.39246404e+02]), 437.2282979421288, array([-26.33730886,  82.05946195]), array([0.60239927, 1.07621023, 0.15736382]))


In [16]:
from data_generation import m_0

In [20]:
# Smallest value returned by m_0 when evaluated on x_data
m_0(x_data).min()

0.008039349330890478

In [51]:
# Largest class-1 probability that model_m predicts on the standardized quadratic features
model_m.predict_proba(x_data_quad_stand)[:,1].max()

0.9989017547763444

In [24]:
# Inputs consumed by the step-by-step cross-fitting cell
K = 5               # number of folds passed to StratifiedKFold
alpha = 0.05        # the confidence interval uses the normal quantile at 1 - alpha/2
classical = True    # also compute the regression and IPW estimators
errors = True       # also compute the RMSEs of the fitted models

# [original covariates, standardized quadratic features]
x_data_all = [x_data, x_data_quad_stand]

# Fresh estimators built from the hyperparameters stored under [250][1375]
model_g, model_m = get_models(opt_params_eln[250][1375])

In [48]:
# Step-by-step cross-fitting run at cell level, so that per-fold intermediates
# stay available in the notebook namespace for inspection afterwards.
#
# Reads from the notebook namespace:
#   x_data_all, y_data, d_data, K, model_g, model_m, alpha, classical, errors
# Leaves behind (among others):
#   theta_0_check_list, scores_list, theta_0_hat, sigma_hat, CI,
#   reg_hat / ipw_hat (if classical), rmse (if errors),
#   m_0_hat_stored, d_eval_stored, x_eval_stored, score_stored (from DEBUG_FOLD)

DEBUG_FOLD = 2                          # fold whose intermediates are stashed for inspection
PS_CLIP_MIN, PS_CLIP_MAX = 0.01, 0.99   # bounds applied to the denominators of the score

# x_data_all is either a list whose first two entries are the original
# covariates and the features used for fitting, or a single array used for both.
# The fitting features get their own name (x_data_ml) so that the notebook-level
# `x_data` is NOT overwritten; otherwise re-running an earlier cell that reads
# `x_data` would silently pick up the engineered features instead.
if isinstance(x_data_all, list):
    x_data_orig, x_data_ml = x_data_all[0], x_data_all[1]
else:
    x_data_orig, x_data_ml = x_data_all, x_data_all

# Partition the data for cross-fitting (folds stratified on d_data, no shuffling)
skf = StratifiedKFold(n_splits=K, shuffle=False)

# Per-fold accumulators
theta_0_check_list = []
scores_list = []
if classical:
    reg_check_list, ipw_check_list = [], []
if errors:
    rmse_list = []

for i, (train_indices, eval_indices) in enumerate(skf.split(X=x_data_ml, y=d_data)):
    y_train, d_train, x_train = y_data[train_indices], d_data[train_indices], x_data_ml[train_indices]
    y_eval, d_eval, x_eval = y_data[eval_indices], d_data[eval_indices], x_data_ml[eval_indices]
    x_eval_orig = x_data_orig[eval_indices]

    # Estimate outcome regression functions g_0(d): one model per value of d,
    # each fitted only on the training rows with that value of d.
    # Note that this refits the estimators in model_g in place.
    g_0_hat = []
    for d in [0, 1]:
        model_g[d].fit(X=x_train[d_train==d], y=y_train[d_train==d])
        g_0_hat.append(model_g[d].predict(x_eval))

    # Estimate propensity score m_0 (probability of class index 1)
    model_m.fit(X=x_train, y=d_train)
    m_0_hat = model_m.predict_proba(x_eval)[:,1]

    # Compute auxiliary estimator; the denominators are clipped so that
    # extreme predicted probabilities cannot blow up individual scores.
    scores = (
        g_0_hat[1] - g_0_hat[0]
        + d_eval*(y_eval-g_0_hat[1])/np.clip(m_0_hat, PS_CLIP_MIN, PS_CLIP_MAX)
        - (1-d_eval)*(y_eval-g_0_hat[0])/np.clip(1-m_0_hat, PS_CLIP_MIN, PS_CLIP_MAX)
    )
    theta_0_check_list.append(np.mean(scores))

    # Keep the intermediates of one fold for inspection in later cells
    if i == DEBUG_FOLD:
        m_0_hat_stored = m_0_hat
        d_eval_stored = d_eval
        x_eval_stored = x_eval_orig
        score_stored = scores

    # For variance estimation
    scores_list.append(scores)

    # For regression & IPW estimators
    # NOTE(review): unlike the score above, the IPW term divides by the
    # unclipped m_0_hat; confirm whether that difference is intended.
    if classical:
        reg_check_list.append(np.mean(g_0_hat[1] - g_0_hat[0]))
        ipw_check_list.append(np.mean(d_eval*y_eval/m_0_hat - (1-d_eval)*y_eval/(1-m_0_hat)))

    # Assess RMSE of ML models on evaluation set, measured against g_0 / m_0
    # from data_generation evaluated on the original covariates
    if errors:
        rmse_g0 = root_mean_squared_error(g_0(0, x_eval_orig), g_0_hat[0])
        rmse_g1 = root_mean_squared_error(g_0(1, x_eval_orig), g_0_hat[1])
        rmse_m = root_mean_squared_error(m_0(x_eval_orig), m_0_hat)
        rmse_list.append([rmse_g0, rmse_g1, rmse_m])

# Compute final estimator: average of the per-fold estimates
theta_0_hat = np.mean(theta_0_check_list)
if classical:
    reg_hat, ipw_hat = np.mean(reg_check_list), np.mean(ipw_check_list)

# Inference: estimate standard deviation and construct confidence interval.
# The per-fold score vectors are concatenated rather than stacked, because
# stacking fails (ragged array) whenever the folds differ in size.
sigma_hat = np.sqrt(np.mean((np.concatenate(scores_list)-theta_0_hat)**2))
N = len(y_data)
quantile = norm.ppf(1-alpha/2)
CI = np.array([theta_0_hat-quantile*sigma_hat/np.sqrt(N), theta_0_hat+quantile*sigma_hat/np.sqrt(N)])

# Average RMSEs across folds (order: g_0(0), g_0(1), m_0)
if errors:
    rmse = np.mean(rmse_list, axis=0)

In [39]:
# Score at position 23 of the evaluation fold stashed during cross-fitting (fold i == 2)
score_stored[23]

6927.042516163822

In [42]:
# Value of d for the same observation (position 23 of the stashed fold)
d_eval_stored[23]

1.0

In [47]:
# m_0 from data_generation evaluated at the original covariates of that observation
m_0(x_eval_stored[23])

array([0.12315923])

In [40]:
# Class-1 probability predicted by model_m for that observation, to compare with m_0 above
m_0_hat_stored[23]

0.00011932991457565601

In [49]:
# Per-fold means of the scores, one entry per cross-fitting fold
theta_0_check_list

[-0.09754162961176263,
 -0.0026415639213525567,
 1.5078662928177138,
 0.888006076261521,
 0.19667944063999301]

In [50]:
# Average of the per-fold estimates (the same quantity the cross-fitting cell stores as theta_0_hat)
np.mean(theta_0_check_list)

0.49847372323722244

In [26]:
import numpy as np
from joblib import Parallel, delayed
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import root_mean_squared_error
from sklearn.base import is_regressor
from scipy.stats import norm
from data_generation import g_0, m_0