In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

https://www.tandfonline.com/doi/abs/10.1080/01621459.1999.10473858

https://www.kaggle.com/datasets/samuelzakouri/lalonde?resource=download

https://www.kaggle.com/code/taiyoutsuhara/eda-and-ps-ipw-example-with-lalonde

https://paperswithcode.com/dataset/ihdp

https://arxiv.org/pdf/1606.03976

# Import the LaLonde dataset and initial exploration

In [2]:
# Load the LaLonde data from a CSV expected in the current working directory.
# NOTE(review): the CSV's unnamed leading column appears to be read as a regular
# column (displayed as `Unnamed: 0`), so `treat` is at position 1 — confirm.
df = pd.read_csv('lalonde.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,treat,age,educ,black,hispan,married,nodegree,re74,re75,re78
0,1,1,37,11,1,0,1,1,0.0,0.0,9930.046
1,2,1,22,9,0,1,0,1,0.0,0.0,3595.894
2,3,1,30,12,1,0,0,0,0.0,0.0,24909.45
3,4,1,27,11,1,0,0,1,0.0,0.0,7506.146
4,5,1,33,8,1,0,0,1,0.0,0.0,289.7899


In [3]:
# Outcome column and treatment-indicator column.
y_data = df['re78']
d_data = df['treat']

# Select the covariates by name rather than by position (`iloc[:, 2:10]`), so the
# selection cannot silently shift if the CSV gains or loses a leading index column.
covariate_cols = ['age', 'educ', 'black', 'hispan', 'married', 'nodegree', 're74', 're75']
x_data = df[covariate_cols]
x_data.head()

Unnamed: 0,age,educ,black,hispan,married,nodegree,re74,re75
0,37,11,1,0,1,1,0.0,0.0
1,22,9,0,1,0,1,0.0,0.0
2,30,12,1,0,0,0,0.0,0.0
3,27,11,1,0,0,1,0.0,0.0
4,33,8,1,0,0,1,0.0,0.0


The difference-in-means estimator is misleading here because the data are observational (treatment was not randomized). For example, treatment status is strongly associated with the `black` covariate:

In [4]:
# Cross-tabulate the `black` covariate (rows) against treatment status (columns).
contingency_table = pd.crosstab(index=df['black'], columns=df['treat'])
contingency_table

treat,0,1
black,Unnamed: 1_level_1,Unnamed: 2_level_1
0,342,29
1,87,156


In [5]:
# Difference-in-means estimator: mean outcome of the treated minus mean outcome of the controls.
treated_mean = y_data[d_data == 1].mean()
control_mean = y_data[d_data == 0].mean()
treated_mean - control_mean

-635.0262120374209

# Prepare data for DML estimation

In [6]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [7]:
# Convert the covariates to a NumPy array. np.asarray is used instead of `.values`
# so the cell stays re-runnable: on a second execution `x_data` is already an
# ndarray (which has no `.values` attribute) and np.asarray simply returns it.
x_data = np.asarray(x_data)

# Standardised first-order covariates.
scaler = StandardScaler()
x_data_stand = scaler.fit_transform(x_data)

# Degree-2 polynomial expansion (squares and pairwise products, no bias column),
# standardised afterwards with a freshly fitted scaler.
poly_features = PolynomialFeatures(degree=2, include_bias=False)
x_data_quad = poly_features.fit_transform(x_data)
scaler = StandardScaler()
x_data_quad_stand = scaler.fit_transform(x_data_quad)

In [8]:
# Import the local `dml_algorithm` module and reload it, so that edits made to the
# module file are picked up without restarting the kernel.
import importlib
import dml_algorithm  
importlib.reload(dml_algorithm)

<module 'dml_algorithm' from 'C:\\Users\\henry\\MA_CausalML\\Real dataset\\dml_algorithm.py'>

In [9]:
import xgboost as xgb
from sklearn.linear_model import ElasticNet, ElasticNetCV, LogisticRegression, LogisticRegressionCV
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.svm import SVR, SVC
from sklearn.model_selection import GridSearchCV

## Elastic net

In [10]:
def eln_cv(y_data, d_data, x_data, cv=5):
    """Select elastic-net hyperparameters for the nuisance models by cross-validation.

    An ``ElasticNetCV`` outcome regression is tuned separately on the rows with
    ``d_data == 0`` and on the rows with ``d_data == 1``. An elastic-net penalised
    ``LogisticRegressionCV`` (scored with the negative Brier score) is tuned on all
    rows to predict ``d_data`` from ``x_data``.

    Parameters
    ----------
    y_data : array-like
        Outcome values, one per row of ``x_data``.
    d_data : array-like
        Treatment indicator taking the values 0 and 1.
    x_data : array-like
        Feature matrix; must support boolean-mask row selection.
    cv : int or cross-validation splitter, default=5
        Forwarded to both CV estimators. Added so this helper has the same
        signature as the other ``*_cv`` tuning helpers in this notebook; 5 folds is
        the scikit-learn default (since 0.22), so existing calls behave as before.

    Returns
    -------
    dict
        ``{'g0': {'alpha', 'l1_ratio'}, 'g1': {'alpha', 'l1_ratio'},
        'm': {'C', 'l1_ratio'}}`` holding the selected hyperparameters.
    """
    eln_model_g = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], n_alphas=100, max_iter=10000,
                               cv=cv, n_jobs=-1)
    eln_model_m = LogisticRegressionCV(Cs=25, l1_ratios=[0, .1, .3, .5, .7, .9, .95, .99, 1],
                                       cv=cv, penalty='elasticnet', solver='saga', max_iter=50000,
                                       random_state=42, scoring='neg_brier_score', n_jobs=-1)

    eln_params_dict = {}
    for d in [0, 1]:
        # The same CV estimator is refitted per treatment arm; the selected values
        # are copied out before the next fit overwrites them.
        arm_mask = d_data == d
        eln_model_g.fit(X=x_data[arm_mask], y=y_data[arm_mask])
        eln_params_dict[f'g{d}'] = {'alpha': eln_model_g.alpha_, 'l1_ratio': eln_model_g.l1_ratio_}

    eln_model_m.fit(X=x_data, y=d_data)
    # C_ and l1_ratio_ are indexed with [0] to take the first (only, for a binary
    # target) entry.
    eln_params_dict['m'] = {'C': eln_model_m.C_[0], 'l1_ratio': eln_model_m.l1_ratio_[0]}

    return eln_params_dict

In [11]:
# Tune the elastic-net nuisance models on the standardised quadratic features.
eln_params_dict = eln_cv(y_data=y_data, d_data=d_data, x_data=x_data_quad_stand)
eln_params_dict

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


{'g0': {'alpha': 22.70517410136869, 'l1_ratio': 0.95},
 'g1': {'alpha': 1673.5157233722396, 'l1_ratio': 0.1},
 'm': {'C': 0.021544346900318822, 'l1_ratio': 1}}

In [12]:
def get_eln_models(eln_params_dict):
    """Instantiate unfitted elastic-net nuisance models from tuned hyperparameters.

    Parameters
    ----------
    eln_params_dict : dict
        Mapping with keys ``'g0'``, ``'g1'`` and ``'m'``, each holding a dict of
        keyword parameters applied through ``set_params``.

    Returns
    -------
    tuple
        ``(model_g, model_m)`` where ``model_g`` is a two-element list of
        ``ElasticNet`` regressors (index 0 for ``'g0'``, index 1 for ``'g1'``) and
        ``model_m`` is an elastic-net penalised ``LogisticRegression``.
    """
    outcome_models = [
        ElasticNet(max_iter=10000).set_params(**eln_params_dict[f'g{arm}'])
        for arm in (0, 1)
    ]
    propensity_model = LogisticRegression(
        penalty='elasticnet', solver='saga', max_iter=50000, random_state=42
    ).set_params(**eln_params_dict['m'])
    return outcome_models, propensity_model

In [13]:
# Build the tuned elastic-net models and pass them, with the standardised quadratic
# features, to the ATE routine of the local `dml_algorithm` module.
# NOTE(review): the returned tuple looks like (point estimate, variance-like
# quantity, confidence interval) — confirm against dml_algorithm.dml_parallel_ate.
eln_model_g, eln_model_m = get_eln_models(eln_params_dict)
dml_algorithm.dml_parallel_ate(y_data, d_data, x_data_quad_stand, eln_model_g, eln_model_m)

(217.98461640830845, 15019.355493782421, array([-970.01199586, 1405.98122868]))

## Support vector machine

In [36]:
def svm_cv(y_data, d_data, x_data, cv=5):
    """Grid-search RBF support-vector hyperparameters for the nuisance models.

    An ``SVR`` outcome regression is tuned (negative MSE) separately on the rows
    with ``d_data == 0`` and ``d_data == 1``; an ``SVC`` with probability
    estimates is tuned (negative Brier score) on all rows to predict ``d_data``.

    Parameters
    ----------
    y_data : array-like
        Outcome values, one per row of ``x_data``.
    d_data : array-like
        Treatment indicator taking the values 0 and 1.
    x_data : array-like
        Feature matrix; must support boolean-mask row selection.
    cv : int or cross-validation splitter, default=5
        Forwarded to ``GridSearchCV``.

    Returns
    -------
    dict
        Best parameter dicts under the keys ``'g0'``, ``'g1'`` and ``'m'``.
    """
    classifier_grid = {
        'kernel': ['rbf'],
        'C': [0.1, 0.3, 1, 3, 10, 30, 100],
        'gamma': [0.01, 0.03, 0.1, 0.3, 1],
    }
    # The regressor searches the same grid plus the epsilon-tube width.
    regressor_grid = {**classifier_grid, 'epsilon': [0.001, 0.003, 0.01, 0.03, 0.1]}

    outcome_search = GridSearchCV(estimator=SVR(), param_grid=regressor_grid, cv=cv, n_jobs=-1,
                                  scoring='neg_mean_squared_error')
    propensity_search = GridSearchCV(estimator=SVC(probability=True, random_state=42),
                                     param_grid=classifier_grid, cv=cv, n_jobs=-1,
                                     scoring='neg_brier_score')

    best_params = {}
    for arm in (0, 1):
        in_arm = d_data == arm
        outcome_search.fit(X=x_data[in_arm], y=y_data[in_arm])
        best_params[f'g{arm}'] = outcome_search.best_params_

    propensity_search.fit(X=x_data, y=d_data)
    best_params['m'] = propensity_search.best_params_
    return best_params

In [37]:
# Tune the support-vector nuisance models on the standardised first-order features.
svm_params_dict = svm_cv(y_data=y_data, d_data=d_data, x_data=x_data_stand)
svm_params_dict

{'g0': {'C': 100, 'epsilon': 0.1, 'gamma': 0.03, 'kernel': 'rbf'},
 'g1': {'C': 100, 'epsilon': 0.1, 'gamma': 1, 'kernel': 'rbf'},
 'm': {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}}

In [38]:
def get_svm_models(svm_params_dict):
    """Instantiate unfitted support-vector nuisance models from tuned hyperparameters.

    Parameters
    ----------
    svm_params_dict : dict
        Mapping with keys ``'g0'``, ``'g1'`` and ``'m'``, each holding a dict of
        keyword parameters applied through ``set_params``.

    Returns
    -------
    tuple
        ``(model_g, model_m)``: a two-element list of ``SVR`` regressors (index 0
        for ``'g0'``, index 1 for ``'g1'``) and an ``SVC`` with probability
        estimates enabled.
    """
    outcome_models = [SVR().set_params(**svm_params_dict[f'g{arm}']) for arm in (0, 1)]
    propensity_model = SVC(probability=True, random_state=42).set_params(**svm_params_dict['m'])
    return outcome_models, propensity_model

In [39]:
# Build the tuned support-vector models and pass them, with the standardised
# first-order features, to the ATE routine of the local `dml_algorithm` module.
# NOTE(review): the returned tuple looks like (point estimate, variance-like
# quantity, confidence interval) — confirm against dml_algorithm.dml_parallel_ate.
svm_model_g, svm_model_m = get_svm_models(svm_params_dict)
dml_algorithm.dml_parallel_ate(y_data, d_data, x_data_stand, svm_model_g, svm_model_m)

(1496.9218657886418, 28979.39828924096, array([-795.28215106, 3789.12588263]))

## Random forest

## XGBoost

In [17]:
def xgb_cv(y_data, d_data, x_data, cv=5):
    """Grid-search XGBoost hyperparameters for the nuisance models.

    An ``XGBRegressor`` outcome model is tuned (negative MSE) separately on the
    rows with ``d_data == 0`` and ``d_data == 1``; an ``XGBClassifier`` is tuned
    (negative Brier score) on all rows to predict ``d_data``. Both searches use the
    same parameter grid.

    Parameters
    ----------
    y_data : array-like
        Outcome values, one per row of ``x_data``.
    d_data : array-like
        Treatment indicator taking the values 0 and 1.
    x_data : array-like
        Feature matrix; must support boolean-mask row selection.
    cv : int or cross-validation splitter, default=5
        Forwarded to ``GridSearchCV``.

    Returns
    -------
    dict
        Best parameter dicts under the keys ``'g0'``, ``'g1'`` and ``'m'``.
    """
    # `random_state` is the documented seeding parameter of the scikit-learn
    # wrapper; it replaces the legacy `seed` keyword with the same value.
    xgb_model_g = xgb.XGBRegressor(objective='reg:squarederror', random_state=0)
    xgb_model_m = xgb.XGBClassifier(objective='binary:logistic', random_state=0)

    param_grid = {
        'n_estimators': [50, 100],
        'max_depth': [1, 2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.7, 1.0],
        'learning_rate': [0.05, 0.1, 0.2],
        'reg_lambda': [0.1, 1, 10, 100],
        'reg_alpha': [0.1, 1, 10]
    }

    grid_search_g = GridSearchCV(estimator=xgb_model_g, param_grid=param_grid, cv=cv, n_jobs=-1,
                                 scoring='neg_mean_squared_error')
    grid_search_m = GridSearchCV(estimator=xgb_model_m, param_grid=param_grid, cv=cv, n_jobs=-1,
                                 scoring='neg_brier_score')

    xgb_params_dict = {}
    for d in [0, 1]:
        # One search object is refitted per treatment arm; best_params_ is read
        # before the next fit replaces it.
        arm_mask = d_data == d
        grid_search_g.fit(X=x_data[arm_mask], y=y_data[arm_mask])
        xgb_params_dict[f'g{d}'] = grid_search_g.best_params_
    grid_search_m.fit(X=x_data, y=d_data)
    xgb_params_dict['m'] = grid_search_m.best_params_

    return xgb_params_dict

In [18]:
# Tune the XGBoost nuisance models on the unscaled first-order features.
xgb_params_dict = xgb_cv(y_data=y_data, d_data=d_data, x_data=x_data)
xgb_params_dict

{'g0': {'colsample_bytree': 0.7,
  'learning_rate': 0.05,
  'max_depth': 2,
  'n_estimators': 50,
  'reg_alpha': 0.1,
  'reg_lambda': 100,
  'subsample': 1.0},
 'g1': {'colsample_bytree': 1.0,
  'learning_rate': 0.05,
  'max_depth': 2,
  'n_estimators': 100,
  'reg_alpha': 10,
  'reg_lambda': 100,
  'subsample': 0.6},
 'm': {'colsample_bytree': 0.7,
  'learning_rate': 0.2,
  'max_depth': 1,
  'n_estimators': 100,
  'reg_alpha': 0.1,
  'reg_lambda': 0.1,
  'subsample': 1.0}}

In [19]:
def get_xgb_models(xgb_params_dict):
    """Instantiate unfitted XGBoost nuisance models from tuned hyperparameters.

    Parameters
    ----------
    xgb_params_dict : dict
        Mapping with keys ``'g0'``, ``'g1'`` and ``'m'``, each holding a dict of
        keyword parameters applied through ``set_params``.

    Returns
    -------
    tuple
        ``(model_g, model_m)``: a two-element list of ``XGBRegressor`` models
        (index 0 for ``'g0'``, index 1 for ``'g1'``) and an ``XGBClassifier``.
    """
    model_g = []
    for d in [0, 1]:
        # `random_state` is the documented seeding parameter of the scikit-learn
        # wrapper; it replaces the legacy `seed` keyword with the same value.
        model = xgb.XGBRegressor(objective='reg:squarederror', random_state=0)
        model.set_params(**xgb_params_dict[f'g{d}'])
        model_g.append(model)
    model_m = xgb.XGBClassifier(objective='binary:logistic', random_state=0)
    model_m.set_params(**xgb_params_dict['m'])
    return model_g, model_m

In [20]:
# Build the tuned XGBoost models and pass them, with the unscaled first-order
# features, to the ATE routine of the local `dml_algorithm` module.
# NOTE(review): the returned tuple looks like (point estimate, variance-like
# quantity, confidence interval) — confirm against dml_algorithm.dml_parallel_ate.
xgb_model_g, xgb_model_m = get_xgb_models(xgb_params_dict)
dml_algorithm.dml_parallel_ate(y_data, d_data, x_data, xgb_model_g, xgb_model_m)

(257.7791161890276,
 19291.598808727198,
 array([-1268.14215517,  1783.70038755]))

## Multilayer perceptron

In [23]:
def mlp_cv(y_data, d_data, x_data, cv=5):
    """Grid-search multilayer-perceptron hyperparameters for the nuisance models.

    An ``MLPRegressor`` outcome model is tuned (negative MSE) separately on the
    rows with ``d_data == 0`` and ``d_data == 1``; an ``MLPClassifier`` is tuned
    (negative Brier score) on all rows to predict ``d_data``. Both networks use
    hidden layers of sizes (8, 4).

    Parameters
    ----------
    y_data : array-like
        Outcome values, one per row of ``x_data``.
    d_data : array-like
        Treatment indicator taking the values 0 and 1.
    x_data : array-like
        Feature matrix; must support boolean-mask row selection.
    cv : int or cross-validation splitter, default=5
        Forwarded to ``GridSearchCV``.

    Returns
    -------
    dict
        Best parameter dicts under the keys ``'g0'``, ``'g1'`` and ``'m'``.
    """
    # Architecture and stopping settings shared by the regressor and the classifier.
    fixed_settings = dict(hidden_layer_sizes=(8, 4), tol=0.0005, n_iter_no_change=5, random_state=42)

    outcome_search = GridSearchCV(
        estimator=MLPRegressor(**fixed_settings),
        param_grid={
            'alpha': [0.05, 0.1, 0.25],
            'batch_size': [2, 4],
            'max_iter': [50, 75, 100],
        },
        cv=cv, n_jobs=-1, scoring='neg_mean_squared_error',
    )
    propensity_search = GridSearchCV(
        estimator=MLPClassifier(**fixed_settings),
        param_grid={
            'alpha': [0.05, 0.1, 0.25],
            'batch_size': [4, 8, 16],
            'max_iter': [25, 50],
        },
        cv=cv, n_jobs=-1, scoring='neg_brier_score',
    )

    best_params = {}
    for arm in (0, 1):
        in_arm = d_data == arm
        outcome_search.fit(X=x_data[in_arm], y=y_data[in_arm])
        best_params[f'g{arm}'] = outcome_search.best_params_

    propensity_search.fit(X=x_data, y=d_data)
    best_params['m'] = propensity_search.best_params_
    return best_params

In [24]:
# Tune the multilayer-perceptron nuisance models on the standardised first-order features.
mlp_params_dict = mlp_cv(y_data=y_data, d_data=d_data, x_data=x_data_stand)
mlp_params_dict



{'g0': {'alpha': 0.25, 'batch_size': 2, 'max_iter': 100},
 'g1': {'alpha': 0.1, 'batch_size': 2, 'max_iter': 100},
 'm': {'alpha': 0.05, 'batch_size': 8, 'max_iter': 50}}

In [25]:
def get_mlp_models(mlp_params_dict):
    """Instantiate unfitted multilayer-perceptron nuisance models from tuned hyperparameters.

    Parameters
    ----------
    mlp_params_dict : dict
        Mapping with keys ``'g0'``, ``'g1'`` and ``'m'``, each holding a dict of
        keyword parameters applied through ``set_params``.

    Returns
    -------
    tuple
        ``(model_g, model_m)``: a two-element list of ``MLPRegressor`` models
        (index 0 for ``'g0'``, index 1 for ``'g1'``) and an ``MLPClassifier``,
        all with hidden layers of sizes (8, 4).
    """
    # Architecture and stopping settings shared by the regressors and the classifier.
    fixed_settings = dict(hidden_layer_sizes=(8, 4), tol=0.0005, n_iter_no_change=5, random_state=42)

    outcome_models = [
        MLPRegressor(**fixed_settings).set_params(**mlp_params_dict[f'g{arm}'])
        for arm in (0, 1)
    ]
    propensity_model = MLPClassifier(**fixed_settings).set_params(**mlp_params_dict['m'])
    return outcome_models, propensity_model

In [26]:
# Build the tuned multilayer-perceptron models and pass them, with the standardised
# first-order features, to the ATE routine of the local `dml_algorithm` module.
# NOTE(review): the returned tuple looks like (point estimate, variance-like
# quantity, confidence interval) — confirm against dml_algorithm.dml_parallel_ate.
mlp_model_g, mlp_model_m = get_mlp_models(mlp_params_dict)
dml_algorithm.dml_parallel_ate(y_data, d_data, x_data_stand, mlp_model_g, mlp_model_m)

(-93.11604019540655,
 24024.352038916786,
 array([-1993.38724997,  1807.15516958]))

## Hybrid