In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import pickle
import matplotlib.pyplot as plt

from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.svm import SVR, SVC

https://www.tandfonline.com/doi/abs/10.1080/01621459.1999.10473858

https://www.kaggle.com/datasets/samuelzakouri/lalonde?resource=download

https://www.kaggle.com/code/taiyoutsuhara/eda-and-ps-ipw-example-with-lalonde

https://paperswithcode.com/dataset/ihdp

https://arxiv.org/pdf/1606.03976

# Import Lalonde dataset and first exploration

In [2]:
# Load the Lalonde data from the CSV file in the working directory.
# The file carries a stray 'Unnamed: 0' column (visible in the preview below),
# so later positional column indexing is offset by one.
df = pd.read_csv('lalonde.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,treat,age,educ,black,hispan,married,nodegree,re74,re75,re78
0,1,1,37,11,1,0,1,1,0.0,0.0,9930.046
1,2,1,22,9,0,1,0,1,0.0,0.0,3595.894
2,3,1,30,12,1,0,0,0,0.0,0.0,24909.45
3,4,1,27,11,1,0,0,1,0.0,0.0,7506.146
4,5,1,33,8,1,0,0,1,0.0,0.0,289.7899


In [3]:
# Optional reproducible row shuffle, currently disabled.
# NOTE(review): the first rows shown all have treat == 1; if the file is ordered by
# treatment and dml_algorithm splits folds without shuffling, this should be
# enabled -- confirm against dml_algorithm.
#df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0.1,Unnamed: 0,treat,age,educ,black,hispan,married,nodegree,re74,re75,re78
0,1,1,37,11,1,0,1,1,0.0,0.0,9930.046
1,2,1,22,9,0,1,0,1,0.0,0.0,3595.894
2,3,1,30,12,1,0,0,0,0.0,0.0,24909.45
3,4,1,27,11,1,0,0,1,0.0,0.0,7506.146
4,5,1,33,8,1,0,0,1,0.0,0.0,289.7899


In [4]:
# Outcome (`re78`) and binary treatment indicator (`treat`).
y_data = df['re78']
d_data = df['treat']

# Select the covariates by name instead of by position: the positional slice 2:8
# only hit these columns because of the stray 'Unnamed: 0' column in the CSV and
# would silently pick different columns if the file layout changed.
# NOTE(review): `re74` and `re75` are not part of the covariate set (same as the
# original positional slice) -- confirm this is intended.
covariate_cols = ['age', 'educ', 'black', 'hispan', 'married', 'nodegree']
x_data = df[covariate_cols]
x_data.head()

Unnamed: 0,age,educ,black,hispan,married,nodegree
0,37,11,1,0,1,1
1,22,9,0,1,0,1
2,30,12,1,0,0,0
3,27,11,1,0,0,1
4,33,8,1,0,0,1


The difference-in-means estimator is misleading here because the data are observational (treatment was not randomized). For example, treatment status is strongly associated with the covariate `black`:

In [5]:
# Cross-tabulate the `black` indicator (rows) against treatment status (columns).
contingency_table = pd.crosstab(index=df['black'], columns=df['treat'])
contingency_table

treat,0,1
black,Unnamed: 1_level_1,Unnamed: 2_level_1
0,342,29
1,87,156


In [6]:
# Naive estimate: mean outcome where treat == 1 minus mean outcome where treat == 0.
y_data.loc[d_data == 1].mean() - y_data.loc[d_data == 0].mean()

-635.0262120374209

# Prepare data for DML estimation

In [7]:
# Convert the covariates to a plain NumPy array. np.asarray accepts both a
# DataFrame and an ndarray, so re-running this cell no longer fails with
# AttributeError (an ndarray has no `.values`).
x_data = np.asarray(x_data)

# Standardised first-order features.
scaler = StandardScaler()
x_data_stand = scaler.fit_transform(x_data)

# Degree-2 polynomial expansion without a bias column, built from the unscaled
# covariates and standardised afterwards.
poly_features = PolynomialFeatures(degree=2, include_bias=False)
x_data_quad = poly_features.fit_transform(x_data)
# `scaler` is rebound here: after this cell it refers to the scaler fitted on
# the quadratic features, not the first-order one.
scaler = StandardScaler()
x_data_quad_stand = scaler.fit_transform(x_data_quad)

In [8]:
# Import the project-local DML implementation and force a reload so that edits
# to dml_algorithm.py made while the kernel is running are picked up.
import importlib
import dml_algorithm  
importlib.reload(dml_algorithm)

<module 'dml_algorithm' from 'C:\\Users\\henry\\MA_CausalML\\Real dataset\\dml_algorithm.py'>

## Elastic net

In [9]:
# Load the tuned hyperparameters stored in opt_params.pkl.
# Security note: pickle.load can execute arbitrary code while deserialising;
# only load pickle files that come from a trusted source.
with open('opt_params.pkl', 'rb') as pickle_file:
    opt_params = pickle.load(pickle_file)

# Nested dict: learner name ('eln', 'mlp', 'svm', 'xgb') -> model key
# ('g0', 'g1', 'm') -> keyword parameters for that estimator.
opt_params

{'eln': {'g0': {'alpha': 6.382750873587745, 'l1_ratio': 0.9},
  'g1': {'alpha': 181.25818744352998, 'l1_ratio': 0.99},
  'm': {'C': 4.641588833612772, 'l1_ratio': 0.9}},
 'mlp': {'g0': {'alpha': 0.3,
   'batch_size': 2,
   'hidden_layer_sizes': (16, 8),
   'max_iter': 150},
  'g1': {'alpha': 1,
   'batch_size': 2,
   'hidden_layer_sizes': (24, 16),
   'max_iter': 150},
  'm': {'alpha': 0.03,
   'batch_size': 4,
   'hidden_layer_sizes': (16, 8),
   'max_iter': 50}},
 'svm': {'g0': {'C': 300, 'epsilon': 0.01, 'gamma': 0.01, 'kernel': 'linear'},
  'g1': {'C': 300, 'epsilon': 0.01, 'gamma': 1, 'kernel': 'rbf'},
  'm': {'C': 300, 'gamma': 0.01, 'kernel': 'rbf'}},
 'xgb': {'g0': {'colsample_bytree': 0.625,
   'learning_rate': 0.3,
   'max_depth': 1,
   'n_estimators': 25,
   'reg_alpha': 0.01,
   'reg_lambda': 100,
   'subsample': 0.6},
  'g1': {'colsample_bytree': 0.625,
   'learning_rate': 0.02,
   'max_depth': 1,
   'n_estimators': 100,
   'reg_alpha': 1,
   'reg_lambda': 1,
   'subsample

In [10]:
def get_eln_models(eln_params_dict):
    """Create the elastic-net estimators configured with tuned hyperparameters.

    Parameters
    ----------
    eln_params_dict : dict
        Mapping with the keys 'g0', 'g1' and 'm'. Each value is a dict of
        keyword parameters applied to the matching estimator via ``set_params``.

    Returns
    -------
    tuple
        ``(model_g, model_m)``: ``model_g`` is a list of two ``ElasticNet``
        regressors (for 'g0' and 'g1', in that order) and ``model_m`` is a
        ``LogisticRegression`` with elastic-net penalty configured from 'm'.
    """
    # set_params returns the estimator itself, so construction and
    # configuration can be chained.
    regressors = [
        ElasticNet(max_iter=10000).set_params(**eln_params_dict[key])
        for key in ('g0', 'g1')
    ]
    classifier = LogisticRegression(
        penalty='elasticnet', solver='saga', max_iter=50000, random_state=42
    ).set_params(**eln_params_dict['m'])
    return regressors, classifier

In [12]:
# Build the elastic-net estimators from the tuned parameters and run the DML
# estimator on the standardised quadratic features.
# NOTE(review): m_bounds=(0.01, 0.99) is presumably a clipping range for the
# classifier's predicted probabilities -- confirm in dml_algorithm.
eln_model_g, eln_model_m = get_eln_models(opt_params['eln'])
dml_algorithm.dml_ate_att(y_data, d_data, x_data_quad_stand, eln_model_g, eln_model_m, m_bounds=(0.01, 0.99))

([-83.42479754212309,
  42682.65474769145,
  array([-3459.52500797,  3292.67541288])],
 [2095.053572467804,
  29657.299146299207,
  array([-250.77084919, 4440.87799413])])

## Support vector machine

In [13]:
def get_svm_models(svm_params_dict):
    """Create the support-vector estimators configured with tuned hyperparameters.

    Parameters
    ----------
    svm_params_dict : dict
        Mapping with the keys 'g0', 'g1' and 'm'. Each value is a dict of
        keyword parameters applied to the matching estimator via ``set_params``.

    Returns
    -------
    tuple
        ``(model_g, model_m)``: ``model_g`` is a list of two ``SVR`` regressors
        (for 'g0' and 'g1', in that order) and ``model_m`` is an ``SVC`` with
        probability estimates enabled, configured from 'm'.
    """
    regressors = []
    for key in ('g0', 'g1'):
        regressor = SVR()
        regressor.set_params(**svm_params_dict[key])
        regressors.append(regressor)

    # probability=True so the classifier can return class probabilities.
    classifier = SVC(probability=True, random_state=42)
    classifier.set_params(**svm_params_dict['m'])
    return regressors, classifier

In [15]:
# Build the SVM estimators from the tuned parameters and run the DML estimator
# on the standardised first-order features.
svm_model_g, svm_model_m = get_svm_models(opt_params['svm'])
dml_algorithm.dml_ate_att(y_data, d_data, x_data_stand, svm_model_g, svm_model_m, m_bounds=(0.01, 0.99))

([1104.0040593055337,
  26635.367869832735,
  array([-1002.79252122,  3210.80063983])],
 [1317.8333917941186,
  21744.050327307876,
  array([-402.07117645, 3037.73796004])])

## Random forest

## XGBoost

In [16]:
def get_xgb_models(xgb_params_dict):
    """Create the XGBoost estimators configured with tuned hyperparameters.

    Parameters
    ----------
    xgb_params_dict : dict
        Mapping with the keys 'g0', 'g1' and 'm'. Each value is a dict of
        keyword parameters applied to the matching estimator via ``set_params``.

    Returns
    -------
    tuple
        ``(model_g, model_m)``: ``model_g`` is a list of two ``XGBRegressor``
        instances (for 'g0' and 'g1', in that order) and ``model_m`` is an
        ``XGBClassifier`` configured from 'm'.
    """
    # NOTE(review): the estimators are seeded through `seed=0`, while the other
    # learners in this notebook use `random_state`; confirm that `seed` is
    # honoured by the installed xgboost version.
    regressors = [
        xgb.XGBRegressor(objective='reg:squarederror', seed=0)
        .set_params(**xgb_params_dict[key])
        for key in ('g0', 'g1')
    ]
    classifier = xgb.XGBClassifier(objective='binary:logistic', seed=0)
    classifier.set_params(**xgb_params_dict['m'])
    return regressors, classifier

In [17]:
# Build the XGBoost estimators from the tuned parameters and run the DML
# estimator on the unscaled first-order covariates.
xgb_model_g, xgb_model_m = get_xgb_models(opt_params['xgb'])
dml_algorithm.dml_ate_att(y_data, d_data, x_data, xgb_model_g, xgb_model_m, m_bounds=(0.01, 0.99))

([323.7256148991961,
  22119.953497675593,
  array([-1425.91203298,  2073.36326278])],
 [1449.4539078180278,
  22230.95313207064,
  array([-308.96355686, 3207.87137249])])

## Multilayer perceptron

In [18]:
def get_mlp_models(mlp_params_dict):
    """Create the multilayer-perceptron estimators configured with tuned hyperparameters.

    Parameters
    ----------
    mlp_params_dict : dict
        Mapping with the keys 'g0', 'g1' and 'm'. Each value is a dict of
        keyword parameters applied to the matching estimator via ``set_params``.

    Returns
    -------
    tuple
        ``(model_g, model_m)``: ``model_g`` is a list of two ``MLPRegressor``
        instances (for 'g0' and 'g1', in that order) and ``model_m`` is an
        ``MLPClassifier`` configured from 'm'.
    """
    def _configured(estimator, key):
        # Apply the tuned parameters stored under `key` and hand the estimator back.
        estimator.set_params(**mlp_params_dict[key])
        return estimator

    regressors = [_configured(MLPRegressor(random_state=42), key) for key in ('g0', 'g1')]
    classifier = _configured(MLPClassifier(random_state=42), 'm')
    return regressors, classifier

In [19]:
# Build the MLP estimators from the tuned parameters and run the DML estimator
# on the standardised first-order features.
mlp_model_g, mlp_model_m = get_mlp_models(opt_params['mlp'])
dml_algorithm.dml_ate_att(y_data, d_data, x_data_stand, mlp_model_g, mlp_model_m, m_bounds=(0.01, 0.99))

([349.111310856916,
  24249.436605917683,
  array([-1568.96357253,  2267.18619425])],
 [1225.7843988711334,
  20144.181087585115,
  array([-367.5741776 , 2819.14297534])])

## Hybrid