In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import xgboost as xgb
import pickle

from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.svm import SVR, SVC

# Import dataset and preprocessing

Data description: http://qed.econ.queensu.ca/jae/2000-v15.6/bilias/readme.b.txt

Related notebook (Penn RCT analysis): https://colab.research.google.com/github/CausalAIBook/MetricsMLNotebooks/blob/main/CM1/python-rct-penn-precision-adj.ipynb

In [2]:
# Load penn_jae.dat; columns are separated by runs of whitespace.
# The separator is a regex, so it is written as a raw string: '\s' in a normal
# string literal is an invalid escape sequence and triggers a warning.
df = pd.read_csv("https://raw.githubusercontent.com/VC2015/DMLonGitHub/master/penn_jae.dat", sep=r'\s+')
df

Unnamed: 0,abdt,tg,inuidur1,inuidur2,female,black,hispanic,othrace,dep,q1,...,q5,q6,recall,agelt35,agegt54,durable,nondurable,lusd,husd,muld
0,10824,0,18,18,0,0,0,0,2,0,...,1,0,0,0,0,0,0,0,1,0
1,10635,2,7,3,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,10551,5,18,6,1,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
3,10824,0,1,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,10747,0,27,27,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13908,10831,5,27,27,0,0,0,0,0,0,...,1,0,1,0,1,1,0,0,1,0
13909,10677,2,4,4,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
13910,10817,4,4,4,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
13911,10691,0,27,27,0,0,0,0,0,0,...,0,0,0,0,1,1,0,1,0,0


In [3]:
# Outcome variable: log of 'inuidur1'. 'abdt' and the raw duration columns are dropped afterwards.
df = df.assign(log_dur=np.log(df['inuidur1'])).drop(columns=['abdt', 'inuidur1', 'inuidur2'])
df

Unnamed: 0,tg,female,black,hispanic,othrace,dep,q1,q2,q3,q4,...,q6,recall,agelt35,agegt54,durable,nondurable,lusd,husd,muld,log_dur
0,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,1,0,2.890372
1,2,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,1.945910
2,5,1,0,0,0,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,2.890372
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0.000000
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,3.295837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13908,5,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,0,0,1,0,3.295837
13909,2,1,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,1,0,0,1.386294
13910,4,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1.386294
13911,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,1,0,1,0,0,3.295837


In [4]:
# Sanity check: exactly one of the q1..q6 indicators is set in every row.
df[['q1', 'q2', 'q3', 'q4', 'q5', 'q6']].sum(axis=1).eq(1).all()

True

In [5]:
# Sanity check: the two age indicators are never both set.
df['agelt35'].add(df['agegt54']).le(1).all()

True

In [6]:
# Ordinal encoding (also to remove possible multicollinearity)
def get_quarter(row):
    """Return the 1-based position of the first entry of `row` equal to 1.

    Returns None when no entry equals 1.
    """
    return next((pos for pos, flag in enumerate(row, start=1) if flag == 1), None)

def get_age(row):
    """Map an indicator pair to an ordinal code.

    [1, 0] -> 0, [0, 0] -> 1, [0, 1] -> 2; any other pattern yields None.
    `row` must support element-wise comparison with a list (e.g. a pandas
    Series or a NumPy array).
    """
    patterns = ([1, 0], [0, 0], [0, 1])
    for code, pattern in enumerate(patterns):
        if (row == pattern).all():
            return code
    return None

# Replace the q1..q6 and age dummy columns by their ordinal encodings.
df = df.assign(
    quarter=df[['q1', 'q2', 'q3', 'q4', 'q5', 'q6']].apply(get_quarter, axis=1),
    age=df[['agelt35', 'agegt54']].apply(get_age, axis=1),
).drop(columns=['q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'agelt35', 'agegt54'])
df

Unnamed: 0,tg,female,black,hispanic,othrace,dep,recall,durable,nondurable,lusd,husd,muld,log_dur,quarter,age
0,0,0,0,0,0,2,0,0,0,0,1,0,2.890372,5,1
1,2,0,0,0,0,0,0,0,0,1,0,0,1.945910,3,0
2,5,1,0,0,0,0,1,0,0,0,0,0,2.890372,2,2
3,0,0,0,0,0,0,0,0,0,1,0,0,0.000000,5,1
4,0,0,0,0,0,0,0,0,0,1,0,0,3.295837,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13908,5,0,0,0,0,0,1,1,0,0,1,0,3.295837,5,2
13909,2,1,0,0,0,0,0,0,0,1,0,0,1.386294,4,2
13910,4,0,0,0,0,0,0,0,0,0,0,0,1.386294,5,2
13911,0,0,0,0,0,0,0,1,0,1,0,0,3.295837,4,2


Following Section 4.2 of https://www.jstor.org/stable/2678561?seq=11, we merge treatment groups 4 and 6 into a single treatment arm and compare it with group 0; all other groups are dropped.

In [7]:
# Number of observations in each treatment group (tg).
df.groupby('tg').size()

tg
0    3354
1    1385
2    2428
3    1885
4    1745
5    1831
6    1285
dtype: int64

In [8]:
# Keep groups 0, 4 and 6 only; groups 4 and 6 together form the treated arm (treatment == 1.0).
df = df[df['tg'].isin([0, 4, 6])].copy()
df = df.assign(treatment=df['tg'].isin([4, 6]).astype('float')).drop(columns='tg')
df

Unnamed: 0,female,black,hispanic,othrace,dep,recall,durable,nondurable,lusd,husd,muld,log_dur,quarter,age,treatment
0,0,0,0,0,2,0,0,0,0,1,0,2.890372,5,1,0.0
3,0,0,0,0,0,0,0,0,1,0,0,0.000000,5,1,0.0
4,0,0,0,0,0,0,0,0,1,0,0,3.295837,4,1,0.0
5,0,0,0,0,0,0,0,0,0,0,1,1.945910,2,1,1.0
11,0,0,0,0,0,0,0,0,0,0,1,2.197225,3,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13904,0,0,1,0,0,0,0,0,0,0,1,2.302585,3,0,1.0
13905,0,0,1,0,2,0,0,0,0,0,1,1.386294,2,1,1.0
13906,0,0,0,0,2,0,0,0,1,0,0,2.197225,2,0,0.0
13910,0,0,0,0,0,0,0,0,0,0,0,1.386294,5,2,1.0


In [9]:
# Sample sizes of the two arms (0.0 = group 0, 1.0 = merged groups 4 and 6).
df['treatment'].value_counts()

treatment
0.0    3354
1.0    3030
Name: count, dtype: int64

In [10]:
def preprocess(source="https://raw.githubusercontent.com/VC2015/DMLonGitHub/master/penn_jae.dat"):
    """Load the Penn data and build the analysis sample.

    Steps: add ``log_dur = log(inuidur1)``; drop ``abdt`` and the raw duration
    columns; collapse the q1..q6 and the two age dummies into ordinal
    ``quarter`` and ``age`` columns; keep only treatment groups 0, 4 and 6;
    encode groups 4 and 6 as ``treatment = 1.0`` (group 0 as ``0.0``).

    Parameters
    ----------
    source : str, path or file-like, optional
        Anything ``pandas.read_csv`` accepts (whitespace-separated columns).
        Defaults to the online copy of ``penn_jae.dat``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame, with ``tg`` replaced by ``treatment`` and the
        original row index kept.
    """
    quarter_cols = ['q1', 'q2', 'q3', 'q4', 'q5', 'q6']
    age_cols = ['agelt35', 'agegt54']

    def get_quarter(row):
        # 1-based position of the first dummy equal to 1 (None if there is none).
        for col, val in enumerate(row, start=1):
            if val == 1:
                return col

    def get_age(row):
        # (agelt35, agegt54): [1, 0] -> 0, [0, 0] -> 1, [0, 1] -> 2, anything else -> None.
        if (row == [1, 0]).all():
            return 0
        elif (row == [0, 0]).all():
            return 1
        elif (row == [0, 1]).all():
            return 2

    # Raw string: the separator is a regex and '\s' is not a valid string escape.
    df = pd.read_csv(source, sep=r'\s+')
    df['log_dur'] = np.log(df['inuidur1'])
    df = df.drop(['abdt', 'inuidur1', 'inuidur2'], axis=1)
    df['quarter'] = df[quarter_cols].apply(get_quarter, axis=1)
    df['age'] = df[age_cols].apply(get_age, axis=1)
    df = df.drop(quarter_cols + age_cols, axis=1)
    # .copy() so that adding 'treatment' below does not write into a view of the full frame.
    df = df.loc[df['tg'].isin([0, 4, 6]), :].copy()
    df['treatment'] = df['tg'].isin([4, 6]).astype('float')
    df = df.drop('tg', axis=1)

    return df

In [11]:
# Rebuild the analysis sample from scratch with preprocess(); this replaces the current df.
df = preprocess()
df

Unnamed: 0,female,black,hispanic,othrace,dep,recall,durable,nondurable,lusd,husd,muld,log_dur,quarter,age,treatment
0,0,0,0,0,2,0,0,0,0,1,0,2.890372,5,1,0.0
3,0,0,0,0,0,0,0,0,1,0,0,0.000000,5,1,0.0
4,0,0,0,0,0,0,0,0,1,0,0,3.295837,4,1,0.0
5,0,0,0,0,0,0,0,0,0,0,1,1.945910,2,1,1.0
11,0,0,0,0,0,0,0,0,0,0,1,2.197225,3,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13904,0,0,1,0,0,0,0,0,0,0,1,2.302585,3,0,1.0
13905,0,0,1,0,2,0,0,0,0,0,1,1.386294,2,1,1.0
13906,0,0,0,0,2,0,0,0,1,0,0,2.197225,2,0,0.0
13910,0,0,0,0,0,0,0,0,0,0,0,1.386294,5,2,1.0


In [12]:
# Difference-in-means estimator (consistent)
df.loc[df['treatment'] == 1, 'log_dur'].mean() - df.loc[df['treatment'] == 0, 'log_dur'].mean()

-0.0795421074545184

# Inducing selection bias

In [13]:
# Column means within each treatment arm.
df.groupby('treatment').mean()

Unnamed: 0_level_0,female,black,hispanic,othrace,dep,recall,durable,nondurable,lusd,husd,muld,log_dur,quarter,age
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0.0,0.405188,0.121348,0.03548,0.005665,0.438283,0.108229,0.150865,0.106738,0.253131,0.221825,0.449911,2.05683,3.71288,0.573942
1.0,0.39736,0.114851,0.032013,0.008251,0.462046,0.128713,0.140924,0.112211,0.269307,0.221782,0.430693,1.977288,3.376568,0.552805


In [14]:
# Mean log_dur by the 'female' indicator.
df.groupby('female')['log_dur'].mean()

female
0    1.982210
1    2.074039
Name: log_dur, dtype: float64

$\Longrightarrow$ If females select themselves into treatment, we would obtain a positive selection bias.

In [15]:
# 'white' is the complement of the three race/ethnicity indicators; the column stays in df.
df = df.assign(white=1 - df[['black', 'hispanic', 'othrace']].sum(axis=1))
df['log_dur'].groupby(df['white']).mean()

white
0    1.760978
1    2.067870
Name: log_dur, dtype: float64

$\Longrightarrow$ If white people select themselves into treatment, we would obtain a positive selection bias.

In [16]:
# Mean log_dur by the 'dep' column.
df.groupby('dep')['log_dur'].mean()

dep
0    1.993755
1    2.068950
2    2.093486
Name: log_dur, dtype: float64

$\Longrightarrow$ If people with more dependents select themselves into treatment, we would obtain a positive selection bias.

In [17]:
# Mean log_dur by the 'recall' indicator.
df.groupby('recall')['log_dur'].mean()

recall
0    1.970053
1    2.385681
Name: log_dur, dtype: float64

$\Longrightarrow$ If people expecting to be recalled select themselves into treatment, we would obtain a positive selection bias. 

In [18]:
# Mean log_dur by the ordinal age code (0 = agelt35, 1 = neither dummy set, 2 = agegt54).
df.groupby('age')['log_dur'].mean()

age
0    1.902342
1    2.107506
2    2.322500
Name: log_dur, dtype: float64

$\Longrightarrow$ If older people select themselves into treatment, we would obtain a positive selection bias.

In [19]:
# Mean log_dur by the 'lusd' indicator.
df.groupby('lusd')['log_dur'].mean()

lusd
0    2.053970
1    1.920183
Name: log_dur, dtype: float64

$\Longrightarrow$ If people living in areas with a low unemployment rate and short unemployment durations select themselves into treatment, we would obtain a negative selection bias.

In [20]:
# Per row, count how many of the characteristics with a lower mean log_dur hold:
# female == 0, any of black/hispanic/othrace, recall == 0, age == 0, lusd == 1.
# (dep == 0 is currently not part of the count.)
num_criteria = (
    df['female'].eq(0).astype(int)
    + df[['black', 'hispanic', 'othrace']].sum(axis=1)
    + df['recall'].eq(0).astype(int)
    + df['age'].eq(0).astype(int)
    + df['lusd'].eq(1).astype(int)
)
num_criteria.value_counts().sort_index()

0     121
1     912
2    2222
3    2349
4     711
5      69
Name: count, dtype: int64

In [21]:
# Correlation between the criteria count and the outcome.
num_criteria.corr(df['log_dur'])

-0.16727210892200833

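The number of satisfied criteria is negatively correlated with `log_dur`. Hence, if we discard treated samples with a probability that increases with the number of criteria they satisfy, the remaining treated samples tend to have longer durations, and we expect to induce a positive selection bias in the estimated causal effect.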

In [22]:
# A treated row is discarded when its uniform draw is at most num_criteria / 5;
# rows with treatment != 1 are never discarded.
np.random.seed(123)
discarded = df['treatment'].eq(1) & num_criteria.div(5).ge(np.random.uniform(size=len(df)))

In [23]:
# Sample that remains after removing the discarded treated rows.
df_bias = df.loc[~discarded]
df_bias

Unnamed: 0,female,black,hispanic,othrace,dep,recall,durable,nondurable,lusd,husd,muld,log_dur,quarter,age,treatment,white
0,0,0,0,0,2,0,0,0,0,1,0,2.890372,5,1,0.0,1
3,0,0,0,0,0,0,0,0,1,0,0,0.000000,5,1,0.0,1
4,0,0,0,0,0,0,0,0,1,0,0,3.295837,4,1,0.0,1
5,0,0,0,0,0,0,0,0,0,0,1,1.945910,2,1,1.0,1
11,0,0,0,0,0,0,0,0,0,0,1,2.197225,3,0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13898,0,1,0,0,0,0,0,0,0,0,1,3.135494,5,1,0.0,0
13900,1,0,0,1,2,0,0,1,0,0,1,1.791759,5,1,0.0,0
13906,0,0,0,0,2,0,0,0,1,0,0,2.197225,2,0,0.0,1
13910,0,0,0,0,0,0,0,0,0,0,0,1.386294,5,2,1.0,1


In [24]:
# Arm sizes after discarding: only rows with treatment == 1 can have been removed.
df_bias.treatment.value_counts()

treatment
0.0    3354
1.0    1595
Name: count, dtype: int64

In [25]:
# Difference in means on the selected sample.
df_bias.loc[df_bias['treatment'] == 1, 'log_dur'].mean() - df_bias.loc[df_bias['treatment'] == 0, 'log_dur'].mean()

-0.016978358029621887

In [None]:
def induce_selection_bias(df=None, seed=123):
    """Create a non-randomized sample by discarding treated rows non-uniformly.

    For every row the number of satisfied criteria is counted (female == 0,
    any of black/hispanic/othrace, recall == 0, age == 0, lusd == 1). A treated
    row is discarded when a uniform draw is at most ``num_criteria / 5``; rows
    with ``treatment != 1`` are always kept.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Preprocessed sample containing the columns above plus ``treatment``
        and ``log_dur``. When omitted, ``preprocess()`` is called. The frame
        passed in is not modified.
    seed : int, optional
        Seed for NumPy's global random generator (default 123).

    Returns
    -------
    tuple
        ``(y, d, x)``: the ``log_dur`` Series, the ``treatment`` Series and a
        DataFrame with all remaining columns, restricted to the kept rows.
    """
    if df is None:
        df = preprocess()

    # Explicit int casts so the count never degenerates into a boolean OR.
    num_criteria = (
        (df['female'] == 0).astype(int)
        + df[['black', 'hispanic', 'othrace']].sum(axis=1)
        + (df['recall'] == 0).astype(int)
        + (df['age'] == 0).astype(int)
        + (df['lusd'] == 1).astype(int)
    )

    # The global generator is seeded (rather than a local one) so the draws match
    # the exploratory cell that uses np.random.seed(123) with the same expression.
    np.random.seed(seed)
    # 5 is the maximum attainable count, so the discard probability lies in [0, 1].
    discarded = (df['treatment'] == 1) & (np.random.uniform(size=len(df)) <= num_criteria / 5)
    df = df[~discarded]

    return df['log_dur'], df['treatment'], df.drop(['treatment', 'log_dur'], axis=1)

In [None]:
# Unpack the three values returned by induce_selection_bias(): log_dur, treatment, remaining columns.
y_data, d_data, x_data = induce_selection_bias()

The difference-in-means estimator is now misleading, because the data are effectively observational (treatment is no longer randomized).

In [None]:
# Difference-in-means estimator (inconsistent)
y_data[d_data == 1].mean() - y_data[d_data == 0].mean()

# Prepare data for DML estimation

In [7]:
# Convert the covariates to a NumPy array. np.asarray leaves an ndarray unchanged,
# so the cell can be re-run; `.values` fails on a second run because x_data is
# already an array by then.
x_data = np.asarray(x_data)

# Standardized covariates.
x_data_stand = StandardScaler().fit_transform(x_data)

# Degree-2 polynomial expansion without the constant column, standardized afterwards.
poly_features = PolynomialFeatures(degree=2, include_bias=False)
x_data_quad = poly_features.fit_transform(x_data)
scaler = StandardScaler()
x_data_quad_stand = scaler.fit_transform(x_data_quad)

In [8]:
# dml_algorithm is a local module; reloading it picks up edits made to the file
# without restarting the kernel.
import importlib
import dml_algorithm  
importlib.reload(dml_algorithm)

<module 'dml_algorithm' from 'C:\\Users\\henry\\MA_CausalML\\Real dataset\\dml_algorithm.py'>

## Elastic net

In [9]:
# Load the stored hyperparameters: one entry per learner, each with settings for 'g0', 'g1' and 'm'.
# NOTE(review): pickle.load can execute arbitrary code - only load an opt_params.pkl you trust.
with open('opt_params.pkl', 'rb') as pickle_file:
    opt_params = pickle.load(pickle_file)

opt_params

{'eln': {'g0': {'alpha': 6.382750873587745, 'l1_ratio': 0.9},
  'g1': {'alpha': 181.25818744352998, 'l1_ratio': 0.99},
  'm': {'C': 4.641588833612772, 'l1_ratio': 0.9}},
 'mlp': {'g0': {'alpha': 0.3,
   'batch_size': 2,
   'hidden_layer_sizes': (16, 8),
   'max_iter': 150},
  'g1': {'alpha': 1,
   'batch_size': 2,
   'hidden_layer_sizes': (24, 16),
   'max_iter': 150},
  'm': {'alpha': 0.03,
   'batch_size': 4,
   'hidden_layer_sizes': (16, 8),
   'max_iter': 50}},
 'svm': {'g0': {'C': 300, 'epsilon': 0.01, 'gamma': 0.01, 'kernel': 'linear'},
  'g1': {'C': 300, 'epsilon': 0.01, 'gamma': 1, 'kernel': 'rbf'},
  'm': {'C': 300, 'gamma': 0.01, 'kernel': 'rbf'}},
 'xgb': {'g0': {'colsample_bytree': 0.625,
   'learning_rate': 0.3,
   'max_depth': 1,
   'n_estimators': 25,
   'reg_alpha': 0.01,
   'reg_lambda': 100,
   'subsample': 0.6},
  'g1': {'colsample_bytree': 0.625,
   'learning_rate': 0.02,
   'max_depth': 1,
   'n_estimators': 100,
   'reg_alpha': 1,
   'reg_lambda': 1,
   'subsample

In [10]:
def get_eln_models(eln_params_dict):
    """Build the elastic-net models from a parameter dictionary.

    `eln_params_dict` must have the keys 'g0', 'g1' and 'm', each mapping to
    keyword parameters passed to `set_params`.

    Returns `(model_g, model_m)`: a list with two ElasticNet regressors
    (configured with 'g0' and 'g1', in that order) and an elastic-net
    penalized LogisticRegression configured with 'm'.
    """
    model_g = [
        ElasticNet(max_iter=10000).set_params(**eln_params_dict[f'g{arm}'])
        for arm in (0, 1)
    ]
    model_m = LogisticRegression(
        penalty='elasticnet', solver='saga', max_iter=50000, random_state=42
    ).set_params(**eln_params_dict['m'])
    return model_g, model_m

In [12]:
# DML with elastic-net models on the standardized quadratic feature expansion.
# NOTE(review): m_bounds presumably bounds the estimated propensity scores - confirm in dml_algorithm.
# NOTE(review): the stored output below is orders of magnitude larger than the log_dur
# differences shown earlier in this notebook - re-run to confirm it is current.
eln_model_g, eln_model_m = get_eln_models(opt_params['eln'])
dml_algorithm.dml_ate_att(y_data, d_data, x_data_quad_stand, eln_model_g, eln_model_m, m_bounds=(0.01, 0.99))

([-83.42479754212309,
  42682.65474769145,
  array([-3459.52500797,  3292.67541288])],
 [2095.053572467804,
  29657.299146299207,
  array([-250.77084919, 4440.87799413])])

## Support vector machine

In [13]:
def get_svm_models(svm_params_dict):
    """Build the support-vector models from a parameter dictionary.

    `svm_params_dict` must have the keys 'g0', 'g1' and 'm', each mapping to
    keyword parameters passed to `set_params`.

    Returns `(model_g, model_m)`: a list with two SVR regressors (configured
    with 'g0' and 'g1', in that order) and an SVC with probability estimates
    enabled, configured with 'm'.
    """
    def configured(estimator, key):
        # Apply the stored parameters for `key` and hand the estimator back.
        estimator.set_params(**svm_params_dict[key])
        return estimator

    model_g = [configured(SVR(), 'g0'), configured(SVR(), 'g1')]
    model_m = configured(SVC(probability=True, random_state=42), 'm')
    return model_g, model_m

In [15]:
# DML with support-vector models on the standardized covariates.
# NOTE(review): m_bounds presumably bounds the estimated propensity scores - confirm in dml_algorithm.
svm_model_g, svm_model_m = get_svm_models(opt_params['svm'])
dml_algorithm.dml_ate_att(y_data, d_data, x_data_stand, svm_model_g, svm_model_m, m_bounds=(0.01, 0.99))

([1104.0040593055337,
  26635.367869832735,
  array([-1002.79252122,  3210.80063983])],
 [1317.8333917941186,
  21744.050327307876,
  array([-402.07117645, 3037.73796004])])

## Random forest

## XGBoost

In [16]:
def get_xgb_models(xgb_params_dict):
    """Build the XGBoost models from a parameter dictionary.

    `xgb_params_dict` must have the keys 'g0', 'g1' and 'm', each mapping to
    keyword parameters passed to `set_params`.

    Returns `(model_g, model_m)`: a list with two XGBRegressor models
    (configured with 'g0' and 'g1', in that order) and an XGBClassifier
    configured with 'm'. All models are created with seed=0.
    """
    regressors = []
    for key in ('g0', 'g1'):
        regressor = xgb.XGBRegressor(objective='reg:squarederror', seed=0)
        regressor.set_params(**xgb_params_dict[key])
        regressors.append(regressor)

    classifier = xgb.XGBClassifier(objective='binary:logistic', seed=0)
    classifier.set_params(**xgb_params_dict['m'])
    return regressors, classifier

In [17]:
# DML with XGBoost models on the covariates as prepared in x_data (not standardized).
# NOTE(review): m_bounds presumably bounds the estimated propensity scores - confirm in dml_algorithm.
xgb_model_g, xgb_model_m = get_xgb_models(opt_params['xgb'])
dml_algorithm.dml_ate_att(y_data, d_data, x_data, xgb_model_g, xgb_model_m, m_bounds=(0.01, 0.99))

([323.7256148991961,
  22119.953497675593,
  array([-1425.91203298,  2073.36326278])],
 [1449.4539078180278,
  22230.95313207064,
  array([-308.96355686, 3207.87137249])])

## Multilayer perceptron

In [18]:
def get_mlp_models(mlp_params_dict):
    """Build the multilayer-perceptron models from a parameter dictionary.

    `mlp_params_dict` must have the keys 'g0', 'g1' and 'm', each mapping to
    keyword parameters passed to `set_params`.

    Returns `(model_g, model_m)`: a list with two MLPRegressor models
    (configured with 'g0' and 'g1', in that order) and an MLPClassifier
    configured with 'm'. All models use random_state=42.
    """
    model_g = [MLPRegressor(random_state=42) for _ in range(2)]
    for arm, regressor in enumerate(model_g):
        regressor.set_params(**mlp_params_dict[f'g{arm}'])

    model_m = MLPClassifier(random_state=42)
    model_m.set_params(**mlp_params_dict['m'])
    return model_g, model_m

In [19]:
# DML with multilayer-perceptron models on the standardized covariates.
# NOTE(review): m_bounds presumably bounds the estimated propensity scores - confirm in dml_algorithm.
mlp_model_g, mlp_model_m = get_mlp_models(opt_params['mlp'])
dml_algorithm.dml_ate_att(y_data, d_data, x_data_stand, mlp_model_g, mlp_model_m, m_bounds=(0.01, 0.99))

([349.111310856916,
  24249.436605917683,
  array([-1568.96357253,  2267.18619425])],
 [1225.7843988711334,
  20144.181087585115,
  array([-367.5741776 , 2819.14297534])])

## Hybrid