In [61]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor
from lightgbm import LGBMRegressor,LGBMClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV

In [11]:
data_x = pd.read_csv("./data/practice/acic_practice_0001.csv")
data_y = pd.read_csv("./data/practice_year/acic_practice_year_0001.csv")

# propensity score

In [56]:
data_x

Unnamed: 0,id.practice,X1,X2,X3,X4,X5,X6,X7,X8,X9
0,1,0,A,1,A,1,20.774076,14.153255,0.161126,43.431874
1,2,0,A,0,C,0,33.565928,3.284657,0.556784,12.721988
2,3,0,C,1,A,1,57.283021,11.178051,0.257244,-7.352617
3,4,1,C,1,A,0,41.900776,1.542463,0.129618,20.032199
4,5,1,B,1,A,0,41.486809,7.504068,0.413924,35.449536
...,...,...,...,...,...,...,...,...,...,...
495,496,0,C,1,A,1,29.898841,17.945129,0.381701,40.224974
496,497,1,C,0,A,1,20.378333,20.770890,0.422255,62.857150
497,498,0,A,0,A,0,30.162733,9.174455,0.205427,-4.931348
498,499,1,C,0,A,1,32.795499,19.769398,0.295538,84.228736


In [12]:
data_z = pd.merge(data_x, data_y[['id.practice','Z']].drop_duplicates(), on='id.practice')
df = data_z
ct = ['X2','X4']
for c in ct:
    df[c] = df[c].astype('category')

debias_m = LGBMClassifier(max_depth=3)

X = ['X1','X2','X3','X4','X5','X6','X7','X8','X9']
T = ['Z']
#ps_res =  df[T] - cross_val_predict(debias_m, df[X], df[T].values.ravel(), cv=5, method='predict_proba')[:,1].reshape(-1,1) + df[T].mean()

In [13]:
debias_m = LGBMClassifier(objective = 'binary',
                         is_unbalance = True,
                         #metric = 'log_loss',
                         metric = 'binary_logloss,auc',
                         max_depth = 4,
                         num_leaves = 20,
                         learning_rate = 0.1,
                         #feature_fraction = 0.7,
                         min_child_samples=21,
                         min_child_weight=0.001,
                         #bagging = 1,
                         #subsample_freq = 2,
                         reg_alpha = 0.002,
                         reg_lambda = 10,
                         cat_smooth = 0,
                         n_estimators = 200,   
                        )

ps = cross_val_predict(debias_m, df[X], df[T].values.ravel(), cv=10, method='predict_proba')[:,1]
#ps_res =  df[T] - cross_val_predict(debias_m, df[X], df[T].values.ravel(), cv=5, method='predict_proba')[:,1].reshape(-1,1) + df[T].mean()
data_x = data_x.assign(ps = ps)

In [14]:
data_z = data_z.assign(ps = ps)
data_z

Unnamed: 0,id.practice,X1,X2,X3,X4,X5,X6,X7,X8,X9,Z,ps
0,1,0,A,1,A,1,20.774076,14.153255,0.161126,43.431874,1,0.653730
1,2,0,A,0,C,0,33.565928,3.284657,0.556784,12.721988,1,0.878407
2,3,0,C,1,A,1,57.283021,11.178051,0.257244,-7.352617,0,0.015035
3,4,1,C,1,A,0,41.900776,1.542463,0.129618,20.032199,0,0.057317
4,5,1,B,1,A,0,41.486809,7.504068,0.413924,35.449536,0,0.024983
...,...,...,...,...,...,...,...,...,...,...,...,...
495,496,0,C,1,A,1,29.898841,17.945129,0.381701,40.224974,1,0.843063
496,497,1,C,0,A,1,20.378333,20.770890,0.422255,62.857150,0,0.907315
497,498,0,A,0,A,0,30.162733,9.174455,0.205427,-4.931348,0,0.135969
498,499,1,C,0,A,1,32.795499,19.769398,0.295538,84.228736,1,0.954601


# average outcome for year 3,4

In [27]:
data_o = pd.merge(data_x, data_y, on='id.practice')
df = data_o
ct = ['X2','X4']
for c in ct:
    df[c] = df[c].astype('category')
    
#XV = df.columns.drop(['id.practice','year','post','Z','n.patients'])
V = ['V1_avg', 'V2_avg', 'V3_avg', 'V4_avg', 'V5_A_avg', 'V5_B_avg', 'V5_C_avg']
Y = ['Y']

In [28]:
means = []
for i in range(1,5):
    means.append(df[(df['year']==i) & (df['Z']==0)]['Y'].mean())
    print(means[i-1])

avgs = pd.DataFrame({'year': [1, 2, 3, 4], 't_avg': means})
df = pd.merge(df, avgs, on='year')

df_34 = df[df['year'] >= 3]

870.6451336763548
1010.0760840822331
1134.3928342316492
1250.6285559851644


In [33]:
for i in [3,4]:
    print(df[(df['year']==i) & (df['Z']==1)]['Y'].mean())

1214.9423399695077
1325.0602276295026


In [29]:
df_34.columns

Index(['id.practice', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9',
       'ps', 'year', 'Y', 'Z', 'post', 'n.patients', 'V1_avg', 'V2_avg',
       'V3_avg', 'V4_avg', 'V5_A_avg', 'V5_B_avg', 'V5_C_avg', 't_avg'],
      dtype='object')

In [20]:
denoise_m = LGBMRegressor(is_unbalance = True,
                         #metric = 'log_loss',
                         metric = 'binary_logloss,auc',
                         max_depth = 4,
                         num_leaves = 20,
                         learning_rate = 0.1,
                         #feature_fraction = 0.7,
                         min_child_samples=21,
                         min_child_weight=0.001,
                         #bagging = 1,
                         #subsample_freq = 2,
                         reg_alpha = 0.002,
                         reg_lambda = 10,
                         cat_smooth = 0,
                         n_estimators = 200,   
                        )

# \hat{\mu_0(v)+e(x)r(x)+f(t)}
#pred3 = cross_val_predict(denoise_m, df_3[V], df_3[Y].values.ravel(), cv=10, method='predict_proba')[:,1]
#pred4 = cross_val_predict(denoise_m, df_4[V], df_4[Y].values.ravel(), cv=10, method='predict_proba')[:,1]

df_34 = df[df['year'] >= 3]
pred_M = cross_val_predict(denoise_m, df_34[V], df_34['Y']-df_34['t_avg'], fit_params={'sample_weight':df_34['n.patients']}, cv=10)

# WLS for ATTs

In [30]:
df_34 = df_34.assign(y_res = df_34['Y'] - pred_M, z_res = df_34['Z'] - df_34['ps'])
df_34.columns

Index(['id.practice', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9',
       'ps', 'year', 'Y', 'Z', 'post', 'n.patients', 'V1_avg', 'V2_avg',
       'V3_avg', 'V4_avg', 'V5_A_avg', 'V5_B_avg', 'V5_C_avg', 't_avg',
       'y_res', 'z_res'],
      dtype='object')

In [74]:
df_3 = df_34[df_34['year'] == 3]
df_4 = df_34[df_34['year'] == 4]

In [47]:
final_model = smf.wls(formula='y_res ~ z_res', data=df_34, weights=df_34['n.patients']).fit()
final_model.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1193.7287,4.432,269.361,0.000,1185.032,1202.425
z_res,19.5846,10.333,1.895,0.058,-0.692,39.861


In [50]:
pow(4,0.5)

2.0

In [59]:
25.09+1.959964*13.152

50.867446528

In [81]:
final_model = smf.wls(formula='y_res ~ z_res', data=df_3, weights=df_3['n.patients']).fit()
final_model.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1121.3578,5.092,220.219,0.000,1111.353,1131.362
z_res,14.5905,11.855,1.231,0.219,-8.702,37.883


In [80]:
df_3.head()

Unnamed: 0,id.practice,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,V1_avg,V2_avg,V3_avg,V4_avg,V5_A_avg,V5_B_avg,V5_C_avg,t_avg,y_res,z_res
1000,1,0,A,1,A,1,20.774076,14.153255,0.161126,43.431874,...,10.896147,2.867769,0.528926,0.207485,0.727273,0.231405,0.041322,1134.392834,1067.737289,0.34627
1001,2,0,A,0,C,0,33.565928,3.284657,0.556784,12.721988,...,11.973041,2.858521,0.488746,-0.112637,0.723473,0.160772,0.115756,1134.392834,1216.283315,0.121593
1002,3,0,C,1,A,1,57.283021,11.178051,0.257244,-7.352617,...,11.099161,3.067308,0.570266,0.240018,0.586538,0.315828,0.097633,1134.392834,1058.641123,-0.015035
1003,4,1,C,1,A,0,41.900776,1.542463,0.129618,20.032199,...,11.562932,2.902067,0.531012,0.048149,0.764962,0.145811,0.089227,1134.392834,1099.941106,-0.057317
1004,5,1,B,1,A,0,41.486809,7.504068,0.413924,35.449536,...,11.041405,2.824859,0.610169,0.24616,0.740113,0.163842,0.096045,1134.392834,1200.168722,-0.024983


In [79]:
df_4.head()

Unnamed: 0,id.practice,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,V1_avg,V2_avg,V3_avg,V4_avg,V5_A_avg,V5_B_avg,V5_C_avg,t_avg,y_res,z_res
1500,1,0,A,1,A,1,20.774076,14.153255,0.161126,43.431874,...,10.549631,2.717557,0.541985,0.183141,0.70229,0.259542,0.038168,1250.628556,1440.333695,0.34627
1501,2,0,A,0,C,0,33.565928,3.284657,0.556784,12.721988,...,12.020978,2.903974,0.486755,-0.169087,0.738411,0.152318,0.109272,1250.628556,1055.4752,0.121593
1502,3,0,C,1,A,1,57.283021,11.178051,0.257244,-7.352617,...,11.072433,3.067197,0.559971,0.192127,0.596821,0.307081,0.096098,1250.628556,1328.719656,-0.015035
1503,4,1,C,1,A,0,41.900776,1.542463,0.129618,20.032199,...,11.617541,2.865232,0.525481,0.030836,0.755379,0.150623,0.093998,1250.628556,1312.703993,-0.057317
1504,5,1,B,1,A,0,41.486809,7.504068,0.413924,35.449536,...,10.961217,2.871951,0.628049,0.267186,0.756098,0.158537,0.085366,1250.628556,1800.378796,-0.024983


In [87]:
model = LinearRegression()
r3 = model.fit(X=df_3[['z_res']], y=df_3[['y_res']], sample_weight=df_3['n.patients'])
#r4 = model.fit(X=df_4[['z_res']], y=df_4[['y_res']], sample_weight=df_4['n.patients'])

In [90]:
r3

{'copy_X': True,
 'fit_intercept': True,
 'n_jobs': None,
 'normalize': False,
 'positive': False}

# Non Parametric Double/Debiased ML

In [77]:
df_34 = df_34.assign(y_res = df_34['Y'] - pred_M, z_res = df_34['Z'] - df_34['ps'])

model_final = LGBMRegressor(max_depth=6)
 
# create the weights
w = df_34['z_res'] ** 2 
 
# create the transformed target
y_star = (df_34['y_res'] / df_34['z_res'])
 
# use a weighted regression ML model to predict the target with the weights.
model_final.fit(X=df_34[V], y=y_star, sample_weight=w)

LGBMRegressor(max_depth=6)

# Iterative Estimation

In [None]:
# initial estimator f; cross-fitting w, M; estimating r3,r4 by regression

# repeat:
#   subtracting w*r; estimating f; cross-fitting m
#   evaluating M = m + f;
#   estimating r3,r4
# until convergence