# CausalMLを使って効果を測定する

In [1]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
from IPython.display import display, display_markdown, Markdown
from textwrap import dedent

from causalml.inference.meta import BaseXLearner, BaseTLearner, BaseRLearner, BaseSLearner
from causalml.inference.meta.drlearner import BaseDRLearner

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import HuberRegressor, LinearRegression, LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')


RuntimeError: module compiled against API version 0xe but this version of numpy is 0xd

## Lalando-LoadData-RLang.ipynbで作成したCSVファイルを読み込む

In [2]:
# Read the three CSV files named in the section heading above.
data0 = pd.read_csv('nsw_dw_R.csv')
data1 = pd.read_csv('cps1_nsw_data_R.csv')
data3 = pd.read_csv('cps3_nsw_data_R.csv')

# Last expression of the cell: show how many rows carry each `treat` value.
data3['treat'].value_counts()

0    429
1    185
Name: treat, dtype: int64

## ATE/ATT/ATUを計算する関数を定義

In [3]:
def calc_effects(learner, X, z, y, test_size=.25, ite_percentile=(10, 25, 50, 75, 90)):
    """Fit ``learner`` on a train split and tabulate effect estimates for both splits.

    The data are split with ``train_test_split`` using a fixed
    ``random_state=0``. ``learner.estimate_ate`` is called on the training
    split only; it is the only call made before ``learner.predict``, so it is
    relied on to fit the learner. ``learner.predict`` is then evaluated on
    both splits and its output is treated as per-row treatment effects (ITE).

    Parameters
    ----------
    learner : object
        Meta-learner exposing ``estimate_ate`` and ``predict``.
        ``BaseSLearner`` and ``BaseRLearner`` instances are special-cased
        (see the inline comments).
    X : pandas.DataFrame
        Covariates. Must contain an ``'re75'`` column, which is used to form
        the ``re75 == 0`` and ``re75 != 0`` subgroups.
    z : array-like
        Treatment column; rows equal to 1 are counted as treated and rows
        equal to 0 as control.
    y : array-like
        Outcome column.
    test_size : float, optional
        Forwarded to ``train_test_split``.
    ite_percentile : iterable of numbers, optional
        Percentiles of the predicted ITE to report. An immutable tuple is
        used as the default so the default object can never be altered
        between calls; lists passed by callers are still accepted.

    Returns
    -------
    pandas.DataFrame
        Columns ``train`` and ``test``. Rows, in order:
        ``ATE(estimate_ate)``, ``ATE(lb)``, ``ATE(ub)`` (train only, NaN in
        ``test``), ``ATE``, ``ATT``, ``ATU``, ``CATE(re75=0)``,
        ``CATE(re75!=0)`` and one ``ITE(q%)`` row per requested percentile.
    """
    X_train, X_test, z_train, z_test, y_train, y_test = \
        train_test_split(X, z, y, test_size=test_size, random_state=0)

    # BaseSLearner is asked for the interval explicitly; the other learners
    # are unpacked into three values without that flag.
    if isinstance(learner, BaseSLearner):
        ate, lb, ub = learner.estimate_ate(X=X_train, treatment=z_train, y=y_train, return_ci=True)
    else:
        ate, lb, ub = learner.estimate_ate(X=X_train, treatment=z_train, y=y_train)

    # BaseRLearner.predict is called with the covariates only; every other
    # learner additionally receives treatment and outcome.
    if isinstance(learner, BaseRLearner):
        ite_train = learner.predict(X=X_train)
        ite_test = learner.predict(X=X_test)
    else:
        ite_train = learner.predict(X=X_train, treatment=z_train, y=y_train)
        ite_test = learner.predict(X=X_test, treatment=z_test, y=y_test)

    def _group_means(ite, X_part, z_part):
        """Return [ATE, ATT, ATU, CATE(re75=0), CATE(re75!=0)] for one split."""
        # np.where yields positional indices, which line up with the rows of
        # the prediction array regardless of the pandas index.
        treated = np.where(z_part == 1)[0]
        control = np.where(z_part == 0)[0]
        re75_zero = np.where(X_part['re75'] == 0)[0]
        re75_nonzero = np.where(X_part['re75'] != 0)[0]
        return [np.mean(ite),
                np.mean(ite[treated]),
                np.mean(ite[control]),
                np.mean(ite[re75_zero]),
                np.mean(ite[re75_nonzero])]

    # Materialise once so a one-shot iterable can be used both for the
    # percentile computation and for the row labels.
    percentiles = list(ite_percentile)

    index = ['ATE(estimate_ate)', 'ATE(lb)', 'ATE(ub)',
             'ATE', 'ATT', 'ATU', 'CATE(re75=0)', 'CATE(re75!=0)']
    index.extend(f'ITE({q}%)' for q in percentiles)

    train_vals = [ate[0], lb[0], ub[0]] + _group_means(ite_train, X_train, z_train)
    # estimate_ate is only run on the training split, hence the NaN rows.
    test_vals = [np.nan, np.nan, np.nan] + _group_means(ite_test, X_test, z_test)

    train_vals.extend(np.percentile(ite_train, q=percentiles))
    test_vals.extend(np.percentile(ite_test, q=percentiles))

    return pd.DataFrame(dict(train=train_vals, test=test_vals), index=index)

    

## DR(Double Robust)Learner

In [4]:
def run_dr(X, z, y, outcome_model):
    """Wrap ``outcome_model`` in a ``BaseDRLearner`` and return the ``calc_effects`` table."""
    return calc_effects(learner=BaseDRLearner(learner=outcome_model), X=X, z=z, y=y)


### 実行

In [5]:
data = data1.copy()

# Only the non-default hyper-parameters are passed. The original call was a
# pasted estimator repr that also spelled out loss='ls',
# min_impurity_split=None and presort='deprecated'; recent scikit-learn
# releases reject those keywords, and every dropped argument was at its
# library default, so the fitted model is unchanged.
outcome_model = GradientBoostingRegressor(
    learning_rate=0.05,
    max_depth=4,
    max_features='log2',
    min_impurity_decrease=0.3,
    min_samples_split=5,
    n_estimators=80,
    random_state=7328,
    subsample=0.75,
)

# Covariate columns, treatment column and outcome column.
x_names = ['re74', 're75', 'age', 'education', 'black',
           'hispanic', 'nodegree', 'married']
z_name = 'treat'
y_name = 're78'

X = data[x_names]
z = data[z_name]
y = data[y_name]

run_dr(X=X, y=y, z=z, outcome_model=outcome_model)


Unnamed: 0,train,test
ATE(estimate_ate),-6059.460486,
ATE(lb),-7199.569489,
ATE(ub),-4919.351483,
ATE,-6059.460486,-5902.143432
ATT,-2048.925123,-6107.736931
ATU,-6105.604715,-5899.7265
CATE(re75=0),4372.18546,4973.498368
CATE(re75!=0),-7426.875571,-7273.709617
ITE(10%),-14123.766847,-14070.258402
ITE(25%),-12483.012593,-12513.945891


## T-Learner

In [6]:
def run_t(X, z, y, regressor):
    """Wrap ``regressor`` in a ``BaseTLearner`` and return the ``calc_effects`` table."""
    return calc_effects(learner=BaseTLearner(learner=regressor), X=X, z=z, y=y)


### 実行

In [7]:
data = data1.copy()

# Only the non-default hyper-parameters are passed. The original call was a
# pasted estimator repr that also spelled out loss='ls',
# min_impurity_split=None and presort='deprecated'; recent scikit-learn
# releases reject those keywords, and every dropped argument was at its
# library default, so the fitted model is unchanged.
regressor = GradientBoostingRegressor(
    learning_rate=0.05,
    max_depth=4,
    max_features='log2',
    min_impurity_decrease=0.3,
    min_samples_split=5,
    n_estimators=80,
    random_state=7328,
    subsample=0.75,
)

# Covariate columns, treatment column and outcome column.
x_names = ['re74', 're75', 'age', 'education', 'black',
           'hispanic', 'nodegree', 'married']
z_name = 'treat'
y_name = 're78'

X = data[x_names]
z = data[z_name]
y = data[y_name]

run_t(X=X, y=y, z=z, regressor=regressor)



Unnamed: 0,train,test
ATE(estimate_ate),-6819.458268,
ATE(lb),-7670.282899,
ATE(ub),-5968.633638,
ATE,-6819.458268,-6839.214444
ATT,876.294279,1275.445679
ATU,-6908.003695,-6934.609398
CATE(re75=0),2074.837231,2227.934496
CATE(re75!=0),-7985.352308,-7982.70511
ITE(10%),-14916.139734,-14860.979969
ITE(25%),-12309.331487,-12209.913236


## X-Learner

In [8]:
def run_x(X, z, y, outcome_model):
    """Wrap ``outcome_model`` in a ``BaseXLearner`` and return the ``calc_effects`` table."""
    return calc_effects(learner=BaseXLearner(learner=outcome_model), X=X, z=z, y=y)


### 実行
- 自前で計算したATE（ITEの平均）と `estimate_ate` が返すATEが一致しない

In [9]:
data = data1.copy()

# Only the non-default hyper-parameters are passed. The original call was a
# pasted estimator repr that also spelled out loss='ls',
# min_impurity_split=None and presort='deprecated'; recent scikit-learn
# releases reject those keywords, and every dropped argument was at its
# library default, so the fitted model is unchanged.
outcome_model = GradientBoostingRegressor(
    learning_rate=0.05,
    max_depth=4,
    max_features='log2',
    min_impurity_decrease=0.3,
    min_samples_split=5,
    n_estimators=80,
    random_state=7328,
    subsample=0.75,
)

# Covariate columns, treatment column and outcome column.
x_names = ['re74', 're75', 'age', 'education', 'black',
           'hispanic', 'nodegree', 'married']
z_name = 'treat'
y_name = 're78'

X = data[x_names]
z = data[z_name]
y = data[y_name]

run_x(X=X, y=y, z=z, outcome_model=outcome_model)



Unnamed: 0,train,test
ATE(estimate_ate),-3023.69519,
ATE(lb),-3866.455855,
ATE(ub),-2180.934525,
ATE,-3050.035467,-3089.03985
ATT,1296.120484,1421.612266
ATU,-3100.041264,-3142.066526
CATE(re75=0),2820.612413,2816.251756
CATE(re75!=0),-3819.579652,-3833.77735
ITE(10%),-7273.796112,-7385.38202
ITE(25%),-5467.935486,-5467.352497


### 実行（線形回帰）
- EconMLとCausalMLの結果が一致することを確認するために用意

In [10]:
# X-learner run with a plain linear outcome model on a copy of data1.
data = data1.copy()
outcome_model = LinearRegression()

# Covariate columns, then the treatment and outcome column names.
x_names = ['re74', 're75', 'age', 'education', 'black',
           'hispanic', 'nodegree', 'married']
z_name, y_name = 'treat', 're78'

X, z, y = data[x_names], data[z_name], data[y_name]

run_x(X=X, z=z, y=y, outcome_model=outcome_model)


Unnamed: 0,train,test
ATE(estimate_ate),-5542.932862,
ATE(lb),-6882.902452,
ATE(ub),-4202.963273,
ATE,-5542.932862,-5648.363535
ATT,715.186453,-239.834748
ATU,-5614.937237,-5711.945539
CATE(re75=0),4649.428497,4458.659091
CATE(re75!=0),-6878.981722,-6922.996399
ITE(10%),-14122.603764,-14225.199348
ITE(25%),-11486.174951,-11385.929459


## S-Learner

In [11]:
def run_s(X, z, y, model):
    """Wrap ``model`` in a ``BaseSLearner`` and return the ``calc_effects`` table."""
    return calc_effects(learner=BaseSLearner(learner=model), X=X, z=z, y=y)


### 実行

In [12]:
data = data1.copy()

# Only the non-default hyper-parameters are passed. The original call was a
# pasted estimator repr that also spelled out loss='ls',
# min_impurity_split=None and presort='deprecated'; recent scikit-learn
# releases reject those keywords, and every dropped argument was at its
# library default, so the fitted model is unchanged.
outcome_model = GradientBoostingRegressor(
    learning_rate=0.05,
    max_depth=4,
    max_features='log2',
    min_impurity_decrease=0.3,
    min_samples_split=5,
    n_estimators=80,
    random_state=7328,
    subsample=0.75,
)

# Covariate columns, treatment column and outcome column.
x_names = ['re74', 're75', 'age', 'education', 'black',
           'hispanic', 'nodegree', 'married']
z_name = 'treat'
y_name = 're78'

X = data[x_names]
z = data[z_name]
y = data[y_name]

run_s(X=X, y=y, z=z, model=outcome_model)


Unnamed: 0,train,test
ATE(estimate_ate),-1874.246976,
ATE(lb),-3215.81118,
ATE(ub),-532.682773,
ATE,-1874.246976,-1874.186368
ATT,387.130776,345.936296
ATU,-1900.26583,-1900.285859
CATE(re75=0),970.744623,961.316666
CATE(re75!=0),-2247.178002,-2231.781823
ITE(10%),-4257.235829,-4257.235829
ITE(25%),-3695.725599,-3585.895404


### 実行（線形回帰）
- EconMLとCausalMLの結果が一致することを確認するために用意

In [13]:
# S-learner run with a plain linear outcome model on a copy of data1.
data = data1.copy()
outcome_model = LinearRegression()

# Covariate columns, then the treatment and outcome column names.
x_names = ['re74', 're75', 'age', 'education', 'black',
           'hispanic', 'nodegree', 'married']
z_name, y_name = 'treat', 're78'

X, z, y = data[x_names], data[z_name], data[y_name]

run_s(X=X, z=z, y=y, model=outcome_model)


Unnamed: 0,train,test
ATE(estimate_ate),720.289796,
ATE(lb),-740.542412,
ATE(ub),2181.122005,
ATE,720.289796,720.289796
ATT,720.289796,720.289796
ATU,720.289796,720.289796
CATE(re75=0),720.289796,720.289796
CATE(re75!=0),720.289796,720.289796
ITE(10%),720.289796,720.289796
ITE(25%),720.289796,720.289796


## R-Learner

In [14]:
def run_r(X, z, y, outcome_model, ps_model):
    """Build a ``BaseRLearner`` from the two models and return the ``calc_effects`` table.

    ``outcome_model`` is passed as the learner and ``ps_model`` as the
    propensity learner.
    """
    r_learner = BaseRLearner(propensity_learner=ps_model, learner=outcome_model)
    return calc_effects(X=X, z=z, y=y, learner=r_learner)


### 実行

In [15]:
data = data1.copy()

# Only the non-default hyper-parameters are passed. The original call was a
# pasted estimator repr that also spelled out loss='ls',
# min_impurity_split=None and presort='deprecated'; recent scikit-learn
# releases reject those keywords, and every dropped argument was at its
# library default, so the fitted model is unchanged.
outcome_model = GradientBoostingRegressor(
    learning_rate=0.05,
    max_depth=4,
    max_features='log2',
    min_impurity_decrease=0.3,
    min_samples_split=5,
    n_estimators=80,
    random_state=7328,
    subsample=0.75,
)

# Model handed to the R-learner as its propensity learner.
ps_model = LinearDiscriminantAnalysis()

# Covariate columns, treatment column and outcome column.
x_names = ['re74', 're75', 'age', 'education', 'black',
           'hispanic', 'nodegree', 'married']
z_name = 'treat'
y_name = 're78'

X = data[x_names]
z = data[z_name]
y = data[y_name]

run_r(X=X, y=y, z=z, outcome_model=outcome_model, ps_model=ps_model)


Unnamed: 0,train,test
ATE(estimate_ate),-14319.340108,
ATE(lb),-14339.481096,
ATE(ub),-14299.19912,
ATE,-14319.340108,-10108.310705
ATT,1066.186769,2797.327661
ATU,-14496.362178,-10260.027814
CATE(re75=0),4598.158417,8769.56611
CATE(re75!=0),-16799.109167,-12489.067442
ITE(10%),-70601.513012,-69560.988288
ITE(25%),-34708.683626,-34167.400147
