In [1]:
from rulefit_uplift_forest import CausalRuleEnsembling
from sklearn.model_selection import KFold
from causalml.propensity import ElasticNetPropensityModel

In [2]:
import pandas as pd
import numpy as np

In [3]:
import random
import matplotlib.pyplot as plt

In [4]:
import miceforest as mf

In [5]:
from causalml.match import create_table_one

# Train our model on the ATACH2 + ERICH

In [6]:
df = pd.read_csv('temp_data/imputed_all.csv')

In [7]:
df

Unnamed: 0.1,Unnamed: 0,TREATMENT,DEMO_AGE,OUTCOME_mRS90,CT1_ICHVOL,CT1_IVHVOL,GCS_TTL,NIHSS_TTL,BP_S0,BP_D0,...,LB_BUN,LB_CREATINIE,RACE,GENDER,ETHNICITY,CT1_ICHSIDE,ICHLOC,source,group,index
0,764,1,90.0,2.0,0.30576,0.00000,15.0,2.0,194.0,81.0,...,24.00,1.53,White,Male,Non-Hispanic,0.0,Thalamus,atach2,train,0
1,731,1,63.0,0.0,7.79725,0.00000,15.0,3.0,163.0,88.0,...,10.00,0.96,Asian,Male,Non-Hispanic,0.0,Basal Ganglia,atach2,train,1
2,210,1,51.0,0.0,5.78500,0.00000,15.0,9.0,180.0,80.0,...,17.42,1.01,Asian,Male,Non-Hispanic,1.0,Other,atach2,train,2
3,142,1,48.0,5.0,26.86619,0.00000,9.0,24.0,144.0,88.0,...,13.00,0.84,Black,Female,Non-Hispanic,0.0,Basal Ganglia,atach2,train,3
4,527,1,79.0,4.0,7.65244,10.50645,12.0,15.0,179.0,74.0,...,15.60,0.75,White,Female,Non-Hispanic,0.0,Thalamus,atach2,train,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3701,946,0,61.0,4.0,6.77108,0.00000,15.0,17.0,191.0,105.0,...,15.80,0.88,Asian,Female,Non-Hispanic,1.0,Thalamus,atach2,test,3701
3702,877,1,87.0,5.0,16.86191,5.68867,9.0,15.0,179.0,69.0,...,15.00,1.01,Asian,Male,Non-Hispanic,1.0,Basal Ganglia,atach2,test,3702
3703,573,0,44.0,3.0,24.69418,0.00000,12.0,8.0,210.0,120.0,...,21.86,1.11,Asian,Male,Non-Hispanic,1.0,Basal Ganglia,atach2,test,3703
3704,886,0,54.0,2.0,7.06034,0.00000,15.0,3.0,209.0,106.0,...,19.00,1.60,Black,Male,Non-Hispanic,1.0,Basal Ganglia,atach2,test,3704


In [8]:
def transform_var(df):
    df['age'] = (df['DEMO_AGE'] - np.mean(df['DEMO_AGE'])) / np.std(df['DEMO_AGE'])
    df['ich_vol'] = np.log(df['CT1_ICHVOL'] + 1)
    df['ivh_vol'] = np.log(df['CT1_IVHVOL'] + 1)
    df['gcs'] = (df['GCS_TTL'] - np.mean(df['GCS_TTL'])) / np.std(df['GCS_TTL'])
    df['nihss'] = np.log(df['NIHSS_TTL'] + 1)
    df['sbp'] = (df['BP_S0'] - np.mean(df['BP_S0'])) / np.std(df['BP_S0'])
    df['dbp'] = (df['BP_D0'] - np.mean(df['BP_D0'])) / np.std(df['BP_D0'])
    df['PP'] = df['BP_S0'] - df['BP_D0']
    df['MAP'] = df['BP_S0']/3 + df['BP_D0']*2/3
    df['pp'] = (df['PP'] -np.mean(df['PP'])) / np.std(df['PP'])
    df['map'] = (df['MAP'] - np.mean(df['MAP']))/np.std(df['MAP'])
    
    df['pc'] = (df['LB_PC'] - np.mean(df['LB_PC'])) / np.std(df['LB_PC'])
    df['glucose'] = (df['LB_GLUCOSE'] - np.mean(df['LB_GLUCOSE'])) / np.std(df['LB_GLUCOSE'])
    df['sodium'] = (df['LB_SODIUM'] - np.mean(df['LB_SODIUM'])) / np.std(df['LB_SODIUM'])
    df['potassium'] = (df['LB_POTASSIUM'] - np.mean(df['LB_POTASSIUM'])) / np.std(df['LB_POTASSIUM'])
    df['chloride'] = (df['LB_CHLORIDE'] - np.mean(df['LB_CHLORIDE'])) / np.std(df['LB_CHLORIDE'])
    df['cd'] = (df['LB_CD'] - np.mean(df['LB_CD'])) / np.std(df['LB_CD'])
    df['bun'] = (df['LB_BUN'] - np.mean(df['LB_BUN'])) / np.std(df['LB_BUN'])
    df['hemoglobin'] = (df['LB_HEMOGLOBIN'] - np.mean(df['LB_HEMOGLOBIN'])) / np.std(df['LB_HEMOGLOBIN'])
    df['hematocrit'] = (df['LB_HEMATOCRIT'] - np.mean(df['LB_HEMATOCRIT'])) / np.std(df['LB_HEMATOCRIT'])
    df['wbc'] = (df['LB_WBC'] - np.mean(df['LB_WBC'])) / np.std(df['LB_WBC'])
    df['creatinie'] = (df['LB_CREATINIE'] - np.mean(df['LB_CREATINIE'])) / np.std(df['LB_CREATINIE'])
    df['aptt'] = (df['LB_APTT'] - np.mean(df['LB_APTT'])) / np.std(df['LB_APTT'])
    df['inr'] = (df['LB_INR'] - np.mean(df['LB_INR'])) / np.std(df['LB_INR'])
    
    return df

In [9]:
def test_transform_var(df, train):
    # Using the training data to 
    df['age'] = (df['DEMO_AGE'] - np.mean(train['DEMO_AGE'])) / np.std(train['DEMO_AGE'])
    df['ich_vol'] = np.log(df['CT1_ICHVOL'] + 1)
    df['ivh_vol'] = np.log(df['CT1_IVHVOL'] + 1)
    df['gcs'] = (df['GCS_TTL'] - np.mean(train['GCS_TTL'])) / np.std(train['GCS_TTL'])
    df['nihss'] = np.log(df['NIHSS_TTL'] + 1)
    df['sbp'] = (df['BP_S0'] - np.mean(train['BP_S0'])) / np.std(train['BP_S0'])
    df['dbp'] = (df['BP_D0'] - np.mean(train['BP_D0'])) / np.std(train['BP_D0'])
    
    df['PP'] = df['BP_S0'] - df['BP_D0']
    df['MAP'] = df['BP_S0']/3 + df['BP_D0']*2/3
    df['pp'] = (df['PP'] -np.mean(train['PP'])) / np.std(train['PP'])
    df['map'] = (df['MAP'] - np.mean(train['MAP']))/np.std(train['MAP'])
    
    df['pc'] = (df['LB_PC'] - np.mean(train['LB_PC'])) / np.std(train['LB_PC'])
    df['glucose'] = (df['LB_GLUCOSE'] - np.mean(train['LB_GLUCOSE'])) / np.std(train['LB_GLUCOSE'])
    df['sodium'] = (df['LB_SODIUM'] - np.mean(train['LB_SODIUM'])) / np.std(train['LB_SODIUM'])
    df['potassium'] = (df['LB_POTASSIUM'] - np.mean(train['LB_POTASSIUM'])) / np.std(train['LB_POTASSIUM'])
    df['chloride'] = (df['LB_CHLORIDE'] - np.mean(train['LB_CHLORIDE'])) / np.std(train['LB_CHLORIDE'])
    df['cd'] = (df['LB_CD'] - np.mean(train['LB_CD'])) / np.std(train['LB_CD'])
    df['bun'] = (df['LB_BUN'] - np.mean(train['LB_BUN'])) / np.std(train['LB_BUN'])
    df['hemoglobin'] = (df['LB_HEMOGLOBIN'] - np.mean(train['LB_HEMOGLOBIN'])) / np.std(train['LB_HEMOGLOBIN'])
    df['hematocrit'] = (df['LB_HEMATOCRIT'] - np.mean(train['LB_HEMATOCRIT'])) / np.std(train['LB_HEMATOCRIT'])
    df['wbc'] = (df['LB_WBC'] - np.mean(train['LB_WBC'])) / np.std(train['LB_WBC'])
    df['creatinie'] = (df['LB_CREATINIE'] - np.mean(train['LB_CREATINIE'])) / np.std(train['LB_CREATINIE'])
    df['aptt'] = (df['LB_APTT'] - np.mean(train['LB_APTT'])) / np.std(train['LB_APTT'])
    df['inr'] = (df['LB_INR'] - np.mean(train['LB_INR'])) / np.std(train['LB_INR'])
    
    return df

# Propensity score matching

In [10]:
from causalml.match import NearestNeighborMatch, create_table_one
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score as auc
from sklearn import preprocessing

In [11]:
pre_treatment_var = ['age', 'ich_vol', 'ivh_vol', 'gcs', 'nihss', 'sbp', 'dbp', 'pp', 'map', 
                     'RACE_Asian', 'RACE_Black', 'RACE_Other', 'RACE_White', 'GENDER_Male', 'GENDER_Female',
                     'ETHNICITY_Hispanic', 'ETHNICITY_Non-Hispanic',
                     
                     'HIS_HYPERTENSION', 'HIS_HYPERLIPIDEMIA', 'HIS_DM2',
                     'HIS_DM1', 'HIS_HF', 'HIS_AF', 'HIS_PTCA', 'HIS_PV','HIS_MYOCARDIAL', 
                     'HIS_ANTIDIABETIC', 'HIS_ANTIHYPERTENSIVES',
                     
                     'CT1_ICHSIDE','ICHLOC_Basal Ganglia','ICHLOC_Lobar', 'ICHLOC_Other', 'ICHLOC_Thalamus', 
                     
                     'wbc', 'hemoglobin','hematocrit', 'pc', 'aptt', 'inr', 'glucose',
                     'sodium', 'potassium', 'chloride', 'cd', 'bun','creatinie']

In [12]:
source = df['source']
group = df['group']
df_new = df.copy()
df_new.drop(columns=['source', 'group'], inplace=True)
df_new = pd.get_dummies(df_new)
df_new['source'] = source
df_new['group'] = group
df_new['index'] = df['index']
trial = df_new[df_new['source'] == 'atach2'].copy()
trial_train = trial[trial['group'] == 'train'].copy()
trial_test = trial[trial['group'] == 'test'].copy()
trial_train = transform_var(trial_train)
trial_test = test_transform_var(trial_test, trial_train)

In [13]:
df_train = df_new[df_new['group'] == 'train'].copy()
df_train = transform_var(df_train)

In [14]:
len(df_train)

3506

In [15]:
eval_all = create_table_one(df_train, 'TREATMENT',pre_treatment_var)
a = 0
b = 0
# Calculate average smd on ERICH + ATACH2
for i in range(len(eval_all['SMD'])):
    if (eval_all['SMD'][i] !='') & ~(pd.isna(eval_all['SMD'][i])):
        a += abs(eval_all['SMD'][i])
        b += 1
a/b

0.26606956521739134

In [16]:
X = np.array(df_train[pre_treatment_var])
y = np.array(df_train['TREATMENT'])
pm = LogisticRegression(penalty='none', max_iter=3000)
pm.fit(X, y)
clip_bounds = (1e-3, 1-1e-3)
score_lgr = np.clip(pm.predict_proba(X)[:, 1], *clip_bounds)
print('AUC score: {:.6f}'.format(auc(y, score_lgr)))

AUC score: 0.918327


In [17]:
df_train['score'] = score_lgr

In [18]:
df_temp = df_train[~((df_train['TREATMENT'] == 0) & (df_train['source'] == 'atach2'))]

In [19]:
df_temp.reset_index(inplace=True, drop=True)
psm = NearestNeighborMatch(caliper=0.2, replace=False, ratio=1, random_state=11)
matched = psm.match(data=df_temp, treatment_col='TREATMENT',score_cols=['score'])

In [20]:
matched_erich = matched[matched['source'] == 'erich']

In [21]:
len(matched_erich)

134

In [22]:
matched_erich.columns

Index(['Unnamed: 0', 'TREATMENT', 'DEMO_AGE', 'OUTCOME_mRS90', 'CT1_ICHVOL',
       'CT1_IVHVOL', 'GCS_TTL', 'NIHSS_TTL', 'BP_S0', 'BP_D0',
       'HIS_HYPERTENSION', 'HIS_HYPERLIPIDEMIA', 'HIS_DM1', 'HIS_DM2',
       'HIS_HF', 'HIS_AF', 'HIS_PTCA', 'HIS_PV', 'HIS_MYOCARDIAL',
       'HIS_ANTIDIABETIC', 'HIS_ANTIHYPERTENSIVES', 'LB_WBC', 'LB_HEMOGLOBIN',
       'LB_HEMATOCRIT', 'LB_PC', 'LB_APTT', 'LB_INR', 'LB_GLUCOSE',
       'LB_SODIUM', 'LB_POTASSIUM', 'LB_CHLORIDE', 'LB_CD', 'LB_BUN',
       'LB_CREATINIE', 'CT1_ICHSIDE', 'index', 'RACE_Asian', 'RACE_Black',
       'RACE_Other', 'RACE_White', 'GENDER_Female', 'GENDER_Male',
       'ETHNICITY_Hispanic', 'ETHNICITY_Non-Hispanic', 'ICHLOC_Basal Ganglia',
       'ICHLOC_Lobar', 'ICHLOC_Other', 'ICHLOC_Thalamus', 'source', 'group',
       'age', 'ich_vol', 'ivh_vol', 'gcs', 'nihss', 'sbp', 'dbp', 'PP', 'MAP',
       'pp', 'map', 'pc', 'glucose', 'sodium', 'potassium', 'chloride', 'cd',
       'bun', 'hemoglobin', 'hematocrit', 'wbc', '

In [23]:
merge = pd.concat([trial_train, matched_erich])
pre_treatment_var = ['age', 'ich_vol', 'ivh_vol', 'gcs', 'nihss', 'sbp', 'dbp', 'pp', 'map', 
                     'RACE_Asian', 'RACE_Black', 'RACE_Other', 'RACE_White', 'GENDER_Male', 'GENDER_Female',
                     'ETHNICITY_Hispanic', 'ETHNICITY_Non-Hispanic',
                     
                     'HIS_HYPERTENSION', 'HIS_HYPERLIPIDEMIA', 'HIS_DM2',
                     'HIS_DM1', 'HIS_HF', 'HIS_AF', 'HIS_PTCA', 'HIS_PV','HIS_MYOCARDIAL', 
                     'HIS_ANTIDIABETIC', 'HIS_ANTIHYPERTENSIVES',
                     
                     'CT1_ICHSIDE','ICHLOC_Basal Ganglia', 'ICHLOC_Lobar', 'ICHLOC_Thalamus', 'ICHLOC_Other', 
                     
                     'wbc', 'hemoglobin','hematocrit', 'pc', 'aptt', 'inr', 'glucose',
                     'sodium', 'potassium', 'chloride', 'cd', 'bun','creatinie']
X = np.array(merge[pre_treatment_var])
y = np.array(merge['TREATMENT'])
pm_lgr = LogisticRegression(penalty='none', max_iter=3000)
pm_lgr.fit(X, y)
clip_bounds = (1e-3, 1-1e-3)
score_lgr = np.clip(pm_lgr.predict_proba(X)[:, 1], *clip_bounds)
print('AUC score: {:.6f}'.format(auc(y, score_lgr)))
merge['score'] = score_lgr

AUC score: 0.653936


In [24]:
len(merge), len(df_train)

(934, 3506)

In [25]:
eval_merge = create_table_one(merge, 'TREATMENT',pre_treatment_var)
a = 0
b = 0
# Calculate average smd on ERICH + ATACH2
for i in range(len(eval_merge['SMD'])):
    if (eval_merge['SMD'][i] !='') & ~(pd.isna(eval_merge['SMD'][i])):
        a += abs(eval_merge['SMD'][i])
        b += 1
a/b

0.06047826086956524

In [26]:
#matched_erich.to_csv('temp_data/matched_erich.csv')

In [27]:
# Calculate average smd on ERICH + ATACH2
eval_trial = create_table_one(trial_train, 'TREATMENT',pre_treatment_var)
a = 0
b = 0
for i in range(len(eval_trial['SMD'])):
    if (eval_trial['SMD'][i] !='') & ~(pd.isna(eval_trial['SMD'][i])):
        a += abs(eval_trial['SMD'][i])
        b += 1
a/b

0.04847826086956521

# Final model
selected model from CV:

tree:
- max_depth = 3
- n_reg = 5
- n_samples = 80
- min_treatment_samples = 30

Ensemble:
- max_depth = 3
- n_reg = 5
- n_samples = 80
- min_treatment_samples = 30

# Results (multiple seeds)

In [28]:
train = pd.concat([trial_train, matched_erich])
train_X = np.array(train[pre_treatment_var])
test_X = np.array(trial_test[pre_treatment_var])
train_treatment=(train['TREATMENT']!=0).astype(int).values
test_treatment=(trial_test['TREATMENT']!=0).astype(int).values
y_train = (train['OUTCOME_mRS90']).values
y_test = (trial_test['OUTCOME_mRS90']).values
y_bin_train = np.array([0] * len(y_train))
y_bin_train[np.where(y_train <= 3)] = 1
y_bin_test = np.array([0] * len(y_test))
y_bin_test[np.where(y_test <= 3)] = 1

In [29]:
train_X.shape

(934, 46)

In [30]:
treatment_train = ['control'] * len(train_treatment)
for i in range(len(train_treatment)):
    if train_treatment[i] == 1:
        treatment_train[i] = 'treatment'
treatment_train = np.array(treatment_train)
treatment_test = ['control'] * len(test_treatment)
for i in range(len(test_treatment)):
    if test_treatment[i] == 1:
        treatment_test[i] = 'treatment'
treatment_test = np.array(treatment_test)

In [31]:
test_X.shape, y_bin_test.shape, treatment_test.shape

((200, 46), (200,), (200,))

In [32]:
from uplift_forest_customed import UpliftTreeNew
model_tree = UpliftTreeNew(n_reg=5, min_samples_leaf = 80, min_samples_treatment = 30, random_state=100, control_name='control')
model_tree.fit(train_X, treatment_train, y_bin_train)
model_tree.eval_qini(test_X, y_bin_test, test_treatment)

0.050482793298807054

In [33]:
tree_rules = model_tree.get_rules()
len(tree_rules)

2

In [35]:
qini_max = -1
res = {'res_tree':[], 'res_lasso':[]}
for seed in range(0, 30):
    model_temp = CausalRuleEnsembling(
         tree_depth = 3, 
         tree_eval_func = 'KL', 
         n_reg=5, 
         n_estimator = 100,           
         min_samples_leaf =80, 
         min_samples_treatment = 30, 
         model_type='rl', 
         lin_standardise=False,
         random_state = seed)
    model_temp.fit(train_X, treatment_train, y_bin_train, pre_treatment_var)
    a, b = model_temp.eval_qini(test_X, y_bin_test, test_treatment)
    
    
    res['res_tree'].append(a)
    res['res_lasso'].append(b)
    print('Seed round:', seed, 'Qini:', b)
    if b > qini_max:
        qini_max = b
        final_model = model_temp

Seed round: 0 Qini: -0.01304955329039506
Seed round: 1 Qini: -0.008396843297814755
Seed round: 2 Qini: 0.004766745345914925
Seed round: 3 Qini: 0.034404258749558915
Seed round: 4 Qini: 0.005657889592928664
Seed round: 5 Qini: 0.009493979075801042
Seed round: 6 Qini: -0.007678108844869361
Seed round: 7 Qini: 0.021581647474610985
Seed round: 8 Qini: -0.014091666080206556
Seed round: 9 Qini: 0.047835559436617985
Seed round: 10 Qini: -0.016933942326576513
Seed round: 11 Qini: 0.016723279707115193
Seed round: 12 Qini: 0.010701978820323785
Seed round: 13 Qini: 0.023588723645631544
Seed round: 14 Qini: 0.049204094562025055
Seed round: 15 Qini: 0.03995023626569345
Seed round: 16 Qini: -0.018803611142520705
Seed round: 17 Qini: 0.019813678224592294
Seed round: 18 Qini: 0.019159188657065788
Seed round: 19 Qini: 7.575387859469125e-05
Seed round: 20 Qini: 0.03156256366835131
Seed round: 21 Qini: 0.00833355706044907
Seed round: 22 Qini: -0.0032294392779179905
Seed round: 23 Qini: 0.0004223339118331

In [36]:
np.mean(res['res_tree']),np.mean(res['res_lasso'])

(0.009600142665959875, 0.010749381470292913)

In [37]:
np.std(res['res_tree']),np.std(res['res_lasso'])

(0.020029408639315877, 0.01989059784251751)

In [41]:
with open('results/AE_tree_mrs2.txt', 'w') as filehandle:
    for listitem in res['res_tree']:
        filehandle.write(f'{listitem}\n')

with open('results/AE_ensemble_mrs2.txt', 'w') as filehandle:
    for listitem in res['res_lasso']:
        filehandle.write(f'{listitem}\n')

In [39]:
rules = final_model.get_rules()
rules = rules[rules['type']=='rule']
rules_ = rules[(rules['coef'] != 0)]

In [40]:
len(rules), len(rules_)

(213, 171)