In [1]:
from rulefit_uplift_forest import CausalRuleEnsembling
from sklearn.model_selection import KFold
import torch
import sdv
from sdv.tabular import CTGAN
from sdv.sampling import Condition
import pandas as pd
import numpy as np
import random
from sklearn.metrics import roc_auc_score as auc
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import miceforest as mf
import random

In [2]:
random.seed(1)
np.random.seed(1)

In [3]:
df = pd.read_csv('temp_data/imputed_all.csv')

In [4]:
df.columns

Index(['Unnamed: 0', 'TREATMENT', 'DEMO_AGE', 'OUTCOME_mRS90', 'CT1_ICHVOL',
       'CT1_IVHVOL', 'GCS_TTL', 'NIHSS_TTL', 'BP_S0', 'BP_D0',
       'HIS_HYPERTENSION', 'HIS_HYPERLIPIDEMIA', 'HIS_DM1', 'HIS_DM2',
       'HIS_HF', 'HIS_AF', 'HIS_PTCA', 'HIS_PV', 'HIS_MYOCARDIAL',
       'HIS_ANTIDIABETIC', 'HIS_ANTIHYPERTENSIVES', 'LB_WBC', 'LB_HEMOGLOBIN',
       'LB_HEMATOCRIT', 'LB_PC', 'LB_APTT', 'LB_INR', 'LB_GLUCOSE',
       'LB_SODIUM', 'LB_POTASSIUM', 'LB_CHLORIDE', 'LB_CD', 'LB_BUN',
       'LB_CREATINIE', 'RACE', 'GENDER', 'ETHNICITY', 'CT1_ICHSIDE', 'ICHLOC',
       'source', 'group', 'index'],
      dtype='object')

# Train our model on the trial only
1000 samples from RCT

Train-test split: 8:2

Test data will be only involved in the true trial data

In [5]:
def transform_var(df):
    df['age'] = (df['DEMO_AGE'] - np.mean(df['DEMO_AGE'])) / np.std(df['DEMO_AGE'])
    df['ich_vol'] = np.log(df['CT1_ICHVOL'] + 1)
    df['ivh_vol'] = np.log(df['CT1_IVHVOL'] + 1)
    df['gcs'] = (df['GCS_TTL'] - np.mean(df['GCS_TTL'])) / np.std(df['GCS_TTL'])
    df['nihss'] = np.log(df['NIHSS_TTL'] + 1)
    df['sbp'] = (df['BP_S0'] - np.mean(df['BP_S0'])) / np.std(df['BP_S0'])
    df['dbp'] = (df['BP_D0'] - np.mean(df['BP_D0'])) / np.std(df['BP_D0'])
    df['PP'] = df['BP_S0'] - df['BP_D0']
    df['MAP'] = df['BP_S0']/3 + df['BP_D0']*2/3
    df['pp'] = (df['PP'] -np.mean(df['PP'])) / np.std(df['PP'])
    df['map'] = (df['MAP'] - np.mean(df['MAP']))/np.std(df['MAP'])
    
    df['pc'] = (df['LB_PC'] - np.mean(df['LB_PC'])) / np.std(df['LB_PC'])
    df['glucose'] = (df['LB_GLUCOSE'] - np.mean(df['LB_GLUCOSE'])) / np.std(df['LB_GLUCOSE'])
    df['sodium'] = (df['LB_SODIUM'] - np.mean(df['LB_SODIUM'])) / np.std(df['LB_SODIUM'])
    df['potassium'] = (df['LB_POTASSIUM'] - np.mean(df['LB_POTASSIUM'])) / np.std(df['LB_POTASSIUM'])
    df['chloride'] = (df['LB_CHLORIDE'] - np.mean(df['LB_CHLORIDE'])) / np.std(df['LB_CHLORIDE'])
    df['cd'] = (df['LB_CD'] - np.mean(df['LB_CD'])) / np.std(df['LB_CD'])
    df['bun'] = (df['LB_BUN'] - np.mean(df['LB_BUN'])) / np.std(df['LB_BUN'])
    df['hemoglobin'] = (df['LB_HEMOGLOBIN'] - np.mean(df['LB_HEMOGLOBIN'])) / np.std(df['LB_HEMOGLOBIN'])
    df['hematocrit'] = (df['LB_HEMATOCRIT'] - np.mean(df['LB_HEMATOCRIT'])) / np.std(df['LB_HEMATOCRIT'])
    df['wbc'] = (df['LB_WBC'] - np.mean(df['LB_WBC'])) / np.std(df['LB_WBC'])
    df['creatinie'] = (df['LB_CREATINIE'] - np.mean(df['LB_CREATINIE'])) / np.std(df['LB_CREATINIE'])
    df['aptt'] = (df['LB_APTT'] - np.mean(df['LB_APTT'])) / np.std(df['LB_APTT'])
    df['inr'] = (df['LB_INR'] - np.mean(df['LB_INR'])) / np.std(df['LB_INR'])
    
    return df

In [6]:
def test_transform_var(df, train):
    # Using the training data to 
    df['age'] = (df['DEMO_AGE'] - np.mean(train['DEMO_AGE'])) / np.std(train['DEMO_AGE'])
    df['ich_vol'] = np.log(df['CT1_ICHVOL'] + 1)
    df['ivh_vol'] = np.log(df['CT1_IVHVOL'] + 1)
    df['gcs'] = (df['GCS_TTL'] - np.mean(train['GCS_TTL'])) / np.std(train['GCS_TTL'])
    df['nihss'] = np.log(df['NIHSS_TTL'] + 1)
    df['sbp'] = (df['BP_S0'] - np.mean(train['BP_S0'])) / np.std(train['BP_S0'])
    df['dbp'] = (df['BP_D0'] - np.mean(train['BP_D0'])) / np.std(train['BP_D0'])
    
    df['PP'] = df['BP_S0'] - df['BP_D0']
    df['MAP'] = df['BP_S0']/3 + df['BP_D0']*2/3
    df['pp'] = (df['PP'] -np.mean(train['PP'])) / np.std(train['PP'])
    df['map'] = (df['MAP'] - np.mean(train['MAP']))/np.std(train['MAP'])
    
    df['pc'] = (df['LB_PC'] - np.mean(train['LB_PC'])) / np.std(train['LB_PC'])
    df['glucose'] = (df['LB_GLUCOSE'] - np.mean(train['LB_GLUCOSE'])) / np.std(train['LB_GLUCOSE'])
    df['sodium'] = (df['LB_SODIUM'] - np.mean(train['LB_SODIUM'])) / np.std(train['LB_SODIUM'])
    df['potassium'] = (df['LB_POTASSIUM'] - np.mean(train['LB_POTASSIUM'])) / np.std(train['LB_POTASSIUM'])
    df['chloride'] = (df['LB_CHLORIDE'] - np.mean(train['LB_CHLORIDE'])) / np.std(train['LB_CHLORIDE'])
    df['cd'] = (df['LB_CD'] - np.mean(train['LB_CD'])) / np.std(train['LB_CD'])
    df['bun'] = (df['LB_BUN'] - np.mean(train['LB_BUN'])) / np.std(train['LB_BUN'])
    df['hemoglobin'] = (df['LB_HEMOGLOBIN'] - np.mean(train['LB_HEMOGLOBIN'])) / np.std(train['LB_HEMOGLOBIN'])
    df['hematocrit'] = (df['LB_HEMATOCRIT'] - np.mean(train['LB_HEMATOCRIT'])) / np.std(train['LB_HEMATOCRIT'])
    df['wbc'] = (df['LB_WBC'] - np.mean(train['LB_WBC'])) / np.std(train['LB_WBC'])
    df['creatinie'] = (df['LB_CREATINIE'] - np.mean(train['LB_CREATINIE'])) / np.std(train['LB_CREATINIE'])
    df['aptt'] = (df['LB_APTT'] - np.mean(train['LB_APTT'])) / np.std(train['LB_APTT'])
    df['inr'] = (df['LB_INR'] - np.mean(train['LB_INR'])) / np.std(train['LB_INR'])
    
    return df

In [7]:
pre_treatment_var = ['age', 'ich_vol', 'ivh_vol', 'gcs', 'nihss', 'sbp', 'dbp', 'pp', 'map', 
                     'RACE_Asian', 'RACE_Black', 'RACE_Other', 'RACE_White', 'GENDER_Male', 'GENDER_Female',
                     'ETHNICITY_Hispanic','ETHNICITY_Non-Hispanic',
                     
                     'HIS_HYPERTENSION', 'HIS_HYPERLIPIDEMIA', 'HIS_DM2',
                     'HIS_DM1', 'HIS_HF', 'HIS_AF', 'HIS_PTCA', 'HIS_PV','HIS_MYOCARDIAL', 
                     'HIS_ANTIDIABETIC', 'HIS_ANTIHYPERTENSIVES',
                     
                     'CT1_ICHSIDE','ICHLOC_Basal Ganglia', 'ICHLOC_Lobar', 'ICHLOC_Thalamus', 'ICHLOC_Other',
                     
                     'wbc', 'hemoglobin','hematocrit', 'pc', 'aptt', 'inr', 'glucose',
                     'sodium', 'potassium', 'chloride', 'cd', 'bun','creatinie']

In [8]:
len(pre_treatment_var)

46

In [9]:
group = df['group']
source = df['source']
df.drop(columns=['source', 'group'])
df = pd.get_dummies(df)

In [10]:
len(df)

3706

In [11]:
df['group'] = group
df['source'] = source
df_train = df[(df['group'] == 'train')& (df['source'] == 'atach2')].copy()
df_train = transform_var(df_train)
df_test = df[(df['group'] == 'test')].copy()
df_test = test_transform_var(df_test, df_train)

In [12]:
X = np.array(df_train[pre_treatment_var])
y = np.array(df_train['TREATMENT'])
pm_lgr = LogisticRegression(penalty='none', max_iter=3000)
pm_lgr.fit(X, y)
clip_bounds = (1e-3, 1-1e-3)
score_lgr2 = np.clip(pm_lgr.predict_proba(X)[:, 1], *clip_bounds)
print('AUC score: {:.6f}'.format(auc(y, score_lgr2)))

AUC score: 0.620841


In [13]:
len(df_train)

800

In [14]:
df_train.reset_index(inplace=True, drop=True)
df_train['outcome'] = [0] * len(df_train)
for i in range(len(df_train)):
    if df_train.loc[i, 'OUTCOME_mRS90'] <= 2:
        df_train.loc[i, 'outcome'] = 1

In [15]:
train_X = np.array(df_train[pre_treatment_var])
train_treatment=(df_train['TREATMENT']!=0).astype(int).values
y_train = (df_train['OUTCOME_mRS90']).values
y_bin_train = np.array([0] * len(y_train))
y_bin_train[np.where(y_train <= 3)] = 1
round_idx = 1
depth = [3]
reg = [3, 5, 10]
min_samples = [50, 80, 100]
min_trts = [20, 30, 40]

res = {1:[[],[]], 2:[[],[]], 3:[[], []], 4: [[], []],5:[[],[]], 6:[[],[]], 7:[[], []], 8:[[], []], 9: [[], []]}
res2 = {1:[[],[]], 2:[[],[]], 3:[[], []], 4: [[], []],5:[[],[]], 6:[[],[]], 7: [[], []], 8: [[], []], 9: [[], []]}

seed = 100
kf = KFold(n_splits=4)
for train_index, test_index in kf.split(train_X):
    print('Round:', round_idx)
    idx = 1
    X_train, X_test = train_X[train_index], train_X[test_index]
    y_train, y_test = y_bin_train[train_index], y_bin_train[test_index]
    treatment_train, treatment_test = train_treatment[train_index], train_treatment[test_index]
    if sum(treatment_test) == 0:
        continue
    
    trt_train = [0] * len(treatment_train)
    for i in range(len(treatment_train)):
        if treatment_train[i] != 0:
            trt_train[i] = 'treatment'
        else:
            trt_train[i] = 'control'
    trt_train = np.array(trt_train)
    
    for d in depth:
        for r in reg:
            for j in range(len(min_samples)):
                n_samples = min_samples[j]
                n_trts = min_trts[j]
            
                # train the model
                model = CausalRuleEnsembling(tree_depth = d, n_reg=r, n_estimator=100, min_samples_leaf = n_samples, min_samples_treatment = n_trts, model_type='rl', lin_standardise=False, random_state = seed,max_iter=5000)
                model.fit(X_train, trt_train, y_train, pre_treatment_var)
                qini_tree, qini_last = model.eval_qini(X_test, y_test, treatment_test)

            
                res[idx][0].append(qini_tree)
                res[idx][1].append(qini_last)
            
                res_temp = ['Depth:' + str(d), 'n_reg:'+str(r),'n_samples:'+str(n_samples), 'n_trt:'+str(), 'qini1:'+str(qini_tree), 'qini_final:'+str(qini_last)]
                print(','.join(res_temp))
                idx += 1
    round_idx += 1

Round: 1
Depth:3,n_reg:3,n_samples:50,n_trt:,qini1:-0.03023572921615637,qini_final:-0.0293871052500526
Depth:3,n_reg:3,n_samples:80,n_trt:,qini1:-0.012795302407931284,qini_final:-0.011422652806374002
Depth:3,n_reg:3,n_samples:100,n_trt:,qini1:-0.010845397508272072,qini_final:-0.007980481939689317
Depth:3,n_reg:5,n_samples:50,n_trt:,qini1:-0.029933666704143665,qini_final:-0.029408127387478354
Depth:3,n_reg:5,n_samples:80,n_trt:,qini1:-0.008706506715310862,qini_final:-0.009079083263731896
Depth:3,n_reg:5,n_samples:100,n_trt:,qini1:-0.010030598578039814,qini_final:-0.010782257110078346
Depth:3,n_reg:10,n_samples:50,n_trt:,qini1:-0.019757827123614735,qini_final:-0.02226795237428512
Depth:3,n_reg:10,n_samples:80,n_trt:,qini1:-0.00800883074301827,qini_final:-0.010832541973097473
Depth:3,n_reg:10,n_samples:100,n_trt:,qini1:-0.011256051715967141,qini_final:-0.011899503900457013
Round: 2
Depth:3,n_reg:3,n_samples:50,n_trt:,qini1:-0.0016200778700535454,qini_final:-0.0030217391876908614
Depth:3,n

In [16]:
mu_qini = []
mu_qini_tree = []
sd_qini = []
sd_qini_tree = []
for i in range(9):
    idx = i+1
    mu_qini.append(np.mean(res[idx][1]))
    mu_qini_tree.append(np.mean(res[idx][0]))
    sd_qini.append(np.std(res[idx][1]))
    sd_qini_tree.append(np.mean(res[idx][0]))

In [17]:
print(mu_qini)
print(mu_qini_tree)

[-0.02580088542331132, -0.01204570389603924, -0.014256697209282729, -0.027753047482507405, -0.009881755690203088, -0.015436437582756441, -0.024398053127706487, -0.012047257150991225, -0.01703784845892397]
[-0.026109280058092613, -0.0116801613582252, -0.014046623722128473, -0.028008942885926535, -0.009920475508312999, -0.015900163725434617, -0.021394126270180037, -0.011819498067029664, -0.017640173190694292]


- n_reg = 5
- minimum_samples_leaf = 80
- minimum_treatment_leaf = 30

# Final model

In [18]:
from uplift_forest_customed import UpliftTreeNew

In [19]:
sum(df_train['TREATMENT']), len(df_train)

(397, 800)

In [20]:
train_X = np.array(df_train[pre_treatment_var])
test_X = np.array(df_test[pre_treatment_var])
train_treatment=(df_train['TREATMENT']!=0).astype(int).values
test_treatment=(df_test['TREATMENT']!=0).astype(int).values
y_train = (df_train['OUTCOME_mRS90']).values
y_test = (df_test['OUTCOME_mRS90']).values
y_bin_train = np.array([0] * len(y_train))
y_bin_train[np.where(y_train <= 2)] = 1

y_bin_test = np.array([0] * len(y_test))
y_bin_test[np.where(y_test <= 2)] = 1

treatment_train = ['control'] * len(train_treatment)
for i in range(len(train_treatment)):
    if train_treatment[i] == 1:
        treatment_train[i] = 'treatment'
treatment_train = np.array(treatment_train)

In [21]:
model_tree = UpliftTreeNew(n_reg=5, min_samples_leaf = 80, min_samples_treatment = 30, random_state=100, control_name='control')
model_tree.fit(train_X, treatment_train, y_bin_train)

In [22]:
model_tree.eval_qini(test_X, y_bin_test, test_treatment)

-0.10869551343553817

In [23]:
rules_tree = model_tree.get_rules()

In [24]:
len(rules_tree)

2

In [25]:
qini_max = -1
res = {'res_tree':[], 'res_lasso':[]}
for seed in range(30):
    model_temp = CausalRuleEnsembling(
         tree_depth = 3, 
         tree_eval_func = 'KL', 
         n_reg=5, 
         n_estimator = 100,           
         min_samples_leaf = 80, 
         min_samples_treatment = 30, 
         model_type='rl', 
         lin_standardise=False,
         random_state = seed)
    model_temp.fit(train_X, treatment_train, y_bin_train, pre_treatment_var)
    a, b = model_temp.eval_qini(test_X, y_bin_test, test_treatment)
    
    res['res_tree'].append(a)
    res['res_lasso'].append(b)
    print('Seed round:', seed, 'Tree qini:',a ,', Qini:', b)
    if b > qini_max:
        qini_max = b
        final_model = model_temp

Seed round: 0 Tree qini: -0.09240202871521223 , Qini: -0.09651595532190302
Seed round: 1 Tree qini: -0.05031585647291663 , Qini: -0.0518274844364105
Seed round: 5 Tree qini: -0.10053120347192737 , Qini: -0.09971050219229427
Seed round: 6 Tree qini: -0.04400682290767284 , Qini: -0.04743250892702362
Seed round: 7 Tree qini: -0.08055519582527883 , Qini: -0.07735886807375444
Seed round: 8 Tree qini: -0.06757098066277137 , Qini: -0.07009382181028753
Seed round: 9 Tree qini: -0.05182958510703443 , Qini: -0.0546893963993483
Seed round: 10 Tree qini: -0.04077406798766745 , Qini: -0.03915990686129929
Seed round: 11 Tree qini: -0.12352328128921689 , Qini: -0.12536596843659176
Seed round: 12 Tree qini: -0.04415614783845404 , Qini: -0.04402673403577505
Seed round: 13 Tree qini: -0.054683589138346776 , Qini: -0.05650624692159867
Seed round: 14 Tree qini: -0.10584440749584491 , Qini: -0.10674412054313301
Seed round: 15 Tree qini: -0.12614827840275888 , Qini: -0.12086640131699657
Seed round: 16 Tree 

In [27]:
np.mean(res['res_tree']),np.mean(res['res_lasso'])

(-0.07782866016013966, -0.07757704679560487)

In [28]:
np.std(res['res_tree']),np.std(res['res_lasso'])

(0.028990701645846224, 0.02846629630077618)

In [29]:
with open('results/A_tree_mrs2.txt', 'w') as filehandle:
    for listitem in res['res_tree']:
        filehandle.write(f'{listitem}\n')

In [30]:
with open('results/A_ensemble_mrs2.txt', 'w') as filehandle:
    for listitem in res['res_lasso']:
        filehandle.write(f'{listitem}\n')

In [31]:
rules = final_model.get_rules()
rules = rules[rules['type'] == 'rule']
rules_ = rules[(rules['coef'] != 0)]

In [32]:
len(rules), len(rules_)

(191, 167)