In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import DataFrame
from sklearn.preprocessing import MinMaxScaler

In [2]:
train_df_origin = pd.read_csv('../../data/Sepsis_imp_train.csv')
test_df_origin = pd.read_csv('../../data/Sepsis_imp_test.csv')

In [3]:
train_df_origin = train_df_origin[train_df_origin.input_total_tev >= 0]
test_df_origin = test_df_origin[test_df_origin.input_total_tev >= 0]

In [4]:
train_df = train_df_origin.copy()
test_df = test_df_origin.copy()

In [5]:
binary_fields = ['gender','mechvent','re_admission']
norm_fields= ['age','Weight_kg','GCS','HR','SysBP','MeanBP','DiaBP','RR','Temp_C','FiO2_1',
    'Potassium','Sodium','Chloride','Glucose','Magnesium','Calcium',
    'Hb','WBC_count','Platelets_count','PTT','PT','Arterial_pH','paO2','paCO2',
    'Arterial_BE','HCO3','Arterial_lactate','SOFA','SIRS','Shock_Index',
    'PaO2_FiO2','cumulated_balance_tev', 'elixhauser', 'Albumin', u'CO2_mEqL', 'Ionised_Ca']
log_fields = ['max_dose_vaso','SpO2','BUN','Creatinine','SGOT','SGPT','Total_bili','INR',
              'input_total_tev','input_4hourly_tev','output_total','output_4hourly', 'bloc']

In [6]:
del train_df['charttime']
del test_df['charttime']
train_df.head()

Unnamed: 0,bloc,icustayid,gender,age,elixhauser,re_admission,SOFA,SIRS,Weight_kg,GCS,...,input_total_tev,input_4hourly_tev,output_total,output_4hourly,cumulated_balance_tev,sedation,mechvent,rrt,died_in_hosp,mortality_90d
0,1.0,12.0,1.0,12049.217303,0.0,0.0,7.0,1.0,51.200001,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,12.0,1.0,12049.217303,0.0,0.0,3.0,2.0,51.200001,15.0,...,0.0,0.0,650.0,650.0,-650.0,0.0,0.0,0.0,0.0,0.0
2,3.0,12.0,1.0,12049.217303,0.0,0.0,2.0,2.0,51.200001,15.0,...,0.0,0.0,1200.0,550.0,-1200.0,0.0,0.0,0.0,0.0,0.0
3,4.0,12.0,1.0,12049.217303,0.0,0.0,5.0,2.0,51.200001,15.0,...,0.0,0.0,1200.0,0.0,-1200.0,0.0,0.0,0.0,0.0,0.0
4,1.0,14.0,0.0,30946.97,2.0,0.0,5.0,2.0,56.872728,3.571429,...,1300.0,1300.0,340.0,160.0,960.0,1.0,1.0,0.0,0.0,1.0


In [7]:
train_df.keys()
# len(binary_fields + norm_fields + log_fields)

Index(['bloc', 'icustayid', 'gender', 'age', 'elixhauser', 're_admission',
       'SOFA', 'SIRS', 'Weight_kg', 'GCS', 'HR', 'SysBP', 'MeanBP', 'DiaBP',
       'Shock_Index', 'RR', 'SpO2', 'Temp_C', 'FiO2_1', 'Potassium', 'Sodium',
       'Chloride', 'Glucose', 'BUN', 'Creatinine', 'Magnesium', 'Calcium',
       'Ionised_Ca', 'CO2_mEqL', 'SGOT', 'SGPT', 'Total_bili', 'Albumin', 'Hb',
       'WBC_count', 'Platelets_count', 'PTT', 'PT', 'INR', 'Arterial_pH',
       'paO2', 'paCO2', 'Arterial_BE', 'Arterial_lactate', 'HCO3', 'PaO2_FiO2',
       'median_dose_vaso', 'max_dose_vaso', 'input_total_tev',
       'input_4hourly_tev', 'output_total', 'output_4hourly',
       'cumulated_balance_tev', 'sedation', 'mechvent', 'rrt', 'died_in_hosp',
       'mortality_90d'],
      dtype='object')

In [8]:
train_df[binary_fields] = train_df[binary_fields] - 0.5 
test_df[binary_fields] = test_df[binary_fields] - 0.5

In [9]:
train_df[binary_fields].head()

Unnamed: 0,gender,mechvent,re_admission
0,0.5,-0.5,-0.5
1,0.5,-0.5,-0.5
2,0.5,-0.5,-0.5
3,0.5,-0.5,-0.5
4,-0.5,0.5,-0.5


In [10]:
# normal distn fields
for item in norm_fields:
    av = train_df[item].mean()
    std = train_df[item].std()
    train_df[item] = (train_df[item] - av) / std
    test_df[item] = (test_df[item] - av) / std

In [11]:
train_df.head()

Unnamed: 0,bloc,icustayid,gender,age,elixhauser,re_admission,SOFA,SIRS,Weight_kg,GCS,...,input_total_tev,input_4hourly_tev,output_total,output_4hourly,cumulated_balance_tev,sedation,mechvent,rrt,died_in_hosp,mortality_90d
0,1.0,12.0,0.5,-1.881361,-1.817981,-0.5,0.212787,-0.544145,-1.303007,0.760683,...,0.0,0.0,0.0,0.0,-0.324079,0.0,-0.5,0.0,0.0,0.0
1,2.0,12.0,0.5,-1.881361,-1.817981,-0.5,-0.936695,0.406064,-1.303007,0.760683,...,0.0,0.0,650.0,650.0,-0.403752,0.0,-0.5,0.0,0.0,0.0
2,3.0,12.0,0.5,-1.881361,-1.817981,-0.5,-1.224065,0.406064,-1.303007,0.760683,...,0.0,0.0,1200.0,550.0,-0.471168,0.0,-0.5,0.0,0.0,0.0
3,4.0,12.0,0.5,-1.881361,-1.817981,-0.5,-0.361954,0.406064,-1.303007,0.760683,...,0.0,0.0,1200.0,0.0,-0.471168,0.0,-0.5,0.0,0.0,0.0
4,1.0,14.0,-0.5,1.185671,-0.875948,-0.5,-0.361954,0.406064,-1.07066,-2.411634,...,1300.0,1300.0,340.0,160.0,-0.206408,1.0,0.5,0.0,0.0,1.0


In [12]:
# log normal fields
train_df[log_fields] = np.log(0.1 + train_df[log_fields])
test_df[log_fields] = np.log(0.1 + test_df[log_fields])

In [13]:
train_df.head()

Unnamed: 0,bloc,icustayid,gender,age,elixhauser,re_admission,SOFA,SIRS,Weight_kg,GCS,...,input_total_tev,input_4hourly_tev,output_total,output_4hourly,cumulated_balance_tev,sedation,mechvent,rrt,died_in_hosp,mortality_90d
0,0.09531,12.0,0.5,-1.881361,-1.817981,-0.5,0.212787,-0.544145,-1.303007,0.760683,...,-2.302585,-2.302585,-2.302585,-2.302585,-0.324079,0.0,-0.5,0.0,0.0,0.0
1,0.741937,12.0,0.5,-1.881361,-1.817981,-0.5,-0.936695,0.406064,-1.303007,0.760683,...,-2.302585,-2.302585,6.477126,6.477126,-0.403752,0.0,-0.5,0.0,0.0,0.0
2,1.131402,12.0,0.5,-1.881361,-1.817981,-0.5,-1.224065,0.406064,-1.303007,0.760683,...,-2.302585,-2.302585,7.09016,6.3101,-0.471168,0.0,-0.5,0.0,0.0,0.0
3,1.410987,12.0,0.5,-1.881361,-1.817981,-0.5,-0.361954,0.406064,-1.303007,0.760683,...,-2.302585,-2.302585,7.09016,-2.302585,-0.471168,0.0,-0.5,0.0,0.0,0.0
4,0.09531,14.0,-0.5,1.185671,-0.875948,-0.5,-0.361954,0.406064,-1.07066,-2.411634,...,7.170196,7.170196,5.82924,5.075799,-0.206408,1.0,0.5,0.0,0.0,1.0


In [14]:
for item in log_fields:
    av = train_df[item].mean()
    std = train_df[item].std()
    train_df[item] = (train_df[item] - av) / std
    test_df[item] = (test_df[item] - av) / std

In [15]:
train_df.head()

Unnamed: 0,bloc,icustayid,gender,age,elixhauser,re_admission,SOFA,SIRS,Weight_kg,GCS,...,input_total_tev,input_4hourly_tev,output_total,output_4hourly,cumulated_balance_tev,sedation,mechvent,rrt,died_in_hosp,mortality_90d
0,-2.282786,12.0,0.5,-1.881361,-1.817981,-0.5,0.212787,-0.544145,-1.303007,0.760683,...,-3.604185,-1.554197,-2.738036,-1.965391,-0.324079,0.0,-0.5,0.0,0.0,0.0
1,-1.449146,12.0,0.5,-1.881361,-1.817981,-0.5,-0.936695,0.406064,-1.303007,0.760683,...,-3.604185,-1.554197,-0.062236,0.756577,-0.403752,0.0,-0.5,0.0,0.0,0.0
2,-0.947042,12.0,0.5,-1.881361,-1.817981,-0.5,-1.224065,0.406064,-1.303007,0.760683,...,-3.604185,-1.554197,0.124598,0.704794,-0.471168,0.0,-0.5,0.0,0.0,0.0
3,-0.586597,12.0,0.5,-1.881361,-1.817981,-0.5,-0.361954,0.406064,-1.303007,0.760683,...,-3.604185,-1.554197,0.124598,-1.965391,-0.471168,0.0,-0.5,0.0,0.0,0.0
4,-2.282786,14.0,-0.5,1.185671,-0.875948,-0.5,-0.361954,0.406064,-1.07066,-2.411634,...,-0.174578,1.141059,-0.259693,0.322124,-0.206408,1.0,0.5,0.0,0.0,1.0


In [16]:
scaled_train_df = pd.DataFrame(MinMaxScaler().fit_transform(train_df), columns=train_df.keys())

In [17]:
scaled_train_df.head()

Unnamed: 0,bloc,icustayid,gender,age,elixhauser,re_admission,SOFA,SIRS,Weight_kg,GCS,...,input_total_tev,input_4hourly_tev,output_total,output_4hourly,cumulated_balance_tev,sedation,mechvent,rrt,died_in_hosp,mortality_90d
0,0.0,0.0,1.0,0.203388,0.0,0.0,0.304348,0.25,0.183842,1.0,...,0.0,0.0,0.0,0.0,0.177479,0.0,0.0,0.0,0.0,0.0
1,0.22256,0.0,1.0,0.203388,0.0,0.0,0.130435,0.5,0.183842,1.0,...,0.0,0.0,0.616278,0.782139,0.175772,0.0,0.0,0.0,0.0,0.0
2,0.356608,0.0,1.0,0.203388,0.0,0.0,0.086957,0.5,0.183842,1.0,...,0.0,0.0,0.659309,0.76726,0.174328,0.0,0.0,0.0,0.0,0.0
3,0.452837,0.0,1.0,0.203388,0.0,0.0,0.217391,0.5,0.183842,1.0,...,0.0,0.0,0.659309,0.0,0.174328,0.0,0.0,0.0,0.0,0.0
4,0.0,2e-05,0.0,0.90901,0.142857,0.0,0.217391,0.5,0.204211,0.047619,...,0.632976,0.822795,0.570801,0.657302,0.179999,1.0,1.0,0.0,0.0,1.0


In [18]:
scaled_test_df = pd.DataFrame(MinMaxScaler().fit_transform(test_df), columns=test_df.keys())

In [19]:
scaled_test_df.head()

Unnamed: 0,bloc,icustayid,gender,age,elixhauser,re_admission,SOFA,SIRS,Weight_kg,GCS,...,input_total_tev,input_4hourly_tev,output_total,output_4hourly,cumulated_balance_tev,sedation,mechvent,rrt,died_in_hosp,mortality_90d
0,0.0,0.0,0.0,0.3779,0.230769,0.0,0.545455,0.75,0.926956,0.40625,...,0.650113,0.802778,0.0,0.0,0.601591,0.0,0.0,0.0,0.0,0.0
1,0.22256,0.0,0.0,0.3779,0.230769,0.0,0.363636,0.75,0.926956,0.375,...,0.730161,0.867793,0.565062,0.747928,0.607028,0.0,0.0,0.0,0.0,0.0
2,0.356608,0.0,0.0,0.3779,0.230769,0.0,0.363636,1.0,0.926956,0.34375,...,0.759614,0.844265,0.600596,0.713579,0.611279,0.0,0.0,0.0,0.0,0.0
3,0.452837,0.0,0.0,0.3779,0.230769,0.0,0.318182,1.0,0.926956,0.3125,...,0.780587,0.845492,0.623478,0.711891,0.615618,0.0,0.0,0.0,0.0,0.0
4,0.527957,0.0,0.0,0.3779,0.230769,0.0,0.318182,1.0,0.926956,0.28125,...,0.796914,0.846569,0.643104,0.726051,0.619879,0.0,0.0,0.0,0.0,0.0


In [20]:
# compute reward
def compute_r(df):
    reward = np.zeros((df.shape[0], 1))
    dided_in_hosp = df.groupby(['icustayid']).last().loc[:,['died_in_hosp']].values
    stay_lens = df.groupby(['icustayid']).count().loc[:,['bloc']].values
    cum = 0
    for i, stay_len in enumerate(stay_lens):
        cum += stay_len
        if dided_in_hosp[i] == 0:
            reward[cum - 1] = 15
        else:
            reward[cum - 1] = -15
    print (cum)
    return reward

In [24]:
def compute_r2(df):
    keys = lambda x: x['icustayid']
    rewards = lambda x: 15 if x.iloc[-1] == 0 else -15
    return train_df.groupby('icustayid')['died_in_hosp'].transform(rewards)

In [22]:
# put back 
scaled_train_df['died_in_hosp'] = train_df_origin['died_in_hosp']
scaled_train_df['icustayid'] = train_df_origin['icustayid']
scaled_test_df['died_in_hosp'] = test_df_origin['died_in_hosp']
scaled_test_df['icustayid'] = test_df_origin['icustayid']

In [25]:
scaled_train_df['reward'] = compute_r2(train_df)
scaled_test_df['reward'] = compute_r2(test_df)

In [None]:
scaled_train_df.to_csv('../../data/train_scaled.csv',index = False)
scaled_test_df.to_csv('../../data/test_scaled.csv', index = False)

In [None]:
from discretize_sepsis_actions import discretize_actions

### train

In [None]:
train_df_origin.shape, scaled_train_df.shape

In [None]:
action_seq, md, input4 = discretize_actions(train_df_origin.loc[:,'input_4hourly_tev'], train_df_origin.loc[:,'median_dose_vaso']) 

In [None]:
# vaso_input = action_seq % 5.
# iv_input = action_seq // 5
vaso_input = action_seq // 5
iv_input = action_seq % 5

In [None]:
pd.Series(vaso_input).value_counts()

In [None]:
scaled_train_df['vaso_input'] = vaso_input
scaled_train_df['iv_input'] = iv_input

In [None]:
scaled_train_df[scaled_train_df['icustayid'] == 14]

### test

In [None]:
action_seq, md, input4 = discretize_actions(test_df_origin.loc[:,'input_4hourly_tev'], test_df_origin.loc[:,'median_dose_vaso']) 

In [None]:
vaso_input = action_seq // 5
iv_input = action_seq % 5

In [None]:
scaled_test_df['vaso_input'] = vaso_input
scaled_test_df['iv_input'] = iv_input

In [None]:
scaled_test_df.head()

In [None]:
scaled_train_df.to_csv('../../data/train_scaled.csv',index = False)
scaled_test_df.to_csv('../../data/test_scaled.csv', index = False)

### Intermediate reward & MOE classifer input features

In [None]:
scaled_train_df = pd.read_csv('../../data/train_scaled.csv')
scaled_test_df = pd.read_csv('../../data/test_scaled.csv')

In [None]:
scaled_train_df.head()

In [None]:
# compute change in sofa, and lactate
def compute_sofa_lactate_delta(df):
    c0 = -0.1/4
    c1 = -0.5/4
    c2 = -2
    shaped_rewards, delta_sofa, delta_lactate = [0], [0], [0]
    for i in df.index:
        if i == 0:
            continue
        if df.loc[i, 'icustayid'] == df.loc[i-1, 'icustayid']:
            sofa_cur = df.loc[i,'SOFA']
            sofa_prev = df.loc[i-1,'SOFA']
            lact_cur = df.loc[i,'Arterial_lactate']
            lact_prev = df.loc[i-1,'Arterial_lactate']
            delta_sofa += [sofa_cur - sofa_prev]
            delta_lactate += [lact_cur - lact_prev]
            reward = 0
            if sofa_cur == sofa_prev and sofa_cur != 0:
                reward += c0
            reward += c1*(sofa_cur-sofa_prev)
            reward += c2*np.tanh(lact_cur - lact_prev)
            shaped_rewards += [reward]
        else:
            delta_sofa += [0]
            delta_lactate += [0]
            shaped_rewards += [0]
    return delta_sofa, delta_lactate, shaped_rewards

In [None]:
origin_train_df = pd.read_csv('../../data/origin_train_clean.csv')
origin_test_df = pd.read_csv('../../data/origin_test_clean.csv')

In [None]:
def build_input_features(df):
    delta_sofa, delta_lactate, shaped_rewards = compute_sofa_lactate_delta(df)
    icuids = []
    for i in df.index:
        if i == 0:
            continue
        if df.loc[i, 'icustayid'] == df.loc[i-1, 'icustayid'] and df.loc[i, 'bloc'] != df.loc[i-1, 'bloc'] + 1:
            icuids += [ df.loc[i, 'icustayid'] ]
    icuids = list(set(icuids))
    for icuid in icuids:
        for i, idx in enumerate(df.loc[df['icustayid'] == icuid].index):
            df.loc[idx, 'bloc'] = i + 1
    input_features = pd.DataFrame()
    input_features['num_bloc'] = df['bloc']
    input_features['delta_sofa'] = delta_sofa
    input_features['delta_lactate'] = delta_lactate
    input_features['intermediate_reward'] = shaped_rewards
    return input_features

In [None]:
test_input_features = build_input_features(origin_test_df)

In [None]:
train_input_features = build_input_features(origin_train_df)

In [None]:
train_input_features.to_csv('../../data/train_input_features.csv',index = False)
test_input_features.to_csv('../../data/test_input_features.csv',index = False)

In [None]:
a = [0,1,0,1,0,1,1,1,0]
np.mean(a), np.std(a)

In [None]:
a = np.array([0,1,0,1,0,1,1,1,0])-0.5
np.mean(a), np.std(a)