In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import DataFrame
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the raw training and test tables from CSV; these frames are kept as the
# untouched reference that later cells copy from.
train_df_origin = pd.read_csv(filepath_or_buffer='path/to/trainset')
test_df_origin = pd.read_csv(filepath_or_buffer='path/to/testset')

In [3]:
# Remove invalid samples: any row with a negative input_total_tev marks its
# icustayid as invalid, and every row sharing that icustayid is dropped.
train_to_be_removed = train_df_origin.loc[train_df_origin['input_total_tev'] < 0, 'icustayid']
train_df_origin = train_df_origin[~train_df_origin['icustayid'].isin(train_to_be_removed.unique())]

# Same rule applied independently to the test table.
test_to_be_removed = test_df_origin.loc[test_df_origin['input_total_tev'] < 0, 'icustayid']
test_df_origin = test_df_origin[~test_df_origin['icustayid'].isin(test_to_be_removed.unique())]

In [4]:
# Show the distinct icustayid values flagged for removal: (train ids, test ids).
tuple(set(flagged.values) for flagged in (train_to_be_removed, test_to_be_removed))

({76921.0, 77243.0, 90270.0, 91090.0, 98366.0},
 {37166.0, 67895.0, 68546.0, 85154.0})

In [5]:
# Column groups; each group receives a different transform in the cells below.

# binary_fields: shifted by -0.5.
binary_fields = [
    'gender',
    'mechvent',
    're_admission',
]

# norm_fields: standardised with the training-set mean and standard deviation.
norm_fields = [
    'age', 'Weight_kg', 'GCS', 'HR', 'SysBP', 'MeanBP', 'DiaBP', 'RR', 'Temp_C', 'FiO2_1',
    'Potassium', 'Sodium', 'Chloride', 'Glucose', 'Magnesium', 'Calcium',
    'Hb', 'WBC_count', 'Platelets_count', 'PTT', 'PT', 'Arterial_pH', 'paO2', 'paCO2',
    'Arterial_BE', 'HCO3', 'Arterial_lactate', 'SOFA', 'SIRS', 'Shock_Index',
    'PaO2_FiO2', 'cumulated_balance_tev', 'elixhauser', 'Albumin', u'CO2_mEqL', 'Ionised_Ca',
]

# log_fields: transformed with log(0.1 + x) and then standardised the same way.
log_fields = [
    'max_dose_vaso', 'SpO2', 'BUN', 'Creatinine', 'SGOT', 'SGPT', 'Total_bili', 'INR',
    'input_total_tev', 'input_4hourly_tev', 'output_total', 'output_4hourly', 'bloc',
]

In [14]:
# Work on deep copies so the filtered *_origin frames stay unmodified by the
# transforms applied in the following cells.
train_df, test_df = train_df_origin.copy(deep=True), test_df_origin.copy(deep=True)

In [15]:
# Discard the charttime column from both working copies (in place), then
# preview the first rows of the training frame.
train_df.drop('charttime', axis=1, inplace=True)
test_df.drop('charttime', axis=1, inplace=True)
train_df.head()

Unnamed: 0,bloc,icustayid,gender,age,elixhauser,re_admission,SOFA,SIRS,Weight_kg,GCS,...,input_total_tev,input_4hourly_tev,output_total,output_4hourly,cumulated_balance_tev,sedation,mechvent,rrt,died_in_hosp,mortality_90d
0,1.0,12.0,1.0,12049.217303,0.0,0.0,7.0,1.0,51.200001,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,12.0,1.0,12049.217303,0.0,0.0,3.0,2.0,51.200001,15.0,...,0.0,0.0,650.0,650.0,-650.0,0.0,0.0,0.0,0.0,0.0
2,3.0,12.0,1.0,12049.217303,0.0,0.0,2.0,2.0,51.200001,15.0,...,0.0,0.0,1200.0,550.0,-1200.0,0.0,0.0,0.0,0.0,0.0
3,4.0,12.0,1.0,12049.217303,0.0,0.0,5.0,2.0,51.200001,15.0,...,0.0,0.0,1200.0,0.0,-1200.0,0.0,0.0,0.0,0.0,0.0
4,1.0,14.0,0.0,30946.97,2.0,0.0,5.0,2.0,56.872728,3.571429,...,1300.0,1300.0,340.0,160.0,960.0,1.0,1.0,0.0,0.0,1.0


In [16]:
# Binary fields: subtract 0.5 from every column listed in binary_fields,
# in both the training and the test frame.
train_df[binary_fields] = train_df[binary_fields].sub(0.5)
test_df[binary_fields] = test_df[binary_fields].sub(0.5)

In [17]:
# Normal fields: z-score each column in norm_fields. The mean and standard
# deviation are taken from the training frame and applied to both frames.
for item in norm_fields:
    av, std = train_df[item].mean(), train_df[item].std()
    train_df[item] = train_df[item].sub(av).div(std)
    test_df[item] = test_df[item].sub(av).div(std)

In [18]:
# Log fields: for each column in log_fields, apply log(0.1 + x) to both frames,
# then z-score using the mean / standard deviation of the log-transformed
# training column.
for item in log_fields:
    train_df[item] = np.log(train_df[item] + 0.1)
    test_df[item] = np.log(test_df[item] + 0.1)

    av, std = train_df[item].mean(), train_df[item].std()
    train_df[item] = train_df[item].sub(av).div(std)
    test_df[item] = test_df[item].sub(av).div(std)

In [20]:
# Scale features with a min-max scaler fitted on the TRAINING frame only.
# The same fitted scaler is then applied to the test frame, so both splits share
# one per-column min/max mapping and no test-set statistics leak into the
# preprocessing. Training columns land in [0, 1]; test values may fall slightly
# outside that range when they exceed the range seen in training.
# NOTE(review): assumes test_df has the same columns, in the same order, as
# train_df -- confirm against the two input files.
feature_scaler = MinMaxScaler().fit(train_df)
scaled_train_df = pd.DataFrame(feature_scaler.transform(train_df), columns=train_df.keys())
scaled_test_df = pd.DataFrame(feature_scaler.transform(test_df), columns=test_df.keys())

In [22]:
# Put the unscaled outcome label and stay id back into the scaled frames.
# The scaled frames were rebuilt from a NumPy array and therefore carry a fresh
# 0..n-1 index, whereas the *_origin frames still carry the gapped index left
# by the invalid-sample filter. Assigning a Series would align on those index
# labels and attach the wrong (or NaN) label/id to every row after the first
# removed stay, so assign positionally via .values instead.
scaled_train_df['died_in_hosp'] = train_df_origin['died_in_hosp'].values
scaled_train_df['icustayid'] = train_df_origin['icustayid'].values
scaled_test_df['died_in_hosp'] = test_df_origin['died_in_hosp'].values
scaled_test_df['icustayid'] = test_df_origin['icustayid'].values

In [25]:
# Preview the first five rows of the scaled training frame.
scaled_train_df.head(n=5)

Unnamed: 0,bloc,icustayid,gender,age,elixhauser,re_admission,SOFA,SIRS,Weight_kg,GCS,...,input_total_tev,input_4hourly_tev,output_total,output_4hourly,cumulated_balance_tev,sedation,mechvent,rrt,died_in_hosp,mortality_90d
0,0.0,12.0,1.0,0.203388,0.0,0.0,0.304348,0.25,0.183842,1.0,...,0.0,0.0,0.0,0.0,0.177479,0.0,0.0,0.0,0.0,0.0
1,0.22256,12.0,1.0,0.203388,0.0,0.0,0.130435,0.5,0.183842,1.0,...,0.0,0.0,0.616278,0.782139,0.175772,0.0,0.0,0.0,0.0,0.0
2,0.356608,12.0,1.0,0.203388,0.0,0.0,0.086957,0.5,0.183842,1.0,...,0.0,0.0,0.659309,0.76726,0.174328,0.0,0.0,0.0,0.0,0.0
3,0.452837,12.0,1.0,0.203388,0.0,0.0,0.217391,0.5,0.183842,1.0,...,0.0,0.0,0.659309,0.0,0.174328,0.0,0.0,0.0,0.0,0.0
4,0.0,14.0,0.0,0.90901,0.142857,0.0,0.217391,0.5,0.204211,0.047619,...,0.632976,0.822795,0.570801,0.657302,0.179999,1.0,1.0,0.0,0.0,1.0
