In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import pickle

In [2]:
# Load the validation cohort; later cells read its 'id' column and pass the frame
# through data_preparation / data_preprocessing.
VALIDATION_CSV = 'validation.csv'
val_data = pd.read_csv(VALIDATION_CSV)

In [3]:
# Mortality predictions exported from Wen Rui's model.
# NOTE(review): the file name ends in '.csv.csv' -- confirm it matches the file on disk.
MORTALITY_CSV = '180424_Validation_death.csv.csv'
mortality = pd.read_csv(MORTALITY_CSV)

In [4]:
# Use the predicted death status as the 'icu_death' feature.
#
# The ids are attached positionally: row i of `mortality` receives the i-th unique id of
# val_data. This presumes the prediction file has exactly one row per patient, in the same
# order as val_data['id'].unique() -- TODO confirm against how the predictions were exported.
mortality['id'] = val_data['id'].unique()

# Join the predictions onto every validation row, discard the pre-existing 'icu_death'
# column and the 'Unnamed: 0' column, then expose the prediction column ('0') under the
# name 'icu_death'.
val_data = (
    val_data
    .merge(mortality, on='id', how='right')
    .drop(columns=['icu_death', 'Unnamed: 0'])
    .rename(columns={'0': 'icu_death'})
)

In [5]:
# Feature columns handed to the trained model, in this exact order.
# NOTE(review): presumably this must match the column order used at training time -- verify.
selected_lst = [
    'height',
    'wbc',
    'weight_admit',
    'admission_age',
    'glucose',
    'ptt',
    'bun',
    'platelet',
    'ck_cpk',
    'ld_ldh',
    'alp',
    'ast',
    'fibrinogen',
    'pt',
    'creatinine',
    'amylase',
    'rbc',
    'rdw',
    'alt',
    'mchc',
    'icu_death',
]

In [6]:
def data_preparation(df):
    """Collapse a per-measurement frame to one row per patient and encode the target.

    Steps, in order:
      1. Cast the categorical indicator columns listed in ``cats`` to int.
      2. Sort by ``id`` and ``charttime`` descending and take ``groupby('id').first()``.
      3. Fill the remaining NaNs in float columns with that column's mean.
      4. Encode ``los_icu_class`` as 0 for 'less than 3 days' and 1 for anything else.

    The caller's frame is never modified, and feeding the function its own output
    leaves an already-encoded (numeric) target unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id``, ``charttime``, ``los_icu_class`` and every column in ``cats``.

    Returns
    -------
    pandas.DataFrame
        One row per ``id`` with a fresh default index.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    cats = ['icu_death', 'gender', 'admission_type', 'atrial_fibrillation', 'malignant_cancer',
       'chf', 'ckd', 'cld', 'copd', 'diabetes', 'hypertension', 'ihd', 'stroke']
    # astype() with a mapping returns a new frame, so the caller's df is left untouched
    # (the previous column-by-column assignment rewrote the argument in place).
    df = df.astype({col: 'int' for col in cats})

    # Most recent measurements first within each patient.
    # NOTE: GroupBy.first() returns the first *non-null* value of each column, so a NaN in
    # the latest row is replaced by the most recent earlier measurement of that patient
    # rather than being kept for the mean fill below.
    df = df.sort_values(by=['id', 'charttime'], ascending=False)
    df = df.groupby('id').first().reset_index()

    # Fill whatever is still missing with the column mean of the frame being prepared.
    num = df.select_dtypes(include=['float']).columns
    df[num] = df[num].fillna(df[num].mean())

    # Encode the target only while it still holds the raw labels. Re-encoding numeric 0/1
    # labels would turn every row into 1, because 0 != 'less than 3 days'.
    target = df['los_icu_class']
    if not pd.api.types.is_numeric_dtype(target):
        df['los_icu_class'] = (target != 'less than 3 days').astype('int64')
    return df

In [7]:
def data_preprocessing(df, scaler=None):
    """Z-score standardize the float columns and split off the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared frame containing ``los_icu_class``. It is not modified.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler (e.g. the ``StandardScaler`` fitted on the training data).
        It must have been fitted on the same float columns, in the same order.
        When omitted, a new ``StandardScaler`` is fitted on ``df`` itself, which means the
        frame is standardized with its own mean and standard deviation.

    Returns
    -------
    X : pandas.DataFrame
        Copy of ``df`` (all columns, target included) with the float columns standardized.
    y : pandas.Series
        The ``los_icu_class`` column of ``X``.
    """
    # numerical features
    num = df.select_dtypes(include=['float']).columns
    df_num = df[num]

    # z-score standardization
    if scaler is None:
        scaler = preprocessing.StandardScaler()
        num_scaled = scaler.fit_transform(df_num)
    else:
        # Reuse the statistics of a pre-fitted scaler instead of re-fitting on this frame.
        num_scaled = scaler.transform(df_num)

    # Write the scaled values into a copy so the caller's frame keeps its original values.
    num_scaled = pd.DataFrame(num_scaled, columns=num, index=df_num.index)
    X = df.copy()
    X[num] = num_scaled[num]

    y = X['los_icu_class']

    return X, y

In [8]:
# Restore the fitted model that the prediction cell calls predict_proba on.
# SECURITY: pickle.load executes arbitrary code embedded in the file -- only load
# 'sd_rf_death.pkl' when it comes from a trusted source.
with open('sd_rf_death.pkl', 'rb') as model_file:
    trained_rf = pickle.load(model_file)

In [9]:
# Collapse val_data to one prepared row per patient. The result is bound back to
# val_data because the prediction cell reads val_data['id'] next to the model output.
# NOTE: since val_data is rebound here, re-running this cell feeds already-prepared data
# back into data_preparation; restart the kernel and run all cells for a clean result.
val_data = data_preparation(val_data)
# X is the full frame with its float columns standardized; y is its 'los_icu_class' column.
X, y = data_preprocessing(val_data)

In [10]:
# Class probabilities for every prepared patient row, using only the selected features.
class_probabilities = trained_rf.predict_proba(X[selected_lst])

# One column per class (integer column labels), plus the patient id. The id is attached by
# index alignment, which relies on val_data carrying the same 0..n-1 index as the new frame.
rf_probs = pd.DataFrame(class_probabilities).assign(id=val_data['id'])
rf_probs.to_csv('los_probability_sd_rf_validation.csv', index=False)