In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import pickle

In [2]:
# Load the validation table from 'validation.csv' (path is relative to the
# notebook's working directory) into a DataFrame.
val_data = pd.read_csv(filepath_or_buffer='validation.csv')

In [3]:
# Features selected by RF. The list is used further down to pick the model's
# input columns, so the order is kept exactly as given
# (presumably it must match the column order used at training time -- confirm).
rf_selected = [
    'creatinine',
    'inr',
    'aniongap',
    'ckd',
    'race_white',
    'bun',
    'charlson_score',
    'diabetes',
    'hypertension',
    'race_unknown',
    'height',
    'pt',
    'bicarbonate',
    'ihd',
    'first_careunit_msicu',
    'ptt',
    'ast',
    'spo2',
    'rdw',
    'admission_age',
]

In [4]:
def data_preparation(df):
    """Collapse a per-measurement table to one row per ``id`` and impute gaps.

    Steps, in order:

    1. Cast the label and the categorical/comorbidity flag columns to ``int``.
    2. Sort by ``id`` and ``charttime`` in descending order and keep, for each
       ``id``, the result of ``GroupBy.first()``.
    3. Replace any NaN left in the float columns with that column's mean,
       computed over the collapsed (one row per ``id``) table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id``, ``charttime`` and every column listed in ``cats``
        below. The frame passed in is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per ``id`` (ascending ``id``, fresh
        RangeIndex) and ``id`` as a regular column.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If a column in ``cats`` cannot be cast to ``int`` (pandas refuses to
        cast NaN/inf to an integer dtype).
    """
    cats = ['icu_death', 'gender', 'admission_type', 'atrial_fibrillation', 'malignant_cancer',
            'chf', 'ckd', 'cld', 'copd', 'diabetes', 'hypertension', 'ihd', 'stroke']

    # astype with a column->dtype mapping returns a new frame, so the caller's
    # DataFrame is not modified (the previous per-column assignment wrote the
    # casts back into the argument).
    typed = df.astype(dict.fromkeys(cats, 'int'))

    # Most recent record per patient.
    # NOTE: GroupBy.first() returns the first *non-null* value of each column,
    # not a literal row. After the descending sort this means a value missing
    # in the latest row is taken from the most recent earlier row that has it.
    latest = (
        typed.sort_values(by=['id', 'charttime'], ascending=False)
             .groupby('id')
             .first()
             .reset_index()
    )

    # Mean-impute whatever is still missing in the float columns.
    num = latest.select_dtypes(include=['float']).columns
    latest[num] = latest[num].fillna(latest[num].mean())
    return latest

In [5]:
def data_preprocessing(df, scaler=None):
    """Z-score standardise the float columns of ``df`` and extract the label.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing an ``icu_death`` column. It is not modified; the
        scaling is applied to a copy.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler (for example the ``StandardScaler`` fitted on
        the training data). It is applied to the float columns of ``df`` in
        their existing order, so it must have been fitted on the same columns.
        When omitted, a new ``StandardScaler`` is fitted on ``df`` itself,
        which is the original behaviour of this function.

    Returns
    -------
    X : pandas.DataFrame
        Copy of ``df`` whose float columns hold the scaled values. All other
        columns, including ``icu_death``, are still present; callers are
        expected to select the feature columns they need.
    y : pandas.Series
        The ``icu_death`` column of ``X``.

    Raises
    ------
    KeyError
        If ``df`` has no ``icu_death`` column.
    """
    # Work on a copy so the caller's frame keeps its unscaled values.
    scaled = df.copy()

    # numerical features
    num = scaled.select_dtypes(include=['float']).columns

    # Nothing to scale: avoid handing an empty matrix to the scaler.
    if len(num) > 0:
        df_num = scaled[num]

        # z-score standardization
        if scaler is None:
            num_scaled = preprocessing.StandardScaler().fit_transform(df_num)
        else:
            num_scaled = scaler.transform(df_num)

        # Put the scaled values back under their original column names/index.
        scaled[num] = pd.DataFrame(num_scaled, columns=num, index=df_num.index)

    X = scaled
    y = scaled['icu_death']

    return X, y

In [6]:
# Load the previously trained model object from 'sd_rf_death.pkl'
# (path is relative to the notebook's working directory).
# SECURITY: unpickling can execute arbitrary code embedded in the file, so only
# load a pickle that comes from a trusted source.
# NOTE(review): a pickled estimator generally needs a library version compatible
# with the one that wrote it -- confirm the environment matches.
with open('sd_rf_death.pkl', 'rb') as f:
    trained_rf = pickle.load(f)

In [7]:
# Run the two pipeline stages on the validation table:
#   1. data_preparation  -> one imputed row per patient id
#   2. data_preprocessing -> standardised float columns (X) and the icu_death label (y)
# val_data is rebound to the prepared table here, so re-running this cell feeds
# the already-prepared table back into data_preparation.
# X still holds every column (including icu_death); the feature subset is
# selected later when the model is called.
val_data = data_preparation(val_data)
X, y = data_preprocessing(val_data)

In [9]:
# Predicted class probabilities for every validation row, using only the
# RF-selected feature columns. pandas labels the probability columns 0, 1, ...
# in the order predict_proba returns them.
rf_probs = pd.DataFrame(trained_rf.predict_proba(X[rf_selected]))

# Attach the patient ids by position rather than by index label: rf_probs has a
# fresh RangeIndex, so a label-aligned assignment would silently yield NaN or
# mismatched ids whenever val_data's index is not the default 0..n-1.
rf_probs['id'] = val_data['id'].to_numpy()

# Write the probabilities plus id, without the index column.
rf_probs.to_csv('death_probability_sd_rf_validation.csv', index=False)