In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import sys, gc, warnings, random, math, time, datetime, os
from tqdm import tqdm_notebook
warnings.filterwarnings('ignore')

from sklearn import metrics, preprocessing
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel, RFECV

import eli5
from eli5.sklearn import PermutationImportance

import xgboost as xgb
import lightgbm as lgb
from bayes_opt import BayesianOptimization

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

Using TensorFlow backend.


In [2]:
# read df_train and df_test
df_train = pd.read_pickle("../features/df_train.pkl")
df_test = pd.read_pickle("../features/df_test.pkl")
TARGET = "hospital_death"

# read features cols and cat cols
all_features = np.load("../features/all_features.npy", allow_pickle=True).tolist()
categorical_features = np.load("../features/categorical_features.npy", allow_pickle=True).tolist()

In [3]:
all_features = set(all_features) - set(["apache_4a_hospital_death_prob", "apache_4a_icu_death_prob"])

In [4]:
print("train/test shape is:", df_train.shape, df_test.shape)
print("features used # is", len(all_features))
df_train[all_features].head()

train/test shape is: (91713, 186) (39308, 186)
features used # is 147


Unnamed: 0,d1_diasbp_min,weight,d1_arterial_ph_min,d1_heartrate_min,h1_arterial_ph_min,h1_arterial_po2_min,d1_hematocrit_max,bilirubin_apache,d1_sysbp_max,resprate_apache,h1_diasbp_invasive_min,h1_arterial_ph_max,h1_resprate_max,d1_temp_max,h1_wbc_max,apache_2_bodysystem,d1_arterial_po2_max,ph_apache,d1_heartrate_max,icu_admit_source,h1_mbp_min,h1_calcium_max,h1_potassium_max,hematocrit_apache,d1_bun_min,albumin_apache,ventilated_apache,hospital_admit_source,d1_creatinine_min,h1_hco3_max,temp_apache,d1_mbp_invasive_min,h1_heartrate_min,glucose_apache,d1_hco3_max,h1_creatinine_max,d1_sysbp_noninvasive_min,h1_lactate_max,d1_glucose_min,h1_sysbp_invasive_max,d1_wbc_max,gender,d1_creatinine_max,h1_pao2fio2ratio_max,d1_arterial_po2_min,d1_calcium_max,d1_mbp_min,h1_sysbp_min,map_apache,h1_sysbp_noninvasive_min,d1_albumin_max,d1_resprate_min,h1_sysbp_noninvasive_max,d1_bun_max,d1_sysbp_invasive_max,pao2_apache,d1_diasbp_invasive_max,gcs_eyes_apache,d1_sodium_min,d1_albumin_min,h1_mbp_max,d1_arterial_pco2_max,solid_tumor_with_metastasis,h1_diasbp_max,d1_temp_min,d1_hematocrit_min,h1_resprate_min,h1_spo2_max,d1_diasbp_noninvasive_max,urineoutput_apache,apache_2_diagnosis,d1_resprate_max,h1_arterial_pco2_min,h1_arterial_po2_max,d1_platelets_max,gcs_verbal_apache,heart_rate_apache,d1_mbp_noninvasive_max,d1_pao2fio2ratio_min,d1_platelets_min,d1_spo2_max,paco2_apache,h1_hemaglobin_max,d1_pao2fio2ratio_max,d1_bilirubin_min,d1_sodium_max,h1_heartrate_max,h1_temp_min,pre_icu_los_days,d1_arterial_pco2_min,d1_sysbp_noninvasive_max,d1_mbp_noninvasive_min,age,d1_calcium_min,creatinine_apache,d1_hemaglobin_max,ethnicity,d1_inr_min,bmi,h1_glucose_min,apache_3j_diagnosis,d1_potassium_min,h1_platelets_max,icu_type,d1_glucose_max,d1_sysbp_invasive_min,h1_sysbp_invasive_min,d1_mbp_max,height,d1_mbp_invasive_max,h1_hematocrit_min,d1_diasbp_max,h1_spo2_min,h1_mbp_noninvasive_min,d1_inr_max,bun_apache,h1_arterial_pco2_max,h1_diasbp_noninvasive_min,h1_temp_max,h1_hematocrit_max,h1_bun_max,h1_mbp_invasive_min,d1_hco3_min,d1_lactate_min,d1_lactate_max,d1_wbc_min,h1_diasbp_invasive_max,wbc_apache,h1_sysbp_max,diabetes_mellitus,d1_sysbp_min,d1_potassium_max,sodium_apache,d1_bilirubin_max,d1_hemaglobin_min,h1_mbp_noninvasive_max,h1_glucose_max,d1_arterial_ph_max,d1_diasbp_invasive_min,apache_3j_bodysystem,h1_diasbp_noninvasive_max,h1_hemaglobin_min,fio2_apache,d1_diasbp_noninvasive_min,h1_diasbp_min,d1_spo2_min,gcs_motor_apache
0,37.0,73.9,,72.0,,,27.4,0.4,131.0,36.0,,,26.0,39.9,,0,,,119.0,1,85.0,,,27.4,30.0,2.3,0.0,4,2.23,,39.3,40.0,108.0,168.0,19.0,,73.0,,109.0,,14.1,1,2.51,,,8.5,46.0,115.0,40.0,115.0,2.3,10.0,131.0,31.0,122.0,,46.0,3.0,134.0,2.3,86.0,,0.0,68.0,37.2,27.4,18.0,100.0,68.0,,113.0,34.0,,,233.0,4.0,118.0,89.0,,233.0,100.0,,,,0.4,136.0,119.0,37.5,0.541667,,131.0,46.0,68.0,7.4,2.51,8.9,2,,22.73,,502.01,3.4,,2,168.0,64.0,,89.0,180.3,66.0,,68.0,74.0,85.0,,31.0,,63.0,39.5,,,,15.0,1.0,1.3,14.1,,14.1,131.0,1.0,73.0,4.0,134.0,0.4,8.9,86.0,,,32.0,9,68.0,,,37.0,63.0,74.0,6.0
1,31.0,70.2,7.45,72.0,7.45,51.0,36.9,,159.0,33.0,,7.45,31.0,36.3,12.7,6,51.0,7.45,118.0,1,57.0,8.6,4.2,36.9,9.0,,1.0,4,0.56,27.0,35.1,,100.0,145.0,27.0,0.56,67.0,3.5,128.0,,23.3,0,0.71,51.0,51.0,8.6,38.0,71.0,46.0,71.0,1.6,12.0,95.0,11.0,,51.0,,1.0,145.0,1.6,85.0,37.0,0.0,61.0,35.1,36.1,28.0,95.0,95.0,,108.0,32.0,37.0,51.0,557.0,1.0,120.0,120.0,51.0,487.0,100.0,37.0,11.3,54.8,0.5,145.0,114.0,36.3,0.927778,37.0,159.0,38.0,77.0,8.0,0.56,11.3,2,1.3,27.42,143.0,203.01,3.8,557.0,5,145.0,,,120.0,160.0,,36.9,95.0,70.0,57.0,1.3,9.0,37.0,48.0,36.3,36.9,9.0,,26.0,3.5,3.5,12.7,,12.7,95.0,1.0,67.0,4.2,145.0,0.5,11.1,85.0,145.0,7.45,,8,61.0,11.3,1.0,31.0,48.0,70.0,3.0
2,48.0,95.3,,68.0,,,,,148.0,37.0,,,20.0,37.0,,3,,,96.0,0,83.0,,,,,,0.0,3,,,36.7,,78.0,,,,105.0,,,,,0,,,,,68.0,124.0,68.0,124.0,,8.0,148.0,,,,,3.0,,,91.0,,0.0,88.0,36.7,,16.0,98.0,88.0,,122.0,21.0,,,,5.0,102.0,102.0,,,98.0,,,,,,96.0,36.7,0.000694,,148.0,68.0,25.0,,,,2,,31.95,,703.03,,,5,,,,102.0,172.7,,,88.0,91.0,83.0,,,,58.0,36.7,,,,,,,,,,148.0,0.0,105.0,,,,,91.0,,,,5,88.0,,,48.0,58.0,91.0,6.0
3,42.0,61.7,7.34,92.0,7.34,265.0,34.0,,158.0,4.0,44.0,7.37,12.0,38.0,8.8,0,337.0,7.39,116.0,2,71.0,,,25.9,,,1.0,8,,,34.8,52.0,96.0,185.0,,,84.0,,88.0,136.0,9.0,0,,337.0,102.0,,84.0,106.0,60.0,,,7.0,,,164.0,142.0,62.0,4.0,,,92.0,37.0,0.0,62.0,34.8,25.9,11.0,100.0,48.0,,203.0,23.0,33.0,337.0,198.0,5.0,114.0,84.0,236.666667,43.0,100.0,30.0,11.6,342.5,,,100.0,34.8,0.000694,27.0,158.0,84.0,81.0,,,11.6,2,1.1,22.64,,1206.03,3.5,43.0,2,185.0,78.0,106.0,84.0,165.1,92.0,34.0,48.0,99.0,,1.6,,36.0,,35.6,34.0,,71.0,,,,8.0,62.0,8.0,136.0,0.0,84.0,5.0,,,8.9,,,7.44,30.0,0,,11.6,0.6,42.0,44.0,95.0,6.0
4,57.0,,,60.0,,,,,147.0,16.0,,,,37.2,,7,,,89.0,0,92.0,,,,,,0.0,15,,,36.7,,76.0,,,,120.0,,,,,1,,,,,90.0,120.0,103.0,120.0,,16.0,130.0,,,,,,,,104.0,,0.0,99.0,36.7,,,100.0,99.0,,119.0,18.0,,,,,60.0,104.0,,,100.0,,,,,,89.0,,0.073611,,147.0,90.0,19.0,,,,2,,,,601.01,,,5,,,,104.0,188.0,,,99.0,100.0,92.0,,,,68.0,,,,,,,,,,,130.0,0.0,120.0,,,,,104.0,,,,10,99.0,,,57.0,68.0,96.0,


In [6]:
for RANDOM_SEED in range(2012, 2013):
    param = {
        "objective": "binary",
        'metric': 'auc',
        'num_leaves': 128,
        'learning_rate': 0.01,
        'n_estimators': 5000,
        'subsample': 0.5,
        "feature_fraction": 0.15,
        "lambda_l1": 2.2,
        "lambda_l2": 1.6,
        "max_depth": 8,
        "min_child_weight": 8,
        "min_split_gain": 0.1,
        'seed': RANDOM_SEED,
        'n_jobs': -1
    }

    cv= 8
    kf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_SEED)

    oof_pred = np.zeros(df_train.shape[0])
    y_pred = np.zeros(df_test.shape[0])
    df_test[TARGET] = 0

    for fold, (train_index, valid_index) in enumerate(kf.split(df_train, y = df_train[TARGET])):

        print("Traning on folder", fold, "...")
        X_train, X_valid = df_train.iloc[train_index][all_features], df_train.iloc[valid_index][all_features]
        y_train, y_valid = df_train.iloc[train_index][TARGET], df_train.iloc[valid_index][TARGET]

        model_lgb = lgb.LGBMRegressor(**param)

        model_lgb.fit(X_train, y_train,
                        eval_set=[(X_train, y_train), (X_valid, y_valid)],
                        categorical_feature=categorical_features,
                        early_stopping_rounds=500,
                        verbose=200)
        y_pred_valid = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration_)
        oof_pred[valid_index] = y_pred_valid

        df_test[TARGET] += model_lgb.predict(df_test[all_features], num_iteration=model_lgb.best_iteration_) / cv
        print("")

    print("-------------------------------------")
    print("Overall AUC score is", roc_auc_score(df_train[TARGET], oof_pred))

    # 0.9082
    WRITE_TOKEN = 1
    # submission should be (39308, 2)
    print("submission shape is", df_test[["encounter_id", TARGET]].shape)
    if WRITE_TOKEN:
        df_test[["encounter_id", TARGET]].to_csv("../submissions/sub-lgb_" + str(cv) + "_" + str(RANDOM_SEED) +".csv", index=None)
        print("submission is saved successfully!")

Traning on folder 0 ...
Training until validation scores don't improve for 500 rounds
[200]	training's auc: 0.925692	valid_1's auc: 0.876312
[400]	training's auc: 0.940819	valid_1's auc: 0.885471
[600]	training's auc: 0.951523	valid_1's auc: 0.889741
[800]	training's auc: 0.959437	valid_1's auc: 0.891864
[1000]	training's auc: 0.965554	valid_1's auc: 0.89333
[1200]	training's auc: 0.970414	valid_1's auc: 0.894335
[1400]	training's auc: 0.974353	valid_1's auc: 0.894866
[1600]	training's auc: 0.977708	valid_1's auc: 0.895175
[1800]	training's auc: 0.980546	valid_1's auc: 0.895573
[2000]	training's auc: 0.983192	valid_1's auc: 0.895846
[2200]	training's auc: 0.985361	valid_1's auc: 0.89599
[2400]	training's auc: 0.987465	valid_1's auc: 0.896091
[2600]	training's auc: 0.989152	valid_1's auc: 0.896109
[2800]	training's auc: 0.990734	valid_1's auc: 0.896058
Early stopping, best iteration is:
[2416]	training's auc: 0.98761	valid_1's auc: 0.896153

Traning on folder 1 ...
Training until valida