In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import sys, gc, warnings, random, math, time, datetime, os
from tqdm import tqdm_notebook
warnings.filterwarnings('ignore')

from sklearn import metrics, preprocessing
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel, RFECV

import eli5
from eli5.sklearn import PermutationImportance

import xgboost as xgb
import lightgbm as lgb
from bayes_opt import BayesianOptimization

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

Using TensorFlow backend.


In [2]:
# read df_train and df_test
df_train = pd.read_pickle("../features/df_train.pkl")
df_test = pd.read_pickle("../features/df_test.pkl")
TARGET = "hospital_death"

# read features cols and cat cols
all_features = np.load("../features/all_features.npy", allow_pickle=True).tolist()
categorical_features = np.load("../features/categorical_features.npy", allow_pickle=True).tolist()

In [3]:
print("train/test shape is:", df_train.shape, df_test.shape)
print("features used # is", len(all_features))
df_train[all_features].head()

train/test shape is: (91713, 186) (39308, 186)
features used # is 149


Unnamed: 0,age,bmi,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_type,pre_icu_los_days,weight,albumin_apache,apache_2_diagnosis,apache_3j_diagnosis,bilirubin_apache,bun_apache,creatinine_apache,fio2_apache,gcs_eyes_apache,gcs_motor_apache,gcs_verbal_apache,glucose_apache,heart_rate_apache,hematocrit_apache,map_apache,paco2_apache,pao2_apache,ph_apache,resprate_apache,sodium_apache,temp_apache,urineoutput_apache,ventilated_apache,wbc_apache,d1_diasbp_invasive_max,d1_diasbp_invasive_min,d1_diasbp_max,d1_diasbp_min,d1_diasbp_noninvasive_max,d1_diasbp_noninvasive_min,d1_heartrate_max,d1_heartrate_min,d1_mbp_invasive_max,d1_mbp_invasive_min,d1_mbp_max,d1_mbp_min,d1_mbp_noninvasive_max,d1_mbp_noninvasive_min,d1_resprate_max,d1_resprate_min,d1_spo2_max,d1_spo2_min,d1_sysbp_invasive_max,d1_sysbp_invasive_min,d1_sysbp_max,d1_sysbp_min,d1_sysbp_noninvasive_max,d1_sysbp_noninvasive_min,d1_temp_max,d1_temp_min,h1_diasbp_invasive_max,h1_diasbp_invasive_min,h1_diasbp_max,h1_diasbp_min,h1_diasbp_noninvasive_max,h1_diasbp_noninvasive_min,h1_heartrate_max,h1_heartrate_min,h1_mbp_invasive_min,h1_mbp_max,h1_mbp_min,h1_mbp_noninvasive_max,h1_mbp_noninvasive_min,h1_resprate_max,h1_resprate_min,h1_spo2_max,h1_spo2_min,h1_sysbp_invasive_max,h1_sysbp_invasive_min,h1_sysbp_max,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,h1_temp_max,h1_temp_min,d1_albumin_max,d1_albumin_min,d1_bilirubin_max,d1_bilirubin_min,d1_bun_max,d1_bun_min,d1_calcium_max,d1_calcium_min,d1_creatinine_max,d1_creatinine_min,d1_glucose_max,d1_glucose_min,d1_hco3_max,d1_hco3_min,d1_hemaglobin_max,d1_hemaglobin_min,d1_hematocrit_max,d1_hematocrit_min,d1_inr_max,d1_inr_min,d1_lactate_max,d1_lactate_min,d1_platelets_max,d1_platelets_min,d1_potassium_max,d1_potassium_min,d1_sodium_max,d1_sodium_min,d1_wbc_max,d1_wbc_min,h1_bun_max,h1_calcium_max,h1_creatinine_max,h1_glucose_max,h1_glucose_min,h1_hco3_max,h1_hemaglobin_max,h1_hemaglobin_min,h1_hematocrit_max,h1_hematocrit_min,h1_lactate_max,h1_platelets_max,h1_potassium_max,h1_wbc_max,d1_arterial_pco2_max,d1_arterial_pco2_min,d1_arterial_ph_max,d1_arterial_ph_min,d1_arterial_po2_max,d1_arterial_po2_min,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,diabetes_mellitus,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,68.0,22.73,2,1,180.3,4,1,2,0.541667,73.9,2.3,113.0,502.01,0.4,31.0,2.51,,3.0,6.0,4.0,168.0,118.0,27.4,40.0,,,,36.0,134.0,39.3,,0.0,14.1,46.0,32.0,68.0,37.0,68.0,37.0,119.0,72.0,66.0,40.0,89.0,46.0,89.0,46.0,34.0,10.0,100.0,74.0,122.0,64.0,131.0,73.0,131.0,73.0,39.9,37.2,,,68.0,63.0,68.0,63.0,119.0,108.0,,86.0,85.0,86.0,85.0,26.0,18.0,100.0,74.0,,,131.0,115.0,131.0,115.0,39.5,37.5,2.3,2.3,0.4,0.4,31.0,30.0,8.5,7.4,2.51,2.23,168.0,109.0,19.0,15.0,8.9,8.9,27.4,27.4,,,1.3,1.0,233.0,233.0,4.0,3.4,136.0,134.0,14.1,14.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.1,0.05,1.0,0.0,9,0
1,77.0,27.42,2,0,160.0,4,1,5,0.927778,70.2,,108.0,203.01,,9.0,0.56,1.0,1.0,3.0,1.0,145.0,120.0,36.9,46.0,37.0,51.0,7.45,33.0,145.0,35.1,,1.0,12.7,,,95.0,31.0,95.0,31.0,118.0,72.0,,,120.0,38.0,120.0,38.0,32.0,12.0,100.0,70.0,,,159.0,67.0,159.0,67.0,36.3,35.1,,,61.0,48.0,61.0,48.0,114.0,100.0,,85.0,57.0,85.0,57.0,31.0,28.0,95.0,70.0,,,95.0,71.0,95.0,71.0,36.3,36.3,1.6,1.6,0.5,0.5,11.0,9.0,8.6,8.0,0.71,0.56,145.0,128.0,27.0,26.0,11.3,11.1,36.9,36.1,1.3,1.3,3.5,3.5,557.0,487.0,4.2,3.8,145.0,145.0,23.3,12.7,9.0,8.6,0.56,145.0,143.0,27.0,11.3,11.3,36.9,36.9,3.5,557.0,4.2,12.7,37.0,37.0,7.45,7.45,51.0,51.0,54.8,51.0,37.0,37.0,7.45,7.45,51.0,51.0,51.0,0.47,0.29,1.0,0.0,8,6
2,25.0,31.95,2,0,172.7,3,0,5,0.000694,95.3,,122.0,703.03,,,,,3.0,6.0,5.0,,102.0,,68.0,,,,37.0,,36.7,,0.0,,,,88.0,48.0,88.0,48.0,96.0,68.0,,,102.0,68.0,102.0,68.0,21.0,8.0,98.0,91.0,,,148.0,105.0,148.0,105.0,37.0,36.7,,,88.0,58.0,88.0,58.0,96.0,78.0,,91.0,83.0,91.0,83.0,20.0,16.0,98.0,91.0,,,148.0,124.0,148.0,124.0,36.7,36.7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,5,3
3,81.0,22.64,2,0,165.1,8,2,2,0.000694,61.7,,203.0,1206.03,,,,0.6,4.0,6.0,5.0,185.0,114.0,25.9,60.0,30.0,142.0,7.39,4.0,,34.8,,1.0,8.0,62.0,30.0,48.0,42.0,48.0,42.0,116.0,92.0,92.0,52.0,84.0,84.0,84.0,84.0,23.0,7.0,100.0,95.0,164.0,78.0,158.0,84.0,158.0,84.0,38.0,34.8,62.0,44.0,62.0,44.0,,,100.0,96.0,71.0,92.0,71.0,,,12.0,11.0,100.0,99.0,136.0,106.0,136.0,106.0,,,35.6,34.8,,,,,,,,,,,185.0,88.0,,,11.6,8.9,34.0,25.9,1.6,1.1,,,198.0,43.0,5.0,3.5,,,9.0,8.0,,,,,,,11.6,11.6,34.0,34.0,,43.0,,8.8,37.0,27.0,7.44,7.34,337.0,102.0,342.5,236.666667,36.0,33.0,7.37,7.34,337.0,265.0,337.0,0.04,0.03,0.0,0.0,0,0
4,19.0,,2,1,188.0,15,0,5,0.073611,,,119.0,601.01,,,,,,,,,60.0,,103.0,,,,16.0,,36.7,,0.0,,,,99.0,57.0,99.0,57.0,89.0,60.0,,,104.0,90.0,104.0,90.0,18.0,16.0,100.0,96.0,,,147.0,120.0,147.0,120.0,37.2,36.7,,,99.0,68.0,99.0,68.0,89.0,76.0,,104.0,92.0,104.0,92.0,,,100.0,100.0,,,130.0,120.0,130.0,120.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,10,7


In [4]:
RANDOM_SEED = 2020
param = {
    "objective": "binary:logistic",
    'metric': 'auc',
    'num_leaves': 128,
    'learning_rate': 0.01,
    'n_estimators': 5000,
    'subsample': 0.7,
    "feature_fraction": 0.15,
    "lambda_l1": 4.1,
    "lambda_l2": 2.8,
    "max_depth": 8,
    "min_child_weight": 64,
    "min_split_gain": 0.02,
    'seed': RANDOM_SEED,
    'n_jobs': -1,
    'tree_method':'gpu_hist',
}

In [5]:
cv= 5
kf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_SEED)

oof_pred = np.zeros(df_train.shape[0])
y_pred = np.zeros(df_test.shape[0])
df_test[TARGET] = 0

for fold, (train_index, valid_index) in enumerate(kf.split(df_train, y = df_train[TARGET])):
    
    print("Traning on folder", fold, "...")
    X_train, X_valid = df_train.iloc[train_index][all_features], df_train.iloc[valid_index][all_features]
    y_train, y_valid = df_train.iloc[train_index][TARGET], df_train.iloc[valid_index][TARGET]
    
    model_lgb = xgb.XGBRegressor(**param)
    
    model_lgb.fit(X_train, y_train,
                    eval_set=[(X_train, y_train), (X_valid, y_valid)],
                    eval_metric="auc",
                    early_stopping_rounds=500,
                    verbose=1000)
    y_pred_valid = model_lgb.predict(X_valid)
    oof_pred[valid_index] = y_pred_valid
    
    df_test[TARGET] += model_lgb.predict(df_test[all_features]) / cv
    print("")
    
print("-------------------------------------")
print("Overall AUC score is", roc_auc_score(df_train[TARGET], oof_pred))

Traning on folder 0 ...
[0]	validation_0-auc:0.868421	validation_1-auc:0.857044
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 500 rounds.
[1000]	validation_0-auc:0.931999	validation_1-auc:0.902005
[2000]	validation_0-auc:0.949999	validation_1-auc:0.904494
Stopping. Best iteration:
[2326]	validation_0-auc:0.954649	validation_1-auc:0.904766


Traning on folder 1 ...
[0]	validation_0-auc:0.862488	validation_1-auc:0.855173
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 500 rounds.
[1000]	validation_0-auc:0.931013	validation_1-auc:0.906367
[2000]	validation_0-auc:0.949525	validation_1-auc:0.908858
[3000]	validation_0-auc:0.96269	validation_1-auc:0.909133
Stopping. Best iteration:
[2819]	validation_0-auc:0.960643	validation_1-auc:0.909173


Traning on folder 2 ...
[0]	validation_0-auc:0.866522	va

In [7]:
# 0.9067
WRITE_TOKEN = True
# submission should be (39308, 2)
print("submission shape is", df_test[["encounter_id", TARGET]].shape)
if WRITE_TOKEN:
    df_test[["encounter_id", TARGET]].to_csv("../submissions/sub-xgb-k-5-seed-1993-cv-9065.csv", index=None)
    print("submission is saved successfully!")

submission shape is (39308, 2)
submission is saved successfully!
