# EDA for WiDS competition

Link to competition: https://www.kaggle.com/c/widsdatathon2020/

Description:
The challenge is to create a model that uses data from the first 24 hours of intensive care to predict patient survival. MIT's GOSSIS community initiative, with privacy certification from the Harvard Privacy Lab, has provided a dataset of more than 130,000 hospital Intensive Care Unit (ICU) visits from patients, spanning a one-year timeframe. This data is part of a growing global effort and consortium spanning Argentina, Australia, New Zealand, Sri Lanka, Brazil, and more than 200 hospitals in the United States.

In [1]:
import pandas as pd
import numpy as np

raw_X = pd.read_csv("widsdatathon2020/training_v2.csv")
cols = pd.read_csv("widsdatathon2020/WiDS_Datathon_2020_Dictionary.csv")
cols = cols.drop([12, 187], axis=0) # drop icu_admit_type and pred

In [2]:
np.unique(cols["Data Type"], return_counts=True)

(array(['binary', 'integer', 'numeric', 'string'], dtype=object),
 array([ 16,   7, 152,  11]))

In [3]:
cols[cols["Data Type"] == "integer"]

Unnamed: 0,Category,Variable Name,Unit of Measure,Data Type,Description,Example
0,identifier,encounter_id,,integer,Unique identifier associated with a patient un...,
1,identifier,hospital_id,,integer,Unique identifier associated with a hospital,
2,identifier,patient_id,,integer,Unique identifier associated with a patient,
13,demographic,icu_id,,integer,A unique identifier for the unit to which the ...,
28,APACHE covariate,gcs_eyes_apache,,integer,The eye opening component of the Glasgow Coma ...,4.0
29,APACHE covariate,gcs_motor_apache,,integer,The motor component of the Glasgow Coma Scale ...,6.0
31,APACHE covariate,gcs_verbal_apache,,integer,The verbal component of the Glasgow Coma Scale...,5.0


In [4]:
cols["Variable Name"][cols["Category"] == "identifier"]

0    encounter_id
1     hospital_id
2      patient_id
Name: Variable Name, dtype: object

In [5]:
# remove identifier columns and prediction columns
ids = raw_X[cols["Variable Name"][cols["Category"] == "identifier"]]
ids.head()

Unnamed: 0,encounter_id,hospital_id,patient_id
0,66154,118,25312
1,114252,81,59342
2,119783,118,50777
3,79267,118,46918
4,92056,33,34377


In [6]:
X = raw_X[cols["Variable Name"][cols["Category"] != "identifier"]]
X = X.drop("hospital_death", axis=1)
X.head()

Unnamed: 0,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_id,icu_stay_type,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,68.0,22.73,0,Caucasian,M,180.3,Floor,Floor,92,admit,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular
1,77.0,27.42,0,Caucasian,F,160.0,Floor,Floor,90,admit,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
2,25.0,31.95,0,Caucasian,F,172.7,Emergency Department,Accident & Emergency,93,admit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
3,81.0,22.64,1,Caucasian,F,165.1,Operating Room,Operating Room / Recovery,92,admit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
4,19.0,,0,Caucasian,M,188.0,,Accident & Emergency,91,admit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma


In [7]:
y = raw_X["hospital_death"]

In [8]:
# check imbalance of predictions
np.unique(y, return_counts=True)[1]/y.size

array([0.91369817, 0.08630183])

In [9]:
X_scaled = X[cols["Variable Name"][cols["Data Type"] == "numeric"]]
# fill in NA values
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X_scaled = imp.fit_transform(X_scaled)

# PCA on numerical values
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_scaled)
X_scaled

array([[0.71232877, 0.73814009, 0.13849782, ..., 0.32089595, 0.55276382,
        0.53299492],
       [0.83561644, 0.39047782, 0.14059582, ..., 0.02107605, 0.73869347,
        0.65482234],
       [0.12328767, 0.60798082, 0.13555836, ..., 0.32089595, 0.50251256,
        0.50761421],
       ...,
       [0.43835616, 0.56516527, 0.13580741, ..., 0.32089595, 0.52763819,
        0.5177665 ],
       [0.63437693, 0.3031341 , 0.13599985, ..., 0.32089595, 0.53768844,
        0.5177665 ],
       [0.90410959, 0.39047782, 0.1356527 , ..., 0.32089595, 0.59798995,
        0.54822335]])

In [10]:
from sklearn.decomposition import PCA
for i in range(2, 10):
    pca = PCA(n_components=i)
    pca.fit(X_scaled)
    print(pca.explained_variance_ratio_)

[0.18949449 0.07655298]
[0.18949449 0.07655298 0.06383089]
[0.18949449 0.07655298 0.06383089 0.05412132]
[0.18949449 0.07655298 0.06383089 0.05412132 0.04341893]
[0.18949449 0.07655298 0.06383089 0.05412132 0.04341897 0.03389489]
[0.18949449 0.07655298 0.06383089 0.05412132 0.04341897 0.03389469
 0.02809618]
[0.18949449 0.07655298 0.06383089 0.05412132 0.04341897 0.03389492
 0.02809557 0.0265101 ]
[0.18949449 0.07655298 0.06383089 0.05412132 0.04341897 0.0338949
 0.02809621 0.02650979 0.02507987]


In [11]:
# jk let's look at lasso
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(penalty="l1", random_state=514).fit(X_scaled, y)




In [12]:
pd.set_option('display.max_columns', 200)
pd.DataFrame(np.round(clf.coef_, 2), columns=cols["Variable Name"][cols["Data Type"] == "numeric"])

Variable Name,age,height,pre_icu_los_days,weight,albumin_apache,bilirubin_apache,bun_apache,creatinine_apache,fio2_apache,glucose_apache,heart_rate_apache,hematocrit_apache,map_apache,paco2_apache,paco2_for_ph_apache,pao2_apache,ph_apache,resprate_apache,sodium_apache,temp_apache,urineoutput_apache,wbc_apache,d1_diasbp_invasive_max,d1_diasbp_invasive_min,d1_diasbp_max,d1_diasbp_min,d1_diasbp_noninvasive_max,d1_diasbp_noninvasive_min,d1_heartrate_max,d1_heartrate_min,d1_mbp_invasive_max,d1_mbp_invasive_min,d1_mbp_max,d1_mbp_min,d1_mbp_noninvasive_max,d1_mbp_noninvasive_min,d1_resprate_max,d1_resprate_min,d1_spo2_max,d1_spo2_min,d1_sysbp_invasive_max,d1_sysbp_invasive_min,d1_sysbp_max,d1_sysbp_min,d1_sysbp_noninvasive_max,d1_sysbp_noninvasive_min,d1_temp_max,d1_temp_min,h1_diasbp_invasive_max,h1_diasbp_invasive_min,h1_diasbp_max,h1_diasbp_min,h1_diasbp_noninvasive_max,h1_diasbp_noninvasive_min,h1_heartrate_max,h1_heartrate_min,h1_mbp_invasive_max,h1_mbp_invasive_min,h1_mbp_max,h1_mbp_min,h1_mbp_noninvasive_max,h1_mbp_noninvasive_min,h1_resprate_max,h1_resprate_min,h1_spo2_max,h1_spo2_min,h1_sysbp_invasive_max,h1_sysbp_invasive_min,h1_sysbp_max,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,h1_temp_max,h1_temp_min,d1_albumin_max,d1_albumin_min,d1_bilirubin_max,d1_bilirubin_min,d1_bun_max,d1_bun_min,d1_calcium_max,d1_calcium_min,d1_creatinine_max,d1_creatinine_min,d1_glucose_max,d1_glucose_min,d1_hco3_max,d1_hco3_min,d1_hemaglobin_max,d1_hemaglobin_min,d1_hematocrit_max,d1_hematocrit_min,d1_inr_max,d1_inr_min,d1_lactate_max,d1_lactate_min,d1_platelets_max,d1_platelets_min,d1_potassium_max,d1_potassium_min,d1_sodium_max,d1_sodium_min,d1_wbc_max,d1_wbc_min,h1_albumin_max,h1_albumin_min,h1_bilirubin_max,h1_bilirubin_min,h1_bun_max,h1_bun_min,h1_calcium_max,h1_calcium_min,h1_creatinine_max,h1_creatinine_min,h1_glucose_max,h1_glucose_min,h1_hco3_max,h1_hco3_min,h1_hemaglobin_max,h1_hemaglobin_min,h1_hematocrit_max,h1_hematocrit_min,h1_inr_max,h1_inr_min,h1_lactate_max,h1_lactate_min,h1_platelets_max,h1_platelets_min,h1_potassium_max,h1_potassium_min,h1_sodium_max,h1_sodium_min,h1_wbc_max,h1_wbc_min,d1_arterial_pco2_max,d1_arterial_pco2_min,d1_arterial_ph_max,d1_arterial_ph_min,d1_arterial_po2_max,d1_arterial_po2_min,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,apache_4a_hospital_death_prob,apache_4a_icu_death_prob
0,1.27,0.15,3.36,-0.59,0.11,0.0,-1.14,0.33,0.46,-0.36,-0.36,0.67,0.24,-0.09,-0.22,-0.43,0.31,0.36,-0.55,0.0,-0.16,-0.25,-0.09,0.19,-0.34,-0.11,0.0,0.0,1.65,-0.44,-0.17,-0.8,-0.16,-0.81,0.0,0.2,-0.2,1.25,-2.52,-2.02,-0.13,-0.51,-0.07,-0.94,0.0,-0.04,0.44,-1.39,0.0,0.67,0.64,0.32,0.0,0.0,-0.54,0.43,0.46,0.0,-0.28,0.54,-0.13,-1.26,0.12,3.93,-0.17,0.95,-0.32,-0.14,-0.11,-0.12,0.61,0.0,-0.66,-0.27,0.0,-0.78,2.11,0.95,0.0,2.18,0.21,0.0,-0.55,0.04,-0.07,0.87,-0.52,0.03,-0.82,-1.29,0.21,1.24,0.08,0.41,0.85,2.95,0.72,-1.15,0.33,-0.3,0.97,-0.79,-0.15,1.21,-0.13,-0.32,0.0,0.0,-0.08,0.0,-0.58,0.31,-0.0,-0.05,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.03,0.15,0.1,0.0,-0.63,0.0,0.01,-0.11,0.0,0.05,0.0,-0.43,0.0,1.17,-0.52,-0.74,0.76,0.13,0.66,-0.65,-0.5,0.0,0.08,-0.28,0.0,-1.53,1.42,0.36,0.0,5.54,-0.11


In [13]:
from sklearn import metrics
metrics.roc_auc_score(y, clf.predict(X_scaled))


0.6282668522567061

In [15]:
cols["Variable Name"][cols["Data Type"] == "numeric"].to_list()

['age',
 'height',
 'pre_icu_los_days',
 'weight',
 'albumin_apache',
 'bilirubin_apache',
 'bun_apache',
 'creatinine_apache',
 'fio2_apache',
 'glucose_apache',
 'heart_rate_apache',
 'hematocrit_apache',
 'map_apache',
 'paco2_apache',
 'paco2_for_ph_apache',
 'pao2_apache',
 'ph_apache',
 'resprate_apache',
 'sodium_apache',
 'temp_apache',
 'urineoutput_apache',
 'wbc_apache',
 'd1_diasbp_invasive_max',
 'd1_diasbp_invasive_min',
 'd1_diasbp_max',
 'd1_diasbp_min',
 'd1_diasbp_noninvasive_max',
 'd1_diasbp_noninvasive_min',
 'd1_heartrate_max',
 'd1_heartrate_min',
 'd1_mbp_invasive_max',
 'd1_mbp_invasive_min',
 'd1_mbp_max',
 'd1_mbp_min',
 'd1_mbp_noninvasive_max',
 'd1_mbp_noninvasive_min',
 'd1_resprate_max',
 'd1_resprate_min',
 'd1_spo2_max',
 'd1_spo2_min',
 'd1_sysbp_invasive_max',
 'd1_sysbp_invasive_min',
 'd1_sysbp_max',
 'd1_sysbp_min',
 'd1_sysbp_noninvasive_max',
 'd1_sysbp_noninvasive_min',
 'd1_temp_max',
 'd1_temp_min',
 'h1_diasbp_invasive_max',
 'h1_diasbp_inva

In [19]:
coef_vals = pd.DataFrame({"variable": cols["Variable Name"][cols["Data Type"] == "numeric"], "coef": np.round(clf.coef_, 2)[0]})
non_zero_coefs = coef_vals[coef_vals["coef"] != 0]
non_zero_coefs

Unnamed: 0,variable,coef
4,age,1.27
9,height,0.15
16,pre_icu_los_days,3.36
18,weight,-0.59
19,albumin_apache,0.11
25,bun_apache,-1.14
26,creatinine_apache,0.33
27,fio2_apache,0.46
32,glucose_apache,-0.36
33,heart_rate_apache,-0.36


In [23]:
X[non_zero_coefs['variable']]

Unnamed: 0,age,height,pre_icu_los_days,weight,albumin_apache,bun_apache,creatinine_apache,fio2_apache,glucose_apache,heart_rate_apache,hematocrit_apache,map_apache,paco2_apache,paco2_for_ph_apache,pao2_apache,ph_apache,resprate_apache,sodium_apache,urineoutput_apache,wbc_apache,d1_diasbp_invasive_max,d1_diasbp_invasive_min,d1_diasbp_max,d1_diasbp_min,d1_heartrate_max,d1_heartrate_min,d1_mbp_invasive_max,d1_mbp_invasive_min,d1_mbp_max,d1_mbp_min,d1_mbp_noninvasive_min,d1_resprate_max,d1_resprate_min,d1_spo2_max,d1_spo2_min,d1_sysbp_invasive_max,d1_sysbp_invasive_min,d1_sysbp_max,d1_sysbp_min,d1_sysbp_noninvasive_min,d1_temp_max,d1_temp_min,h1_diasbp_invasive_min,h1_diasbp_max,h1_diasbp_min,h1_heartrate_max,h1_heartrate_min,h1_mbp_invasive_max,h1_mbp_max,h1_mbp_min,h1_mbp_noninvasive_max,h1_mbp_noninvasive_min,h1_resprate_max,h1_resprate_min,h1_spo2_max,h1_spo2_min,h1_sysbp_invasive_max,h1_sysbp_invasive_min,h1_sysbp_max,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_temp_max,h1_temp_min,d1_albumin_min,d1_bilirubin_max,d1_bilirubin_min,d1_bun_min,d1_calcium_max,d1_creatinine_max,d1_creatinine_min,d1_glucose_max,d1_glucose_min,d1_hco3_max,d1_hco3_min,d1_hemaglobin_max,d1_hemaglobin_min,d1_hematocrit_max,d1_hematocrit_min,d1_inr_max,d1_inr_min,d1_lactate_max,d1_lactate_min,d1_platelets_max,d1_platelets_min,d1_potassium_max,d1_potassium_min,d1_sodium_max,d1_sodium_min,d1_wbc_max,d1_wbc_min,h1_albumin_max,h1_albumin_min,h1_bun_max,h1_calcium_max,h1_calcium_min,h1_creatinine_min,h1_hco3_max,h1_hematocrit_min,h1_inr_max,h1_inr_min,h1_lactate_min,h1_platelets_min,h1_potassium_max,h1_sodium_max,h1_wbc_max,d1_arterial_pco2_max,d1_arterial_pco2_min,d1_arterial_ph_max,d1_arterial_ph_min,d1_arterial_po2_max,d1_arterial_po2_min,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,apache_4a_hospital_death_prob,apache_4a_icu_death_prob
0,68.0,180.3,0.541667,73.9,2.3,31.0,2.51,,168.0,118.0,27.4,40.0,,,,,36.0,134.0,,14.10,46.0,32.0,68.0,37.0,119.0,72.0,66.0,40.0,89.0,46.0,46.0,34.0,10.0,100.0,74.0,122.0,64.0,131.0,73.0,73.0,39.9,37.2,,68.0,63.0,119.0,108.0,,86.0,85.0,86.0,85.0,26.0,18.0,100.0,74.0,,,131.0,115.0,131.0,39.5,37.5,2.3,0.4,0.4,30.0,8.5,2.51,2.23,168.0,109.0,19.0,15.0,8.9,8.9,27.4,27.4,,,1.3,1.0,233.0,233.0,4.0,3.4,136.0,134.0,14.10,14.10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.10,0.05
1,77.0,160.0,0.927778,70.2,,9.0,0.56,1.0,145.0,120.0,36.9,46.0,37.0,37.0,51.0,7.45,33.0,145.0,,12.70,,,95.0,31.0,118.0,72.0,,,120.0,38.0,38.0,32.0,12.0,100.0,70.0,,,159.0,67.0,67.0,36.3,35.1,,61.0,48.0,114.0,100.0,,85.0,57.0,85.0,57.0,31.0,28.0,95.0,70.0,,,95.0,71.0,95.0,36.3,36.3,1.6,0.5,0.5,9.0,8.6,0.71,0.56,145.0,128.0,27.0,26.0,11.3,11.1,36.9,36.1,1.300,1.300,3.5,3.5,557.0,487.0,4.2,3.8,145.0,145.0,23.30,12.70,,,9.0,8.6,8.6,0.56,27.0,36.9,1.300,1.300,3.5,557.0,4.2,145.0,12.70,37.0,37.0,7.45,7.45,51.0,51.0,54.800000,51.000000,37.000,7.45,51.0,51.0,51.000000,0.47,0.29
2,25.0,172.7,0.000694,95.3,,,,,,102.0,,68.0,,,,,37.0,,,,,,88.0,48.0,96.0,68.0,,,102.0,68.0,68.0,21.0,8.0,98.0,91.0,,,148.0,105.0,105.0,37.0,36.7,,88.0,58.0,96.0,78.0,,91.0,83.0,91.0,83.0,20.0,16.0,98.0,91.0,,,148.0,124.0,148.0,36.7,36.7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.00,0.00
3,81.0,165.1,0.000694,61.7,,,,0.6,185.0,114.0,25.9,60.0,30.0,30.0,142.0,7.39,4.0,,,8.00,62.0,30.0,48.0,42.0,116.0,92.0,92.0,52.0,84.0,84.0,84.0,23.0,7.0,100.0,95.0,164.0,78.0,158.0,84.0,84.0,38.0,34.8,44.0,62.0,44.0,100.0,96.0,92.0,92.0,71.0,,,12.0,11.0,100.0,99.0,136.0,106.0,136.0,106.0,,35.6,34.8,,,,,,,,185.0,88.0,,,11.6,8.9,34.0,25.9,1.600,1.100,,,198.0,43.0,5.0,3.5,,,9.00,8.00,,,,,,,,34.0,1.600,1.100,,43.0,,,8.80,37.0,27.0,7.44,7.34,337.0,102.0,342.500000,236.666667,33.000,7.37,337.0,265.0,337.000000,0.04,0.03
4,19.0,188.0,0.073611,,,,,,,60.0,,103.0,,,,,16.0,,,,,,99.0,57.0,89.0,60.0,,,104.0,90.0,90.0,18.0,16.0,100.0,96.0,,,147.0,120.0,120.0,37.2,36.7,,99.0,68.0,89.0,76.0,,104.0,92.0,104.0,92.0,,,100.0,100.0,,,130.0,120.0,130.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,67.0,190.5,0.000694,100.0,,13.0,0.71,,156.0,113.0,44.2,130.0,,,,,35.0,137.0,,10.90,,,100.0,61.0,113.0,83.0,,,127.0,80.0,80.0,32.0,10.0,97.0,91.0,,,173.0,107.0,107.0,36.8,36.6,,89.0,89.0,83.0,83.0,,111.0,111.0,111.0,111.0,12.0,12.0,97.0,97.0,,,143.0,143.0,143.0,36.7,36.7,,,,13.0,8.8,0.71,0.71,156.0,125.0,27.0,27.0,15.6,15.6,44.2,44.2,1.100,1.100,,,159.0,159.0,3.9,3.7,137.0,137.0,10.90,10.90,,,,,,,,,1.100,1.100,,,,,,,,,,,,,,,,,,,0.05,0.02
6,59.0,165.1,0.000694,156.6,,18.0,0.78,1.0,197.0,133.0,33.5,138.0,43.0,43.0,370.0,7.42,53.0,135.0,,5.90,107.0,65.0,76.0,68.0,112.0,70.0,138.0,84.0,117.0,97.0,97.0,38.0,16.0,100.0,87.0,191.0,116.0,151.0,133.0,133.0,37.2,35.0,79.0,107.0,79.0,79.0,72.0,138.0,117.0,117.0,117.0,117.0,18.0,18.0,100.0,100.0,191.0,163.0,191.0,163.0,,36.8,35.0,,,,11.0,9.3,0.85,0.78,197.0,129.0,33.0,30.0,11.9,10.7,37.5,33.5,,,,,295.0,278.0,5.0,4.2,136.0,135.0,9.30,5.90,,,18.0,8.7,8.7,0.78,30.0,33.5,,,,278.0,4.2,135.0,5.90,43.0,43.0,7.42,7.42,370.0,370.0,370.000000,370.000000,43.000,7.42,370.0,370.0,370.000000,0.10,0.05
7,70.0,165.0,0.002083,,,48.0,2.05,,164.0,120.0,22.6,60.0,,,,,28.0,140.0,,12.80,,,84.0,46.0,118.0,86.0,,,114.0,60.0,60.0,28.0,12.0,100.0,92.0,,,147.0,71.0,71.0,38.5,36.6,,74.0,55.0,118.0,114.0,,88.0,60.0,88.0,60.0,28.0,26.0,96.0,92.0,,,119.0,106.0,119.0,38.5,38.5,,,,48.0,7.8,2.05,2.05,129.0,129.0,29.0,29.0,7.8,7.8,25.5,25.5,,,,,260.0,260.0,5.8,2.4,140.0,140.0,12.80,12.80,,,,,,,,,,,,,,,,43.0,43.0,7.38,7.38,89.0,89.0,,,,,,,,0.11,0.06
8,45.0,170.2,0.009028,,2.7,15.0,1.16,1.0,380.0,82.0,37.9,66.0,60.0,60.0,92.0,7.14,14.0,142.0,,24.70,64.0,52.0,65.0,59.0,82.0,82.0,72.0,66.0,93.0,71.0,71.0,24.0,19.0,97.0,97.0,94.0,72.0,104.0,98.0,98.0,36.9,36.9,52.0,65.0,59.0,82.0,82.0,72.0,93.0,71.0,93.0,71.0,24.0,19.0,97.0,97.0,94.0,72.0,104.0,98.0,104.0,36.9,36.9,2.7,0.2,0.2,15.0,7.3,1.16,1.16,365.0,288.0,23.0,23.0,12.3,12.3,37.9,37.9,1.200,1.200,5.9,5.9,226.0,226.0,5.2,5.2,142.0,142.0,24.70,24.70,2.7,2.7,15.0,7.3,7.3,1.16,23.0,37.9,1.200,1.200,5.9,226.0,5.2,142.0,24.70,60.0,33.0,7.62,6.99,256.0,92.0,92.000000,92.000000,60.000,7.14,92.0,92.0,92.000000,,
9,50.0,175.3,0.060417,79.0,3.6,10.0,0.83,,134.0,94.0,37.2,58.0,,,,,46.0,139.0,,8.40,74.0,57.0,83.0,48.0,96.0,57.0,92.0,73.0,101.0,59.0,59.0,44.0,14.0,100.0,96.0,126.0,103.0,135.0,78.0,78.0,37.1,36.4,62.0,83.0,61.0,96.0,60.0,92.0,101.0,77.0,101.0,77.0,29.0,17.0,100.0,96.0,126.0,106.0,135.0,103.0,135.0,36.9,36.9,3.6,0.4,0.4,10.0,8.3,0.83,0.83,134.0,134.0,28.0,28.0,12.8,12.8,37.2,37.2,,,,,232.0,232.0,4.1,3.3,139.0,139.0,12.80,12.80,3.6,3.6,10.0,8.3,8.3,0.83,28.0,37.2,,,,232.0,3.3,139.0,12.80,,,,,,,,,,,,,,0.02,0.01


In [25]:
X_scaled = X[non_zero_coefs['variable']]
# fill in NA values
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X_scaled = imp.fit_transform(X_scaled)

# PCA on numerical values
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_scaled)
X_scaled

array([[0.71232877, 0.73814009, 0.13849782, ..., 0.29853242, 0.55276382,
        0.53299492],
       [0.83561644, 0.39047782, 0.14059582, ..., 0.01327434, 0.73869347,
        0.65482234],
       [0.12328767, 0.60798082, 0.13555836, ..., 0.29853242, 0.50251256,
        0.50761421],
       ...,
       [0.43835616, 0.56516527, 0.13580741, ..., 0.29853242, 0.52763819,
        0.5177665 ],
       [0.63437693, 0.3031341 , 0.13599985, ..., 0.29853242, 0.53768844,
        0.5177665 ],
       [0.90410959, 0.39047782, 0.1356527 , ..., 0.29853242, 0.59798995,
        0.54822335]])

In [26]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clf = LinearDiscriminantAnalysis()
clf.fit(X_scaled, y)
metrics.roc_auc_score(y, clf.predict(X_scaled))



0.6602819293642654

In [96]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_scaled, y)
metrics.roc_auc_score(y, gnb.predict(X_scaled))

0.7077989881421823

In [50]:
cols["Data Type"][cols["Variable Name"] == "bmi"] = "numeric"

In [51]:
# let's add in the categoricals
X_cat = raw_X[cols["Variable Name"][cols["Data Type"] == "string"]]
X_cat.head()

Unnamed: 0,ethnicity,gender,hospital_admit_source,icu_admit_source,icu_stay_type,icu_type,apache_2_diagnosis,apache_3j_diagnosis,apache_3j_bodysystem,apache_2_bodysystem
0,Caucasian,M,Floor,Floor,admit,CTICU,113.0,502.01,Sepsis,Cardiovascular
1,Caucasian,F,Floor,Floor,admit,Med-Surg ICU,108.0,203.01,Respiratory,Respiratory
2,Caucasian,F,Emergency Department,Accident & Emergency,admit,Med-Surg ICU,122.0,703.03,Metabolic,Metabolic
3,Caucasian,F,Operating Room,Operating Room / Recovery,admit,CTICU,203.0,1206.03,Cardiovascular,Cardiovascular
4,Caucasian,M,,Accident & Emergency,admit,Med-Surg ICU,119.0,601.01,Trauma,Trauma


In [88]:
{var: pd.unique(X_cat[var]).size for var in cols["Variable Name"][cols["Data Type"] == "string"]}

{'ethnicity': 7,
 'gender': 3,
 'hospital_admit_source': 16,
 'icu_admit_source': 6,
 'icu_stay_type': 3,
 'icu_type': 8,
 'apache_2_diagnosis': 45,
 'apache_3j_diagnosis': 400,
 'apache_3j_bodysystem': 12,
 'apache_2_bodysystem': 11}

In [84]:
grouped = raw_X.groupby(["ethnicity", "hospital_death"])["hospital_death"].count()
pd.DataFrame(grouped)

Unnamed: 0_level_0,Unnamed: 1_level_0,hospital_death
ethnicity,hospital_death,Unnamed: 2_level_1
African American,0,8797
African American,1,750
Asian,0,1036
Asian,1,93
Caucasian,0,64516
Caucasian,1,6168
Hispanic,0,3420
Hispanic,1,376
Native American,0,718
Native American,1,70


In [86]:
np.unique([i if type(i)==str else "nan" for i in X_cat["ethnicity"] ], return_counts=True)

(array(['African American', 'Asian', 'Caucasian', 'Hispanic',
        'Native American', 'Other/Unknown', 'nan'], dtype='<U16'),
 array([ 9547,  1129, 70684,  3796,   788,  4374,  1395]))

In [69]:
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral3
output_notebook()
levels, counts =np.unique([i if type(i)==str else "nan" for i in X_cat["ethnicity"] ], return_counts=True)

p = figure(x_range=levels, plot_height=250, title="Fruit Counts",
           toolbar_location=None, tools="")

p.vbar(x=levels, top=counts, width=0.9)

p.xgrid.grid_line_color = None
p.y_range.start = 0

data = {'levels' : levels,
        '2015'   : [2, 1, 4, 3, 2, 4],
        '2016'   : [5, 3, 4, 2, 4, 6],
        '2017'   : [3, 2, 4, 4, 5, 3]}

p = figure(x_range=fruits, plot_height=250, title="Fruit Counts by Year",
           toolbar_location=None, tools="")

p.vbar_stack(years, x='fruits', width=0.9, color=colors, source=data,
             legend_label=years)

p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.axis.minor_tick_line_color = None
p.outline_line_color = None
p.legend.location = "top_left"
p.legend.orientation = "horizontal"

show(p)

In [99]:
pd.get_dummies(raw_X["icu_admit_source"], prefix="icu_admit_source")

Unnamed: 0,icu_admit_source_Accident & Emergency,icu_admit_source_Floor,icu_admit_source_Operating Room / Recovery,icu_admit_source_Other Hospital,icu_admit_source_Other ICU
0,0,1,0,0,0
1,0,1,0,0,0
2,1,0,0,0,0
3,0,0,1,0,0
4,1,0,0,0,0
5,1,0,0,0,0
6,1,0,0,0,0
7,1,0,0,0,0
8,0,0,0,1,0
9,1,0,0,0,0


In [100]:
X_add_cat = pd.DataFrame(X_scaled).join(pd.get_dummies(raw_X["ethnicity"], prefix="ethnicity"))
X_add_cat = pd.DataFrame(X_add_cat).join(pd.get_dummies(raw_X["gender"], prefix="gender"))
X_add_cat = pd.DataFrame(X_add_cat).join(pd.get_dummies(raw_X["hospital_admit_source"], prefix="hospital_admit_source"))
X_add_cat = pd.DataFrame(X_add_cat).join(pd.get_dummies(raw_X["icu_admit_source"], prefix="icu_admit_source"))
X_add_cat = pd.DataFrame(X_add_cat).join(pd.get_dummies(raw_X["icu_stay_type"], prefix="icu_stay_type"))
X_add_cat = pd.DataFrame(X_add_cat).join(pd.get_dummies(raw_X["icu_type"], prefix="icu_type"))

In [101]:
gnb = GaussianNB()
gnb.fit(X_add_cat, y)
metrics.roc_auc_score(y, gnb.predict(X_add_cat))

0.7243313655765495