In [1]:
import numpy as np
import pandas as pd
import utils

import copy, math, os, pickle, time 

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, GammaRegressor
from sklearn.svm import LinearSVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

from sklearn.pipeline import Pipeline

from scipy.stats import pearsonr, spearmanr, kendalltau

# To show all columns in a dataframe
pd.options.display.max_info_columns=250
pd.options.display.max_columns=500

# To make pretty plots
import matplotlib.pyplot as plt
import seaborn as sns

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old aliases, so try the new name first and fall back to
# the legacy one. If neither exists, sns.set_style('ticks') below still applies.
for _style_name in ('seaborn-v0_8-ticks', 'seaborn-ticks'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_style('ticks')

# Notebook-wide figure defaults (later cells may override figure.figsize).
plt.rcParams.update({
    'figure.figsize': (6, 4),
    'axes.titlesize': 22,
    'axes.labelsize': 20,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
})

%matplotlib inline

In [2]:
# Name of the regression target column used throughout the notebook
label = "log_duration"

# Read the MIMIC training split and discard the starttime/endtime columns
df_train = (
    pd.read_csv("../data/mimic-ft98-clustered-S0-train.csv")
    .drop(columns=["starttime", "endtime"])
)

print(df_train.shape)
df_train.head()

(10121, 107)


Unnamed: 0,stay_id,admission_location,insurance,language,ethnicity,marital_status,gender,age,hours_in_hosp_before_intubation,weight,height,co2_total_max,co2_total_avg,co2_total_min,ph_max,ph_avg,ph_min,lactate_max,lactate_avg,lactate_min,pao2fio2ratio,heart_rate_max,heart_rate_avg,heart_rate_min,mbp_max,mbp_avg,mbp_min,mbp_ni_max,mbp_ni_avg,mbp_ni_min,resp_rate_max,resp_rate_avg,resp_rate_min,temp_max,temp_avg,temp_min,spo2_max,spo2_avg,spo2_min,glucose_max,glucose_avg,glucose_min,vasopressin,epinephrine,dobutamine,norepinephrine,phenylephrine,dopamine,count_of_vaso,fio2_max,fio2_avg,fio2_min,peep_max,peep_avg,peep_min,plateau_pressure_max,plateau_pressure_avg,plateau_pressure_min,rrt,sinus_rhythm,neuroblocker,congestive_heart_failure,cerebrovascular_disease,dementia,chronic_pulmonary_disease,rheumatic_disease,mild_liver_disease,diabetes_without_cc,diabetes_with_cc,paraplegia,renal_disease,malignant_cancer,severe_liver_disease,metastatic_solid_tumor,aids,SOFA,respiration,coagulation,liver,cardiovascular,cns,renal,apsiii,hr_score,mbp_score,temp_score,resp_rate_score,pao2_aado2_score,hematocrit_score,wbc_score,creatinine_score,uo_score,bun_score,sodium_score,albumin_score,bilirubin_score,glucose_score,acidbase_score,gcs_score,duration,log_duration,over72h,alive96h,pc1,pc2,pc3,cluster
0,38910812,EMERGENCY ROOM,Other,ENGLISH,UNKNOWN,SINGLE,M,56,17,77.0,,19.0,18.0,17.0,7.33,7.305,7.28,7.4,6.95,6.5,108.0,82.0,72.44,65.0,100.0,73.72,57.0,63.0,63.0,63.0,26.0,22.8,20.0,37.06,36.551667,36.0,98.0,94.84,92.0,136.0,102.4,62.0,0,0,0,0,1,0,1,50.0,50.0,50.0,6.0,5.6,5.0,16.0,16.0,16.0,0,1.0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,14,3.0,2.0,3.0,1.0,4,1.0,118,5.0,15.0,0.0,6.0,2.0,3.0,0.0,4.0,8.0,11.0,2.0,0.0,8.0,0.0,6.0,48.0,75.033333,4.317932,1,0,60.593486,-5.996556,-2.941236,4
1,38388229,EMERGENCY ROOM,Other,ENGLISH,BLACK/AFRICAN AMERICAN,MARRIED,M,81,45,95.5,180.0,23.0,22.5,22.0,7.44,7.435,7.43,,,,210.0,110.0,89.333333,54.0,103.0,83.269231,71.0,91.0,80.555556,71.0,33.0,23.94,16.0,38.61,37.426667,36.67,100.0,98.666667,96.0,205.0,162.666667,109.0,0,0,0,0,1,0,1,50.0,42.5,40.0,5.0,5.0,5.0,16.0,15.333333,15.0,0,0.0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,5,,1.0,0.0,0.0,3,1.0,60,5.0,7.0,0.0,6.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,33.0,213.633333,5.364261,1,1,-0.342315,-19.28656,3.687725,3
2,31753166,TRANSFER FROM HOSPITAL,Medicare,ENGLISH,WHITE,MARRIED,M,91,73,79.5,175.0,27.0,26.333333,26.0,7.49,7.46,7.43,,,,300.0,74.0,61.65625,60.0,128.0,72.3625,45.0,95.0,73.681818,45.0,38.0,19.234375,14.0,38.0,37.13,36.22,100.0,98.15625,94.0,72.0,71.5,71.0,0,0,0,0,0,0,0,100.0,48.75,30.0,10.0,5.7,5.0,24.0,20.916667,20.0,0,0.0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,7,2.0,0.0,,0.0,3,2.0,72,0.0,10.0,0.0,6.0,5.0,3.0,0.0,7.0,4.0,11.0,0.0,,,0.0,2.0,24.0,90.416667,4.504429,1,1,7.868908,-4.134309,-5.318351,3
3,30003299,EMERGENCY ROOM,Other,ENGLISH,WHITE,SINGLE,M,26,1,120.0,178.0,29.0,24.888889,21.0,7.4,7.335556,7.27,4.0,2.777778,1.5,280.0,133.0,119.5,101.0,122.0,93.071429,70.0,,,,18.0,17.105263,12.0,37.44,36.971667,36.39,100.0,98.555556,96.0,185.0,152.166667,130.0,0,0,0,0,0,0,0,50.0,48.333333,40.0,5.0,5.0,5.0,25.0,23.6,22.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0.0,0.0,,0.0,3,0.0,48,7.0,7.0,0.0,0.0,0.0,3.0,0.0,0.0,4.0,0.0,0.0,,,0.0,12.0,15.0,154.183333,5.038142,1,1,-18.294562,-7.29107,3.911806,2
4,31166711,EMERGENCY ROOM,Other,ENGLISH,WHITE,SINGLE,M,42,77,97.6,183.0,32.0,20.75,15.0,7.22,7.1565,7.0,6.4,4.485,2.2,72.0,150.0,128.5,113.0,88.0,67.607143,47.0,,,,35.0,16.017857,10.0,39.8,38.15,37.3,100.0,90.62069,78.0,173.0,120.421053,77.0,1,1,0,1,1,0,4,100.0,100.0,100.0,16.0,12.769231,10.0,32.0,26.5,21.0,1,0.0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,14,4.0,1.0,2.0,4.0,1,2.0,92,13.0,15.0,0.0,9.0,0.0,3.0,0.0,7.0,5.0,7.0,2.0,11.0,5.0,0.0,12.0,3.0,420.283333,6.040929,1,1,19.069108,25.24142,6.178101,2


**Summary statistics**

In [3]:
# Per-column summary statistics (count, mean, std, quartiles, min/max)
df_train.describe()

Unnamed: 0,stay_id,age,hours_in_hosp_before_intubation,weight,height,co2_total_max,co2_total_avg,co2_total_min,ph_max,ph_avg,ph_min,lactate_max,lactate_avg,lactate_min,pao2fio2ratio,heart_rate_max,heart_rate_avg,heart_rate_min,mbp_max,mbp_avg,mbp_min,mbp_ni_max,mbp_ni_avg,mbp_ni_min,resp_rate_max,resp_rate_avg,resp_rate_min,temp_max,temp_avg,temp_min,spo2_max,spo2_avg,spo2_min,glucose_max,glucose_avg,glucose_min,vasopressin,epinephrine,dobutamine,norepinephrine,phenylephrine,dopamine,count_of_vaso,fio2_max,fio2_avg,fio2_min,peep_max,peep_avg,peep_min,plateau_pressure_max,plateau_pressure_avg,plateau_pressure_min,rrt,sinus_rhythm,neuroblocker,congestive_heart_failure,cerebrovascular_disease,dementia,chronic_pulmonary_disease,rheumatic_disease,mild_liver_disease,diabetes_without_cc,diabetes_with_cc,paraplegia,renal_disease,malignant_cancer,severe_liver_disease,metastatic_solid_tumor,aids,SOFA,respiration,coagulation,liver,cardiovascular,cns,renal,apsiii,hr_score,mbp_score,temp_score,resp_rate_score,pao2_aado2_score,hematocrit_score,wbc_score,creatinine_score,uo_score,bun_score,sodium_score,albumin_score,bilirubin_score,glucose_score,acidbase_score,gcs_score,duration,log_duration,over72h,alive96h,pc1,pc2,pc3,cluster
count,10121.0,10121.0,10121.0,10052.0,7645.0,9307.0,9307.0,9307.0,9307.0,9307.0,9307.0,7961.0,7961.0,7961.0,8919.0,10102.0,10102.0,10102.0,10102.0,10102.0,10102.0,8365.0,8365.0,8365.0,10102.0,10102.0,10102.0,9448.0,9448.0,9448.0,10098.0,10098.0,10098.0,10067.0,10079.0,10079.0,10121.0,10121.0,10121.0,10121.0,10121.0,10121.0,10121.0,10113.0,10113.0,10113.0,10101.0,10101.0,10101.0,8816.0,8816.0,8816.0,10121.0,10107.0,10121.0,10121.0,10121.0,10121.0,10121.0,10121.0,10121.0,10121.0,10121.0,10121.0,10121.0,10121.0,10121.0,10121.0,10121.0,10121.0,8605.0,10084.0,6855.0,10103.0,10121.0,10119.0,10121.0,10103.0,10103.0,9568.0,10100.0,6717.0,10086.0,10085.0,10086.0,9870.0,10087.0,10082.0,5355.0,6855.0,10118.0,9135.0,9593.0,10121.0,10121.0,10121.0,10121.0,10121.0,10121.0,10121.0,10121.0
mean,35009300.0,63.818793,79.396601,83.35948,169.166014,26.624154,24.747415,22.842914,7.414035,7.365912,7.309904,3.178588,2.466506,1.83338,249.041288,107.520689,86.762397,71.336191,105.551458,76.331033,55.500709,93.967962,74.149098,59.173939,27.38532,19.586702,12.688725,37.742496,37.050551,36.326662,99.768865,97.776535,92.649039,242.200109,394.997971,110.192975,0.136844,0.076178,0.021737,0.402628,0.274084,0.041794,0.953265,68.694749,50.717608,43.053001,8.234768,6.943523,5.435967,21.248673,19.176749,17.207827,0.088529,0.641338,0.067681,0.287027,0.179528,0.031321,0.262029,0.031222,0.159767,0.216876,0.08922,0.06946,0.204328,0.111254,0.084379,0.04713,0.008102,8.286039,2.294364,0.692582,0.693508,1.993467,2.078846,1.107026,66.353819,4.033752,12.17381,1.238503,4.713168,1.482805,2.880726,1.052851,2.713167,4.873759,5.80004,0.610196,1.922876,1.95186,1.560486,5.795621,17.325237,128.66401,4.4544,0.519711,0.908803,0.133952,-0.023564,0.003595,2.146527
std,2877692.0,16.414636,255.690005,26.20366,10.641055,5.720919,5.677617,6.213692,0.072152,0.077016,0.1094,2.912881,2.060752,1.431109,130.009128,21.4288,16.720484,15.994705,28.940976,9.786836,13.839022,20.364397,11.347417,13.068822,6.286378,3.974822,4.110161,0.884026,0.713191,0.969678,0.829207,2.270479,7.500699,4982.651189,8623.867048,38.755421,0.3437,0.265296,0.145831,0.490451,0.446073,0.200129,1.06586,25.014432,12.417794,9.35578,4.002746,2.956503,2.851863,7.6771,5.017432,4.694608,0.284076,0.479632,0.25121,0.452397,0.383813,0.174192,0.43976,0.173926,0.366408,0.412138,0.285076,0.254247,0.403229,0.314462,0.277969,0.211927,0.08965,4.195505,1.486002,0.974586,1.10567,1.495971,1.387731,1.325734,28.296472,4.310755,5.199988,3.355488,3.966091,3.797653,0.5862,2.405633,3.362619,4.365005,4.297896,1.001144,3.416793,4.219608,2.170969,4.463503,17.091751,148.725237,0.850314,0.499636,0.287903,31.339567,11.934686,5.025754,1.065897
min,30000670.0,18.0,0.0,1.0,122.0,10.0,6.666667,4.0,7.03,6.994,6.7,0.4,0.4,0.2,18.0,49.0,40.12,2.0,57.0,44.451515,1.0,24.0,24.0,9.0,10.5,9.291667,1.0,31.6,30.414,15.0,79.0,53.5,1.0,51.0,51.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,21.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,3.178054,0.0,0.0,-64.220939,-34.889837,-19.166861,1.0
25%,32535120.0,54.0,2.0,66.4,163.0,23.0,21.160256,19.0,7.37,7.32,7.25,1.4,1.26,1.1,153.333333,92.0,74.791209,60.0,90.0,69.666667,50.0,80.0,66.677419,51.0,23.0,16.660977,10.0,37.17,36.713333,36.11,100.0,96.809524,91.0,135.0,113.666667,86.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,41.428571,40.0,5.0,5.0,5.0,17.0,15.666667,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,0.0,0.0,1.0,1.0,0.0,45.0,0.0,7.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,42.0,3.73767,0.0,1.0,-23.84966,-7.72254,-3.241901,1.0
50%,35030830.0,66.0,5.0,79.9,170.0,26.0,24.25,23.0,7.42,7.37,7.32,2.1,1.8,1.4,230.0,106.0,85.516129,70.0,100.0,74.893765,57.0,92.0,72.92,58.0,26.5,19.028595,13.0,37.67,37.070917,36.56,100.0,98.28,94.0,169.0,135.05,105.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,60.0,49.090909,40.0,6.1,5.3625,5.0,20.0,18.6,17.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,3.0,0.0,0.0,1.0,2.0,1.0,63.0,5.0,15.0,0.0,6.0,0.0,3.0,0.0,0.0,5.0,7.0,0.0,0.0,0.0,0.0,5.0,15.0,76.0,4.330733,1.0,1.0,-4.792806,-0.000998,-0.016408,2.0
75%,37463800.0,76.0,51.0,96.0,178.0,29.0,27.666667,26.0,7.46,7.42,7.39,3.8,2.866667,2.0,325.0,121.0,97.774457,81.0,112.0,81.622396,63.0,105.0,80.363636,66.0,31.0,21.984115,15.0,38.22,37.461206,36.89,100.0,99.333333,97.0,221.0,167.775,129.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,100.0,57.142857,50.0,10.0,8.333333,5.0,25.0,22.14881,20.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,4.0,1.0,1.0,4.0,3.0,2.0,85.0,7.0,15.0,0.0,6.0,0.0,3.0,1.0,7.0,7.0,11.0,2.0,4.0,0.0,3.0,12.0,29.0,158.5,5.065755,1.0,1.0,20.487845,7.423535,3.260959,3.0
max,39998270.0,98.0,8942.0,710.0,203.0,80.0,57.75,56.0,7.78,7.6,7.6,28.7,25.183333,21.2,1563.333333,237.0,161.677419,141.0,298.0,214.236607,109.0,236.0,150.0,150.0,69.0,39.5,32.0,42.3,40.010625,39.8,100.0,100.0,100.0,500036.0,500057.0,1103.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,100.0,100.0,100.0,84.0,28.857143,26.0,247.0,92.666667,43.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,23.0,4.0,4.0,4.0,4.0,4.0,4.0,189.0,17.0,23.0,20.0,18.0,15.0,3.0,19.0,10.0,15.0,12.0,4.0,11.0,16.0,9.0,12.0,48.0,2389.733333,7.778937,1.0,1.0,127.416892,51.255218,20.447119,4.0


**Drop constant variables**

In [4]:
# Discard every column that holds exactly one distinct non-null value.
# (Columns with zero distinct values, i.e. all-NaN, are not removed by this test.)
n_distinct = df_train.nunique()
df_train = df_train.loc[:, n_distinct != 1]
df_train.shape

(10121, 107)

In [5]:
# df_train = df_train[df_train.cluster==4]

### Feature selection

In [20]:
# Placeholder default; reassigned further down in this cell by the active selection
features=None

# # From Douglas
# features = ["apsiii",
#             "peep_min",
#             "gcs_score",
#             "acidbase_score",
#             "plateau_pressure_max",
#             "paraplegia",
#             "resp_rate_min",
#             "height",
#             "temp_max",
#             "co2_total_avg",
#             "fio2_min",
#             "cardiovascular",
#             "neuroblocker"]
# features = ["apsiii",
#             "peep_avg",
#             "gcs_score",
#             "acidbase_score",
#             "paraplegia",
#             "temp_max",
#             "height",
#             "resp_rate_min",
#             "SOFA",
#             "plateau_pressure_max",
#             "fio2_min",
#             "cardiovascular",
#             "neuroblocker",
#             "peep_min"]
# features = ['ph_max', 'spo2_min',
#             'heart_rate_min', 'heart_rate_max', 
#             'resp_rate_min', 'resp_rate_max',
#             'temp_min', 'temp_max', 
#             'glucose_max', 'glucose_min', 
#             'co2_total_max', 'co2_total_min', 
#             'mbp_max', 'mbp_ni_min', 
#             'apsiii', 
#             'peep_max', 'peep_min']
# features = ["acidbase_score",
#             "cns",
#             "co2_total_avg",
#             "gcs_score",
#             "height",
# #             "malignant_cancer",
# #             "paraplegia",
#             "peep_avg",
#             "peep_min",
#             "resp_rate_min",
#             "temp_avg",
#             "uo_score", 
# #             "cluster",
#            ]

# # All eICU features
# features = ['ph_max', 'spo2_min',
#        'heart_rate_min', 'heart_rate_max', 'resp_rate_min', 'resp_rate_max',
#        'temp_min', 'temp_max', 'glucose_max', 'glucose_min', 'co2_total_max',
#        'co2_total_min', 'mbp_max', 'mbp_ni_min', 'apsiii', 'peep_max',
#        'peep_min', 'co2_total_avg', 'fio2_min', 'plateau_pressure_max',
#        'height', 'peep_avg', 'temp_avg', 'hr_score', 'mbp_score', 'temp_score',
#        'resp_rate_score', 'pao2_aado2_score', 'hematocrit_score', 'wbc_score',
#        'creatinine_score', 'uo_score', 'bun_score', 'sodium_score',
#        'albumin_score', 'bilirubin_score', 'glucose_score', 'acidbase_score',
#        'gcs_score', 'SOFA', 'respiration', 'coagulation', 'liver',
#        'cardiovascular', 'cns', 'renal', 
#            ]

# Active feature set: the 12 names chosen with SelectFromModel (N=12).
# 'paraplegia' was deliberately left out of this list.
features = [
    'height', 'resp_rate_avg', 'temp_avg',
    'peep_avg', 'peep_min',
    'plateau_pressure_max', 'plateau_pressure_avg',
    'cns', 'apsiii', 'acidbase_score', 'gcs_score',
    'hours_in_hosp_before_intubation',
]

# # Using SelectFromModel N=18
# features = [
#     'age', 
#     'hours_in_hosp_before_intubation',
#     'height', 
#     'co2_total_min',
#     'lactate_min', 
#     'resp_rate_avg', 
#     'temp_avg', 
#     'glucose_max', 
#     'peep_max',
#     'peep_avg', 
#     'peep_min', 
#     'plateau_pressure_max', 
#     'plateau_pressure_avg',
# #     'paraplegia', 
#     'cns', 
#     'apsiii', 
#     'acidbase_score', 
#     'gcs_score'
# ]

# # Using SequentialFeatureSelector
# features = ['height', 'co2_total_min', 'resp_rate_min', 'temp_avg', 
# #             'epinephrine',
#        'peep_avg', 'plateau_pressure_avg', 
# #             'paraplegia', 
#             'apsiii', 'mbp_score',
#        'acidbase_score', 'gcs_score']

# Split the training frame into predictors and target for the chosen features
# (presumably a column selection; see utils.get_X_and_y for the exact behaviour).
X_train, y_train = utils.get_X_and_y(df_train, features=features, label=label)
print(X_train.shape, y_train.shape)

# Preprocessing step derived from the training columns; used by the pipeline cells below
preprocessor = utils.define_preprocessor(X_train.columns)

(10121, 13) (10121,)


### Model development

In [21]:
# Candidate regressors; only the uncommented entries are benchmarked.
regs = (
#     LinearRegression(),
#     Ridge(),
#     Lasso(),
#     ElasticNet(),
#     GammaRegressor(),
#     LinearSVR(),
#     GaussianProcessRegressor(),
#     DecisionTreeRegressor(),
#     RandomForestRegressor(),
#     ExtraTreesRegressor(),
    GradientBoostingRegressor(),
#     XGBRegressor(),  # NOTE(review): no XGBRegressor import is visible in this notebook
)

# Put each candidate behind the shared preprocessor and benchmark it
# (presumably cross-validated, judging by the "Average ... (+/- ...)" output).
# NOTE(review): `scores` is overwritten on every iteration, so only the last
# model's result survives the loop.
for reg in regs:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', reg)])
    scores = utils.benchmark_cv_score(pipe, X_train, y_train, head="reg")

________________________________________________________________________________

Model training: 
GradientBoostingRegressor()
train time: 11.040s

Average RMSE: 0.72 (+/- 0.04)
Average Pearson: 0.52 (+/- 0.04)
Average Spearman: 0.52 (+/- 0.04)


### Evaluate the model

In [22]:
# Fresh gradient-boosting pipeline for the selected feature set
reg = GradientBoostingRegressor()
preprocessor = utils.define_preprocessor(X_train.columns)
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', reg),
])

# Store what utils.benchmark_cv returns (presumably out-of-fold predictions — TODO confirm)
df_train["y_pred"] = utils.benchmark_cv(pipe, X_train, y_train, head="reg")

________________________________________________________________________________

Model training: 
train time: 11.019s


In [23]:
# Pearson correlation between observed and predicted log-duration on the training set
train_r = pearsonr(df_train.log_duration, df_train.y_pred)[0]
print("Score: %0.2f" % train_r)

Score: 0.52


In [43]:
# Age cut-off used by the next two cells to split the training cohort
age = 65

In [None]:
# Correlation restricted to training rows at or above the age cut-off
train_older = df_train[df_train.age >= age]
pearsonr(train_older.log_duration, train_older.y_pred)[0]

In [None]:
# Correlation restricted to training rows below the age cut-off
train_younger = df_train[df_train.age < age]
pearsonr(train_younger.log_duration, train_younger.y_pred)[0]

In [None]:
# Sweep the age cut-off and record the correlation on each side of it
age = [20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95]
above, below = [], []
for cutoff in age:
    older = df_train[df_train.age >= cutoff]
    younger = df_train[df_train.age < cutoff]
    above.append(pearsonr(older.log_duration, older.y_pred)[0])
    below.append(pearsonr(younger.log_duration, younger.y_pred)[0])

# One line per side of the cut-off, saved to the results folder
sns.lineplot(x=age, y=above, label="above");
sns.lineplot(x=age, y=below, label="below");
plt.xlabel("Age");
plt.ylabel("Pearson's correlation coefficient");
plt.savefig('../results/reg train on mimic test on mimic age groups.jpeg', bbox_inches='tight', dpi=300)

In [None]:
# Cut-off -> correlation among rows at or above that age
dict(zip(age,above))

In [None]:
# Cut-off -> correlation among rows below that age
dict(zip(age,below))

In [None]:
# Spread of the observed target, for comparison with the predictions in the next cell
y_train.mean(), y_train.std(), y_train.min(), y_train.max()

In [None]:
# Spread of the predicted values on the training set
df_train.y_pred.mean(), df_train.y_pred.std(), df_train.y_pred.min(), df_train.y_pred.max()

In [None]:
# One colour per cluster actually present in df_train. A fixed 2-colour list is
# too short when more clusters are plotted: depending on the seaborn version the
# palette is either rejected or recycled, making distinct clusters look alike.
n_clusters = df_train.cluster.nunique()
palette = sns.color_palette("Set2", n_clusters)

# Dashed identity line: points on it are predicted exactly.
sns.lineplot(x=[3,8], y=[3,8], linestyle="--");
sns.scatterplot(y="log_duration", x="y_pred", hue="cluster", data=df_train,
                palette=palette, alpha=0.8);
plt.legend(fontsize=14, title="Cluster", title_fontsize=16);


plt.xlim([3,6.8]);
plt.ylim([3,8]);
plt.ylabel("Observed log(duration)");
plt.xlabel("Predicted log(duration)");
# plt.savefig('../results/reg clusters MIMIC.jpeg', bbox_inches='tight', dpi=300)

In [None]:
# One regression panel per cluster. Note the axes here are observed (x) vs
# predicted (y), the opposite of the scatter plots elsewhere in this notebook.
sns.lmplot(x="log_duration", y="y_pred", col="cluster", data=df_train);

In [None]:
# NOTE(review): overwrites the model's predictions for cluster 1 with the constant
# 4.5, in place. Every later cell that reads df_train.y_pred (per-cluster
# correlations, residuals) sees the altered values, and the Pearson correlation
# of a constant series is undefined — confirm this is intentional.
df_train.loc[df_train.cluster == 1, "y_pred"] = 4.5

In [None]:
# Report the observed-vs-predicted correlation separately for every cluster,
# in order of first appearance in the frame
for i in df_train.cluster.unique():
    in_cluster = df_train[df_train.cluster == i]
    rho = pearsonr(in_cluster.log_duration, in_cluster.y_pred)[0]
    print("For cluster %d, Pearson's rho = %.2f" % (i, rho))

In [None]:
# Raw residual (observed minus predicted) and its z-scored version
residual = df_train.log_duration - df_train.y_pred
df_train["residual"] = residual
df_train["std_residual"] = (residual - residual.mean()) / residual.std()

In [None]:
# Standardised residuals against predictions; the dashed red line marks zero residual
sns.lineplot(x=[3,6], y=[0,0], linestyle="--", color="r");
sns.scatterplot(x="y_pred", y="std_residual", data=df_train, alpha=0.3);

# Evaluate on the test set

In [24]:
# Read the held-out MIMIC split and discard the starttime/endtime columns
df_test = (
    pd.read_csv("../data/mimic-ft98-clustered-S0-test.csv")
    .drop(columns=["starttime", "endtime"])
)
print(df_test.shape)
df_test.head()

(2531, 107)


Unnamed: 0,stay_id,admission_location,insurance,language,ethnicity,marital_status,gender,age,hours_in_hosp_before_intubation,weight,height,co2_total_max,co2_total_avg,co2_total_min,ph_max,ph_avg,ph_min,lactate_max,lactate_avg,lactate_min,pao2fio2ratio,heart_rate_max,heart_rate_avg,heart_rate_min,mbp_max,mbp_avg,mbp_min,mbp_ni_max,mbp_ni_avg,mbp_ni_min,resp_rate_max,resp_rate_avg,resp_rate_min,temp_max,temp_avg,temp_min,spo2_max,spo2_avg,spo2_min,glucose_max,glucose_avg,glucose_min,vasopressin,epinephrine,dobutamine,norepinephrine,phenylephrine,dopamine,count_of_vaso,fio2_max,fio2_avg,fio2_min,peep_max,peep_avg,peep_min,plateau_pressure_max,plateau_pressure_avg,plateau_pressure_min,rrt,sinus_rhythm,neuroblocker,congestive_heart_failure,cerebrovascular_disease,dementia,chronic_pulmonary_disease,rheumatic_disease,mild_liver_disease,diabetes_without_cc,diabetes_with_cc,paraplegia,renal_disease,malignant_cancer,severe_liver_disease,metastatic_solid_tumor,aids,SOFA,respiration,coagulation,liver,cardiovascular,cns,renal,apsiii,hr_score,mbp_score,temp_score,resp_rate_score,pao2_aado2_score,hematocrit_score,wbc_score,creatinine_score,uo_score,bun_score,sodium_score,albumin_score,bilirubin_score,glucose_score,acidbase_score,gcs_score,duration,log_duration,over72h,alive96h,pc1,pc2,pc3,cluster
0,39361378,EMERGENCY ROOM,Other,ENGLISH,UNKNOWN,SINGLE,F,35,2,80.0,168.0,27.0,25.333333,22.0,7.44,7.416667,7.39,4.9,4.0,3.2,367.5,134.0,102.933333,76.0,90.0,68.866667,56.0,,,,32.0,19.048387,13.0,39.44,37.451333,36.33,100.0,100.0,100.0,181.0,147.2,124.0,0,0,0,1,1,0,2,50.0,42.857143,40.0,5.0,5.0,5.0,17.0,15.5,14.0,0,1.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,12,2.0,1.0,2.0,3.0,4,0.0,105,7.0,15.0,0.0,6.0,0.0,3.0,1.0,0.0,5.0,0.0,0.0,6.0,6.0,3.0,5.0,48.0,128.75,4.857873,1,1,47.974668,-12.868124,4.931839,4
1,30754300,TRANSFER FROM HOSPITAL,Medicare,ENGLISH,WHITE,MARRIED,M,74,0,127.0,180.0,32.0,32.0,32.0,7.44,7.44,7.44,,,,194.0,117.0,82.730769,67.0,87.0,66.166667,34.0,87.0,66.166667,34.0,35.5,28.403846,20.0,37.5,37.135714,36.89,100.0,96.538462,94.0,140.0,119.2,96.0,0,0,0,0,0,0,0,50.0,50.0,50.0,12.0,11.714286,10.0,,,,0,0.0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,6,2.0,0.0,0.0,1.0,3,0.0,94,5.0,23.0,0.0,9.0,,3.0,0.0,0.0,7.0,11.0,0.0,,0.0,0.0,12.0,24.0,455.433333,6.121249,1,1,28.55929,6.192426,6.167873,3
2,31375344,TRANSFER FROM HOSPITAL,Medicare,ENGLISH,UNKNOWN,DIVORCED,M,69,16,70.0,178.0,28.0,24.166667,20.0,7.39,7.358333,7.3,1.6,1.433333,1.2,224.0,109.0,90.54,71.0,98.0,70.715686,52.0,,,,25.0,17.098039,9.0,37.8,36.844783,35.0,100.0,99.319149,91.0,129.0,103.16375,0.12,0,0,0,0,1,0,1,50.0,50.0,50.0,10.0,6.571429,0.0,14.0,13.5,13.0,0,1.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0.0,2.0,0.0,1.0,0,1.0,44,1.0,15.0,2.0,0.0,0.0,3.0,1.0,0.0,5.0,7.0,0.0,0.0,0.0,8.0,2.0,0.0,24.466667,3.197312,0,1,-27.826342,6.293059,-1.844269,1
3,31604434,EMERGENCY ROOM,Medicaid,ENGLISH,OTHER,SINGLE,M,38,2,130.0,180.0,25.0,22.666667,20.0,7.43,7.36,7.28,2.3,1.833333,1.3,217.5,106.0,96.923077,87.0,112.0,83.142857,63.0,103.0,90.5,78.0,20.0,18.980769,9.0,37.94,36.84125,35.56,100.0,97.846154,94.0,162.0,141.0,108.0,0,0,0,0,1,0,1,100.0,50.0,40.0,5.0,5.0,5.0,27.0,23.833333,22.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,3.0,1.0,0.0,1.0,3,3.0,67,1.0,7.0,2.0,0.0,0.0,3.0,1.0,10.0,15.0,7.0,0.0,,0.0,3.0,3.0,15.0,201.216667,5.304382,1,1,0.304676,3.200793,-12.423412,2
4,38669252,EMERGENCY ROOM,Medicare,ENGLISH,BLACK/AFRICAN AMERICAN,MARRIED,F,42,52,58.7,168.0,26.0,26.0,26.0,7.33,7.33,7.33,0.8,0.8,0.8,38.0,116.0,92.173913,77.0,117.0,86.782609,64.0,117.0,86.782609,64.0,26.0,16.673077,13.0,36.89,36.734,36.56,100.0,99.565217,95.0,115.0,87.578947,61.0,0,0,0,0,0,0,0,50.0,41.0,40.0,5.7,5.185714,5.0,25.0,18.333333,14.0,1,1.0,0,1,0,0,1,1,0,0,0,0,1,0,0,0,0,7,,0.0,0.0,0.0,3,4.0,85,7.0,9.0,0.0,6.0,,3.0,0.0,7.0,15.0,12.0,2.0,0.0,0.0,9.0,,15.0,30.366667,3.413346,0,1,16.774853,10.935837,-5.808469,2


In [25]:
# Same feature/label extraction as for the training frame, applied to the test split
X_test, y_test = utils.get_X_and_y(df_test, features=features, label=label)
print(X_test.shape, y_test.shape)

(2531, 13) (2531,)


In [27]:
# New, unfitted pipeline (preprocessor + gradient boosting) for the final fit
reg = GradientBoostingRegressor()
preprocessor = utils.define_preprocessor(X_train.columns)

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', reg),
])

In [29]:
# Fit on the full training set, then score the held-out test rows
pipe.fit(X_train, y_train)
df_test["y_pred"] = pipe.predict(X_test)

In [30]:
# Pearson correlation between observed and predicted log-duration on the test set
test_r = pearsonr(df_test.log_duration, df_test.y_pred)[0]
print("Score: %0.2f" % test_r)

Score: 0.53


In [None]:
# Age cut-off used by the next two cells to split the test cohort
age = 50

In [None]:
# Correlation restricted to test rows at or above the age cut-off
test_older = df_test[df_test.age >= age]
pearsonr(test_older.log_duration, test_older.y_pred)[0]

In [None]:
# Correlation restricted to test rows below the age cut-off
test_younger = df_test[df_test.age < age]
pearsonr(test_younger.log_duration, test_younger.y_pred)[0]

In [None]:
# Sweep the age cut-off on the test set and record the correlation on each side
age = [20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95]
above, below = [], []
for cutoff in age:
    older = df_test[df_test.age >= cutoff]
    younger = df_test[df_test.age < cutoff]
    above.append(pearsonr(older.log_duration, older.y_pred)[0])
    below.append(pearsonr(younger.log_duration, younger.y_pred)[0])

sns.lineplot(x=age, y=above, label="above");
sns.lineplot(x=age, y=below, label="below");
plt.xlabel("Age");
plt.ylabel("Pearson's correlation coefficient");

In [None]:
# NOTE: this changes the global default figure size, so it also affects every
# figure drawn by later cells.
plt.rcParams['figure.figsize'] = (6, 6)
# Dashed identity line plus observed-vs-predicted scatter for the test set
sns.lineplot(x=[3,8], y=[3,8], linestyle="--");
sns.scatterplot(y="log_duration", x="y_pred", data=df_test, alpha=0.3);
plt.ylabel("Observed log(duration)");
plt.xlabel("Predicted log(duration)");
plt.xlim([3,6.8]);
plt.ylim([3,8]);
# plt.savefig('../results/reg train on mimic test on mimic.png', bbox_inches='tight', dpi=300)

# Validate on eICU

In [31]:
# Read the eICU validation data and discard the starttime/endtime columns
df_eicu = (
    pd.read_csv("../data/eicu-ft58.csv")
    .drop(columns=["starttime", "endtime"])
)

print(df_eicu.shape)
df_eicu.head()

(21185, 62)


Unnamed: 0,stay_id,duration,over72h,ph_max,spo2_min,heart_rate_min,heart_rate_max,resp_rate_min,resp_rate_max,temp_min,temp_max,glucose_max,glucose_min,co2_total_max,co2_total_min,mbp_max,mbp_ni_min,apsiii,peep_max,peep_min,co2_total_avg,fio2_min,plateau_pressure_max,height,peep_avg,temp_avg,hr_score,mbp_score,temp_score,resp_rate_score,pao2_aado2_score,hematocrit_score,wbc_score,creatinine_score,uo_score,bun_score,sodium_score,albumin_score,bilirubin_score,glucose_score,acidbase_score,gcs_score,SOFA,respiration,coagulation,liver,cardiovascular,cns,renal,hospitalid,numbedscategory,region,teachingstatus,lactate_max,lactate_min,lactate_avg,resp_rate_avg,plateau_pressure_avg,plateau_pressure_min,age,hours_in_hosp_before_intubation,log_duration
0,2127890,44.217,0,,89.0,107.0,196.0,16.0,49.0,35.9,37.4,187.0,80.0,,,93.0,52.0,96.0,8.0,5.0,,70.0,21.0,162.6,6.364,36.569,17.0,15.0,0.0,11.0,15.0,3.0,19.0,7.0,15.0,11.0,2.0,0.0,6.0,5.0,12.0,0.0,12,3,4,2,1,0,2,307,L,South,False,9.1,4.5,6.875,24.6,20.2,19.0,55,38,3.789109
1,2519150,68.0,0,,88.0,96.0,116.0,13.0,28.0,37.4,38.4,194.0,106.0,,,90.0,60.0,39.0,10.0,5.0,,,29.0,165.1,7.167,37.843,5.0,7.0,0.0,6.0,0.0,3.0,0.0,4.0,15.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,7,3,1,0,1,1,1,338,XL,Midwest,False,1.3,1.3,1.3,18.6,20.5,12.0,29,4,4.219508
2,919705,39.25,0,7.51,85.0,58.0,73.0,15.0,20.0,35.8,36.4,288.0,219.0,,,129.0,61.0,35.0,15.0,5.0,,,38.0,166.0,6.8,36.04,0.0,6.0,8.0,6.0,15.0,3.0,0.0,0.0,15.0,7.0,0.0,11.0,0.0,5.0,12.0,0.0,4,1,0,0,1,1,1,146,L,West,False,,,,16.359,26.857,16.0,61,51,3.669951
3,1554681,28.0,0,7.4,91.0,87.0,113.0,0.0,23.0,36.8,37.4,,,,,88.0,56.0,71.0,,,,30.0,18.0,167.6,,37.117,5.0,15.0,0.0,0.0,0.0,3.0,5.0,0.0,15.0,0.0,0.0,11.0,6.0,0.0,0.0,0.0,6,3,0,2,1,0,0,248,M,Midwest,False,,,,16.136,18.0,18.0,66,36,3.332205
4,260998,30.8,0,7.25,75.0,109.0,121.0,28.0,35.0,36.3,37.3,278.0,92.0,,,82.5,43.0,140.0,8.0,8.0,,75.0,,172.72,8.0,36.825,7.0,15.0,0.0,18.0,0.0,3.0,19.0,7.0,15.0,7.0,2.0,11.0,5.0,5.0,4.0,24.0,16,1,3,2,4,2,4,79,XL,Midwest,False,7.1,5.5,6.133,31.333,,,64,1,3.427515


In [32]:
# df_eicu, _ = utils.cluster_by_severity(df_eicu, pca)

In [33]:
# Same feature/label extraction as for the MIMIC frames, applied to eICU
X_eicu, y_eicu = utils.get_X_and_y(df_eicu, features=features, label=label)
print(X_eicu.shape, y_eicu.shape)

(21185, 13) (21185,)


In [17]:
# NOTE(review): this rebinds `pipe` to a brand-new, unfitted pipeline, replacing
# the one fitted on the training data in the test-set section. It has to be
# fitted again before it can predict.
preprocessor = utils.define_preprocessor(X_train.columns)
reg = GradientBoostingRegressor()

pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('regressor', reg)])

In [34]:
# The pipeline is re-created (unfitted) in the previous cell, so it must be fitted
# on the MIMIC training data here. With the fit commented out this cell only
# worked when cells were executed out of order and fails on Restart & Run All.
pipe.fit(X_train, y_train)
df_eicu["y_pred"] = pipe.predict(X_eicu)

In [35]:
# Pearson correlation between observed and predicted log-duration on eICU
eicu_r = pearsonr(df_eicu.log_duration, df_eicu.y_pred)[0]
print("Score: %0.2f" % eicu_r)

Score: 0.15


In [None]:
# Dashed identity line plus observed-vs-predicted scatter for eICU
sns.lineplot(x=[3,8], y=[3,8], linestyle="--");
sns.scatterplot(y="log_duration", x="y_pred", data=df_eicu, alpha=0.3);
plt.ylabel("Observed log(duration)");
plt.xlabel("Predicted log(duration)");
plt.xlim([3,6.8]);
plt.ylim([3,8]);
# plt.savefig('../results/reg eICU.jpeg', bbox_inches='tight', dpi=300)

In [None]:
# Spread of the observed eICU target, for comparison with the predictions in the next cell
y_eicu.mean(), y_eicu.std(), y_eicu.min(), y_eicu.max()

In [None]:
# Spread of the predicted values on eICU
df_eicu.y_pred.mean(), df_eicu.y_pred.std(), df_eicu.y_pred.min(), df_eicu.y_pred.max()

In [None]:
# df_eicu only has a `cluster` column if the (currently commented-out)
# clustering step was run; passing hue="cluster" without it raises. Colour by
# cluster when the column exists, otherwise draw a plain scatter.
has_cluster = "cluster" in df_eicu.columns
palette = sns.color_palette("Set2", df_eicu.cluster.nunique()) if has_cluster else None

# Dashed identity line: points on it are predicted exactly.
sns.lineplot(x=[3,8], y=[3,8], linestyle="--");
sns.scatterplot(y="log_duration", x="y_pred", data=df_eicu,
                hue="cluster" if has_cluster else None,
                palette=palette, alpha=0.5);
if has_cluster:
    # A legend is only meaningful (and only has entries) when hue is used.
    plt.legend(fontsize=14, title="Cluster", title_fontsize=16);


plt.xlim([3,6.8]);
plt.ylim([3,8]);
plt.ylabel("Observed log(duration)");
plt.xlabel("Predicted log(duration)");
# plt.savefig('../results/reg clusters eICU.jpeg', bbox_inches='tight', dpi=300)

In [None]:
# IDs of the five hospitals with the most rows, most frequent first
hospid = df_eicu.hospitalid.value_counts().head(5).index

In [None]:
# Correlation within the hospital contributing the most rows
hosp_rows = df_eicu[df_eicu.hospitalid==hospid[0]]
pearsonr(hosp_rows.log_duration, hosp_rows.y_pred)[0]

In [None]:
# Correlation within the second-largest hospital
hosp_rows = df_eicu[df_eicu.hospitalid==hospid[1]]
pearsonr(hosp_rows.log_duration, hosp_rows.y_pred)[0]

In [None]:
# Correlation within the third-largest hospital
hosp_rows = df_eicu[df_eicu.hospitalid==hospid[2]]
pearsonr(hosp_rows.log_duration, hosp_rows.y_pred)[0]

In [None]:
def pearson_score(x):
    """Return Pearson's r between observed and predicted log-duration.

    Parameters
    ----------
    x : DataFrame-like
        Rows exposing ``log_duration`` and ``y_pred`` columns, e.g. one group
        handed over by ``groupby(...).apply``.

    Returns
    -------
    float
        The correlation coefficient, or NaN when it cannot be computed
        (fewer than two rows, or ``pearsonr`` rejecting the input).
    """
    # The coefficient is undefined for fewer than two points. Returning NaN
    # keeps the per-group result numeric instead of mixing in None.
    if len(x) < 2:
        return float("nan")
    try:
        return pearsonr(x.log_duration, x.y_pred)[0]
    except ValueError:
        # Only the "cannot compute" failure is mapped to NaN; anything else
        # (e.g. a missing column) now propagates instead of being hidden.
        return float("nan")

In [None]:
# One correlation per hospital; NOTE: this reuses (and overwrites) the name `scores`
# assigned earlier in the model-development cell.
scores = df_eicu.groupby("hospitalid").apply(pearson_score)
scores

In [None]:
# Summary of the per-hospital correlations
scores.mean(), scores.std(), scores.max(), scores.min()

In [None]:
# Distribution of the per-hospital correlations
scores.hist(bins=25);

In [None]:
# Number of eICU rows contributed by each hospital
n_records = df_eicu.groupby("hospitalid").size()
n_records

In [None]:
# Side-by-side table, indexed by hospital: correlation and row count
df_hosp = pd.concat(
    [scores.rename("score"), n_records.rename("n_records")],
    axis=1,
)
df_hosp

In [None]:
# 1 for hospitals with strictly more than 25 rows, 0 otherwise
df_hosp["over25records"] = np.where(df_hosp.n_records > 25, 1, 0)

In [None]:
# Mean and spread of the correlation among hospitals flagged as having > 25 rows
large_hosp_scores = df_hosp.loc[df_hosp.over25records==1, "score"]
large_hosp_scores.mean(), large_hosp_scores.std()

In [None]:
# NOTE: this changes the global default figure size for every later figure too.
plt.rcParams['figure.figsize'] = (6, 6)

# Per-hospital correlation against hospital size, coloured by the > 25 rows flag
sns.scatterplot(x="n_records", y="score", data=df_hosp, hue="over25records", s=75);

plt.xlabel("Number of observations from the same hospital");
# Axis label spelling corrected ("coeffitient" -> "coefficient"); it ends up in the saved figure.
plt.ylabel("Pearson's correlation coefficient");
plt.legend(title="> 25 records");
plt.savefig('../results/reg hospitals eICU.png', bbox_inches='tight', dpi=300)

In [None]:
# Correlation within hospitals whose numbedscategory is "XL"
beds_xl = df_eicu[df_eicu.numbedscategory=="XL"]
pearsonr(beds_xl.log_duration, beds_xl.y_pred)[0]

In [None]:
# Correlation within hospitals whose numbedscategory is "L"
beds_l = df_eicu[df_eicu.numbedscategory=="L"]
pearsonr(beds_l.log_duration, beds_l.y_pred)[0]

In [None]:
# Correlation within hospitals whose numbedscategory is "M"
beds_m = df_eicu[df_eicu.numbedscategory=="M"]
pearsonr(beds_m.log_duration, beds_m.y_pred)[0]

In [None]:
# Correlation within hospitals whose numbedscategory is "S"
beds_s = df_eicu[df_eicu.numbedscategory=="S"]
pearsonr(beds_s.log_duration, beds_s.y_pred)[0]