In [33]:
from __future__ import print_function
import numpy as np
import pandas as pd

import psycopg2

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook', font_scale=1.2)

import matplotlib.style
matplotlib.style.use('ggplot')
%matplotlib inline

from IPython.display import display

In [34]:
import pymc3 as pm
import theano
import theano.tensor as T
from scipy.stats.stats import pearsonr

import sklearn.model_selection
import statsmodels.api as sm

In [35]:
ef =pd.read_pickle('icu_features_170527.pkl')
ef.head()

Unnamed: 0,icustay_id,hadm_id,subject_id,age,gender,height,weight,ethnicity,insurance,filter_vaso,...,pc_bronch,pc_cath,pc_echo,pc_pressor,pc_rhc,pc_thora,pc_vent,passed_filters,use_record,bmi
0,200001,152234,55973,22290 days 19:06:12,F,167.851667,27.669135,ASIAN - ASIAN INDIAN,Medicare,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,9.820741
1,200003,163557,27513,17625 days 19:50:04,M,177.8,78.224998,WHITE,Private,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,24.744692
2,200006,189514,10950,19736 days 11:28:14,M,165.1,82.400002,OTHER,Medicaid,False,...,,,,,,,,,,30.229647
3,200007,129310,20707,15818 days 10:03:37,M,177.8,126.0,WHITE,Private,False,...,,,,,,,,,,39.857223
4,200009,129607,29904,17353 days 10:34:32,F,160.02,85.833331,WHITE,Private,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,33.520264


In [36]:
len(ef)

61532

In [37]:
use_record = (ef['use_record'] == 1)
has_echo = ~ef['ea_key'].isnull()
#F = use_record & has_echo
F = has_echo

In [38]:
ef_ = ef.loc[F]
len(ef_)

19752

# Preprocessing
Add MDRD to table

In [39]:
def mdrd(data):
    creat = data.lab_creatinine
    age = data.age/np.timedelta64('1', 'Y')
    age.loc[age<0] = age.max()
    gender = data.gender.apply(lambda x: 0.742 if x == 'F' else 1)
    ethnicity = data.ethnicity.apply(lambda x: 1.212 if 'AFRICAN AMERICAN' in x else 1)
    return 175*creat**(-1.154) * age**(-0.203) * gender * ethnicity

In [40]:
value = mdrd(ef_)
ef_['mdrd'] = value
discrete = pd.cut(value, [0, 15, 30, 44, 60, 90, 1000000], labels=[5, 4, 3, 2, 1, 0], include_lowest=True)
ef_['mdrd_discrete'] = discrete
discrete.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0    5335
1    5248
2    2763
3    2139
4    2081
5    1361
dtype: int64

In [41]:
def normalize(x):
    x = x.copy()
    I = ~np.isnan(x)
    x[I] = (x[I] - x[I].mean())/x[I].std()
    return x

# Replicate the findings that Hyperdnamic LV means worse mortality
Panonessa 2015:
Logistic regression ajusted for age, sex, SOFA, Elixhauser score for comorbidities, vasopressor use, mechanical ventilation use
(OR ratio 1.38, 95% confidence interval 1.039-1.842, p = 0.02)

In [50]:
# build the features table
X = pd.DataFrame()

X['subject_id'] = ef_.subject_id

age = ef_.age/np.timedelta64('1', 'Y') # time in years
age_masked = age < 0 # if age is less than 0, then the person is above 89 (90?)
age = normalize(age)
age[age_masked] = 0
X['age'] = age
#X['age_over_90'] = age_masked.astype(float)

gender = ef_.gender
gender = (gender == 'M').astype(float)
X['male_gender'] = gender

apsiii = ef_.apsiii
apsiii = normalize(apsiii)
X['apsiii'] = apsiii

## on vasopressors
on_vaso = ef_.filter_vaso.astype(float)
X['on_vaso'] = on_vaso

## mechanically ventilated
mech_vent = ef_.vf_first_day_vent.astype(float)
X['mech_vent'] = mech_vent

#chf = ef.ex_congestive_heart_failure.astype(float)
#X['CHF'] = chf

no_echo = np.isnan(ef_.ea_key)
lvsys = ef_.ea_lv_systolic
lv_missing = (((np.isnan(lvsys)) | (lvsys < -2)) & ~no_echo).astype(float)
lv_hyperdynamic = (lvsys == -1).astype(float)
lv_normal = (lvsys == 0).astype(float)
lv_mild = (lvsys == 1).astype(float)
lv_depressed = ((lvsys == 2) | (lvsys == -2) | (lvsys ==3)).astype(float)
#lv_sev_depressed = (lvsys == 3).astype(float)
X['lv_missing'] = lv_missing
#X['lv_normal'] = lv_normal
#X['lv_mild'] = lv_mild
X['lv_hyper'] = lv_hyperdynamic
X['lv_depressed'] = lv_depressed
#X['lv_severe'] = lv_sev_depressed


## which ICU was treating the patient
sicu = ef_.st_sicu.astype(float)
sicu[np.isnan(sicu)] = 0
nsicu = ef_.st_nsicu.astype(float)
nsicu[np.isnan(nsicu)] = 0
micu = ef_.st_micu.astype(float)
micu[np.isnan(micu)] = 0
X['sicu'] = sicu
#X['nsicu'] = nsicu
X['micu'] = micu

## elixhauser score -- sum of all elixauser commorbidities
# Features from Echos
temp = ef_.filter(regex='ex_')
elixhauser_score = temp.sum(axis =1)
elixhauser_score = normalize(elixhauser_score)
X['elixhauser_score'] = elixhauser_score

## Outcome: mortality
y = (ef_.dod - ef_.intime) < np.timedelta64(30, 'D')

In [51]:
len(X)

19752

In [53]:
X.subject_id.nunique()

17122

In [14]:
len(y)

19752

In [15]:
X_=X[((X['sicu']==1) | (X['micu']==1)) & (X['lv_depressed'] ==0) & (X['lv_missing']==0)][['age', 'male_gender', 'apsiii', 'on_vaso', 'mech_vent', 'elixhauser_score', 'lv_hyper']]
X_ = sm.add_constant(X_)

y_ = y[X_.index]

In [16]:
table = X.groupby(['lv_hyper']).size()
table

lv_hyper
0.0    18769
1.0      983
dtype: int64

In [17]:
X_.head()

Unnamed: 0,const,age,male_gender,apsiii,on_vaso,mech_vent,elixhauser_score,lv_hyper
8,1,0.499096,1.0,0.183129,0.0,1.0,-0.822703,0.0
20,1,0.099981,1.0,0.47194,0.0,0.0,1.025203,0.0
50,1,0.324321,1.0,-0.683304,0.0,1.0,0.10125,0.0
62,1,0.30283,1.0,0.279399,0.0,1.0,-0.360726,0.0
74,1,0.327664,1.0,-1.21279,1.0,1.0,0.10125,0.0


In [18]:
len(X_)

6321

In [19]:
logit = sm.Logit(y_, X_)
result = logit.fit()

print(result.summary())

Optimization terminated successfully.
         Current function value: 0.460251
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                 6321
Model:                          Logit   Df Residuals:                     6313
Method:                           MLE   Df Model:                            7
Date:                Mon, 29 May 2017   Pseudo R-squ.:                  0.1199
Time:                        19:00:26   Log-Likelihood:                -2909.2
converged:                       True   LL-Null:                       -3305.6
                                        LLR p-value:                7.142e-167
                       coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------------
const               -2.1653      0.074    -29.377      0.000        -2.310    -2.021
age        

In [20]:
print(np.exp(result.conf_int()))

                         0         1
const             0.099289  0.132549
age               2.391284  4.546663
male_gender       0.867766  1.123887
apsiii            1.680927  1.920987
on_vaso           2.050694  2.874270
mech_vent         1.238918  1.614462
elixhauser_score  0.960298  1.096553
lv_hyper          1.175264  1.717618


In [21]:
print(np.exp(result.params))

const               0.114720
age                 3.297327
male_gender         0.987558
apsiii              1.796953
on_vaso             2.427808
mech_vent           1.414280
elixhauser_score    1.026167
lv_hyper            1.420794
dtype: float64


# Replicate finding that getting echo improves mortality


# Expand on base model of Hyperdynamic LV 
Add fluid measure, and then interaction terms, add additional echo terms.

In [22]:
ef.columns.tolist()

['icustay_id',
 'hadm_id',
 'subject_id',
 'age',
 'gender',
 'height',
 'weight',
 'ethnicity',
 'insurance',
 'filter_vaso',
 'filter_chronic_dialysis',
 'filter_angus_sepsis',
 'filter_hard_cardiogenic',
 'filter_adult',
 'filter_echo',
 'ed_chartdate',
 'ed_charttime',
 'ed_quality',
 'ed_indication',
 'ed_bsa',
 'ed_bp',
 'ed_bpsys',
 'ed_bpdias',
 'ed_hr',
 'ed_test',
 'ed_doppler',
 'ed_contrast',
 'ea_hadm_id',
 'ea_new_time',
 'ea_key',
 'ea_height',
 'ea_weight',
 'ea_sys',
 'ea_diastolic',
 'ea_hr',
 'ea_tv_pulm_htn',
 'ea_tv_regurgitation',
 'ea_tv_stenosis',
 'ea_lv_cavity',
 'ea_lv_diastolic',
 'ea_lv_systolic',
 'ea_lv_wall',
 'ea_rv_cavity',
 'ea_rv_volume_overload',
 'ea_rv_systolic',
 'ea_rv_wall',
 'ea_av_regurgitation',
 'ea_av_stenosis',
 'ea_mv_regurgitation',
 'ea_mv_stenosis',
 'ea_la_cavity',
 'ea_ra_dilated',
 'ea_ra_pressure',
 'ex_congestive_heart_failure',
 'ex_cardiac_aarrhythmias',
 'ex_valvular_disease',
 'ex_pulmonary_circulation',
 'ex_peripheral_vascu

In [24]:
echo_time = ef_['ed_charttime']-ef_['intime']
echo_time

1         0 days 16:49:56
4         0 days 21:59:28
8         1 days 17:34:12
12        0 days 03:11:48
17        0 days 02:47:48
18      -1 days +21:41:10
19        0 days 03:30:50
20        2 days 22:50:58
29      -1 days +18:13:19
44        0 days 00:12:36
45        0 days 04:16:11
46        0 days 15:43:15
47      -1 days +22:03:07
50        2 days 07:17:25
55      -1 days +23:42:00
56        0 days 01:53:32
58        0 days 17:33:50
60        1 days 03:15:11
62        2 days 12:08:32
67      -1 days +18:17:51
68        0 days 15:03:29
74        0 days 03:00:42
78        0 days 15:40:49
81        0 days 01:43:00
85        0 days 21:34:02
89        0 days 13:15:00
90        0 days 20:02:50
92        0 days 01:55:23
96        0 days 03:50:29
97        0 days 01:08:31
               ...       
61435   -1 days +23:52:07
61438     0 days 10:52:31
61446     1 days 13:16:58
61449     1 days 20:42:58
61450     2 days 04:50:20
61453     0 days 19:52:51
61456     0 days 01:18:43
61458   -1 d

In [25]:
use_record = (ef['use_record'] == 1)
has_echo = ~ef['ea_key'].isnull()
echo_time = (ef_['ed_charttime']-ef_['intime']).astype('timedelta64[h]')
F = use_record & has_echo & (echo_time > -8) & (echo_time <=24)
#F = use_record & has_echo 
ef_ = ef.loc[F]
len(ef_)

1972

### fluid features

In [26]:
## fluid features
fluid_day1 = ef_.fb_day1_balance_ml.values.copy()
f1_missing = np.isnan(fluid_day1)
fluid_day1[~f1_missing] = normalize(fluid_day1[~f1_missing])
fluid_day1[f1_missing] = 0

fluid_day2 = ef_.fb_day2_balance_ml.values.copy()
f2_missing = np.isnan(fluid_day1) | np.isnan(fluid_day2)
fluid_day2[~f2_missing] = normalize(fluid_day2[~f2_missing])
fluid_day2[f2_missing] = 0


fluid_day3 = ef_.fb_day3_balance_ml.values.copy()
f3_missing = np.isnan(fluid_day1) | np.isnan(fluid_day2) | np.isnan(fluid_day3)
fluid_day3[~f3_missing] = normalize(fluid_day3[~f3_missing])
fluid_day3[f3_missing] = 0

fluid_day1_input = ef_.fb_day1_input_ml.values.copy()
f1_input_missing = np.isnan(fluid_day1_input)
fluid_day1_input[~f1_input_missing] = normalize(fluid_day1_input[~f1_input_missing])
fluid_day1_input[f1_input_missing] = 0

fluid_day2_input = ef_.fb_day2_input_ml.values.copy()
f2_input_missing = np.isnan(fluid_day2_input)
fluid_day2_input[~f2_input_missing] = normalize(fluid_day2_input[~f2_input_missing])
fluid_day2_input[f2_input_missing] = 0

fluid_day3_input = ef_.fb_day3_input_ml.values.copy()
f3_input_missing = np.isnan(fluid_day3_input)
fluid_day3_input[~f3_input_missing] = normalize(fluid_day3_input[~f3_input_missing])
fluid_day3_input[f3_input_missing] = 0

### echo features

In [29]:
# build the features table
X = pd.DataFrame()

age = ef_.age/np.timedelta64('1', 'Y') # time in years
age_masked = age < 0 # if age is less than 0, then the person is above 89 (90?)
age = normalize(age)
age[age_masked] = 0
X['age'] = age
#X['age_over_90'] = age_masked.astype(float)

gender = ef_.gender
gender = (gender == 'M').astype(float)
X['male_gender'] = gender

apsiii = ef_.apsiii
apsiii = normalize(apsiii)
X['apsiii'] = apsiii

## on vasopressors
on_vaso = ef_.filter_vaso.astype(float)
X['on_vaso'] = on_vaso

## mechanically ventilated
mech_vent = ef_.vf_first_day_vent.astype(float)
X['mech_vent'] = mech_vent

#chf = ef_.ex_congestive_heart_failure.astype(float)
#X['chf'] = chf

no_echo = np.isnan(ef_.ea_key)
lvsys = ef_.ea_lv_systolic
lv_missing = (((np.isnan(lvsys)) | (lvsys < -2)) & ~no_echo).astype(float)
lv_hyperdynamic = (lvsys == -1).astype(float)
lv_normal = (lvsys == 0).astype(float)
lv_mild = (lvsys == 1).astype(float)
lv_depressed = ((lvsys == 2) | (lvsys == -2) | (lvsys ==3)).astype(float)
#lv_sev_depressed = (lvsys == 3).astype(float)
X['lv_missing'] = lv_missing
#X['lv_normal'] = lv_normal
X['lv_mild'] = lv_mild
X['lv_hyper'] = lv_hyperdynamic
X['lv_depressed'] = lv_depressed
#X['lv_severe'] = lv_sev_depressed


## which ICU was treating the patient
sicu = ef_.st_sicu.astype(float)
sicu[np.isnan(sicu)] = 0
nsicu = ef_.st_nsicu.astype(float)
nsicu[np.isnan(nsicu)] = 0
micu = ef_.st_micu.astype(float)
micu[np.isnan(micu)] = 0
X['sicu'] = sicu
#X['nsicu'] = nsicu
X['micu'] = micu

## elixhauser score -- sum of all elixauser commorbidities
# Features from Echos
temp = ef_.filter(regex='ex_')
elixhauser_score = temp.sum(axis =1)
elixhauser_score = normalize(elixhauser_score)
X['elixhauser_score'] = elixhauser_score

## fluids
X['fluid'] = fluid_day2_input
X['fluid_missing'] = f2_input_missing.astype(float)

## Outcome: mortality
y = (ef_.dod - ef_.intime) < np.timedelta64(30, 'D')

X_=X[((X['sicu']==1) | (X['micu']==1)) & (X['lv_depressed'] ==0) & (X['lv_missing']==0)][['age', 'male_gender', 'apsiii', 'on_vaso', 'mech_vent', 'elixhauser_score', 'lv_mild', 'fluid']]
X_['fluid*lv_mild'] = X_['fluid']*X_['lv_mild']
X_ = sm.add_constant(X_)

y_ = y[X_.index]

In [30]:
logit = sm.Logit(y_, X_)
result = logit.fit()

print(result.summary())

Optimization terminated successfully.
         Current function value: 0.518397
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                 1364
Model:                          Logit   Df Residuals:                     1354
Method:                           MLE   Df Model:                            9
Date:                Mon, 29 May 2017   Pseudo R-squ.:                  0.1348
Time:                        19:01:25   Log-Likelihood:                -707.09
converged:                       True   LL-Null:                       -817.21
                                        LLR p-value:                 1.860e-42
                       coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------------
const               -1.5079      0.157     -9.613      0.000        -1.815    -1.200
age        

In [31]:
print(np.exp(result.conf_int()))

                         0         1
const             0.162779  0.301047
age               1.039128  4.367242
male_gender       0.845201  1.417510
apsiii            1.678256  2.230056
on_vaso           2.159172  4.093459
mech_vent         0.888552  1.514641
elixhauser_score  0.948763  1.230872
lv_mild           0.324235  0.983038
fluid             0.720171  0.980434
fluid*lv_mild     0.480206  1.719404


In [32]:
print(np.exp(result.params))

const               0.221369
age                 2.130287
male_gender         1.094569
apsiii              1.934581
on_vaso             2.972958
mech_vent           1.160102
elixhauser_score    1.080651
lv_mild             0.564566
fluid               0.840286
fluid*lv_mild       0.908662
dtype: float64


In [56]:
ef[(ef['use_record'] == 1) & (ef['ea_lv_systolic'] == 1)].fb_day2_balance_ml

415              NaN
961      -212.305913
1014    -2674.000000
1717    -1967.000000
2086    -3050.000000
2487     -199.485309
2586    -1911.439695
2679    -2282.000000
3227    -1376.700000
3425    -1770.000000
3660     -280.000000
4960    -2946.500000
5105      467.500992
5844      377.500000
6050     2519.005225
7017    -2094.125000
7109             NaN
7316     4085.996042
7439    -4410.000000
7785     5334.160919
7816     -374.856551
8025     4198.000000
8093             NaN
8258     1116.096915
8885    -2756.999990
9237             NaN
9269     4617.166381
9622     -879.665917
9986     -349.000000
10247            NaN
            ...     
47963    4442.293949
48672   -4394.669599
49826    8000.000000
50123   -2041.333355
50246   -8000.000000
51870    7173.349209
52768    3377.000000
54054            NaN
54330    2414.027331
54408    -411.666672
54410   -2063.210059
54943            NaN
55050    -235.000000
55347   -1198.750000
55386   -1089.500000
57218    5185.000000
57692     561

In [59]:
conn = psycopg2.connect(host="localhost",database="mimic", user="postgres", password='postgres', port = '5432')

In [60]:
sql = """
-- first identify outpatient echos with lv_systolic: 
-- 4411 distinct subjects have Outpatient echos, 3864 distinct subjects have Outpatient echos with LV_systolic status
-- (6542 subjects had hadm_id == null, 1452 of whom had no LV systolic)
with outpatient_echos as
(select subject_id, count(*)
from echo_annotations_with_status
where status = 'Outpatient' and lv_systolic is not null
group by subject_id
)
-- next find inpatients with sepsis and lv_systolic reading
-- 9320 patients pass all filters (vaso, angus, hard_cardiogenic, chronic dialysis, adult, micu, sicu, nsicu)
-- 6913 of 9320 did not have congestive heart failure
-- 2497 patients had sepsis and lv_systolic reading
,echo_sepsis as
(select distinct(subject_id) as echo_sepsis_subject_id
from icu_features
--where use_record = 1 and ea_lv_systolic is not null and ex_congestive_heart_failure = 0)
where passed_filters = 1 and ea_lv_systolic is not null and ex_congestive_heart_failure = 0)

-- 2449 patients who have sepsis + inpatient echo + outpatient echos (either before or after)
-- 991 patients who have sepsis + inpatient echo with abnormal lv_systolic + outpatient echos (either before or after)
, interesting_patients as
(select ea.subject_id, ea.hadm_id
--from echo_annotations_with_status as ea 
from icu_features_addoutpatientstatus as ea
join outpatient_echos as out
on  ea.subject_id = out.subject_id
where ea.ea_status = 'Inpatient' and ea_lv_systolic is not null and ea_lv_systolic !=0
group by ea.subject_id, ea.hadm_id)

-- for patients with sepis + icu echo + outpatient echo, list lv_systolic
select status, ip.hadm_id, ip.subject_id, new_time, lv_systolic 
  ,case when status = 'Inpatient' then 1 else 0 end as isInpatient 
from echo_annotations_with_status as ea
join interesting_patients as ip
on ea.subject_id = ip.subject_id
where lv_systolic is not null
order by subject_id, new_time
"""
opdf = pd.read_sql_query(sql, conn)

In [66]:
len(np.unique(opdf['subject_id']))

848

In [67]:
opdf[opdf['subject_id'] == 17]

Unnamed: 0,status,hadm_id,subject_id,new_time,lv_systolic,isinpatient
0,Inpatient,161087,17,2135-04-29 10:16:00,0,1
1,Outpatient,161087,17,2135-05-05 14:00:00,0,0
2,Inpatient,161087,17,2135-05-09 13:02:00,1,1
3,Inpatient,161087,17,2135-05-09 14:41:00,0,1
4,Inpatient,161087,17,2136-04-06 08:57:00,0,1


In [69]:
opdf[opdf['subject_id'] == 249]  # not sure why there are duplicates... 2 different hadm_ids

Unnamed: 0,status,hadm_id,subject_id,new_time,lv_systolic,isinpatient
11,Inpatient,149546,249,2149-12-19 11:04:00,-2,1
12,Inpatient,116935,249,2149-12-19 11:04:00,-2,1
13,Outpatient,116935,249,2150-03-25 14:00:00,1,0
14,Outpatient,149546,249,2150-03-25 14:00:00,1,0
15,Outpatient,116935,249,2153-12-05 11:00:00,0,0
16,Outpatient,149546,249,2153-12-05 11:00:00,0,0
17,Inpatient,116935,249,2155-02-04 17:23:00,3,1
18,Inpatient,149546,249,2155-02-04 17:23:00,3,1
19,Outpatient,116935,249,2155-09-16 13:00:00,2,0
20,Outpatient,149546,249,2155-09-16 13:00:00,2,0


In [70]:
opdf[opdf['subject_id'] == 317]

Unnamed: 0,status,hadm_id,subject_id,new_time,lv_systolic,isinpatient
21,Inpatient,173307,317,2113-10-24 09:40:00,-1,1
22,Inpatient,173307,317,2113-10-24 12:00:00,-1,1
23,Inpatient,173307,317,2113-10-27 11:07:00,0,1
24,Outpatient,173307,317,2114-02-01 11:00:00,0,0


In [71]:
opdf[opdf['subject_id'] == 323]

Unnamed: 0,status,hadm_id,subject_id,new_time,lv_systolic,isinpatient
25,Inpatient,192631,323,2115-05-20 12:39:00,2,1
26,Outpatient,192631,323,2116-03-02 11:00:00,3,0
27,Outpatient,192631,323,2118-10-10 13:00:00,3,0
