In [1]:
import numpy as np
import pandas as pd
import psycopg2

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook', font_scale = 1.5)

import matplotlib.style
matplotlib.style.use('ggplot')
%matplotlib inline

In [2]:
# create a database connection
sqluser = 'mimic'
dbname = 'mimic'
schema_name = 'mimiciii'

cur = None

In [3]:
if cur: 
    cur.close()
    con.close()

con = psycopg2.connect(dbname = dbname, user = sqluser)
cur = con.cursor()
cur.execute('SET search_path to ' + schema_name)

Rewrite labs to get closest lab, if more than one lab, average. 

In [None]:
query = \
"""
SELECT * FROM echo_features_labs; 
"""
all_labs = pd.read_sql_query(query, con)
all_labs.head()

In [None]:
# check that only 1 observed value per (row_id, label, abs_dt)
all_labs.groupby(['row_id', 'label', 'abs_dt'])['valuenum'].count().value_counts()

In [None]:
query = \
"""
SELECT * FROM echo_features_labs; 
"""
squashed_labs = pd.read_sql_query(query, con)
squashed_labs.head()

In [None]:
len(squashed_labs['row_id'].unique())

Some subjects have no data:

In [None]:
squashed_labs.loc[squashed_labs['label'].isnull()].shape

In [None]:
squashed_labs.loc[squashed_labs['label'].isnull()].head()

In [None]:
# check that only 1 observed value per (row_id, label, abs_dt)
squashed_labs.groupby(['row_id', 'label', 'abs_dt'])['valuenum'].count().value_counts()

In [None]:
closest_labs = squashed_labs.sort_values('abs_dt').groupby(['row_id', 'label']).first()['valuenum']
closest_labs.unstack('label').head()

In [None]:
len(closest_labs.dropna().reset_index()['row_id'].unique())

In [None]:
query = \
"""
SELECT * FROM echo_features_labs; 
"""
summary_labs = pd.read_sql_query(query, con)
summary_labs.loc[summary_labs['row_id'] == 59653]

In [None]:
all_labs.loc[(all_labs['row_id'] == 59653) & (all_labs['label'] == 'WBC')].sort_values('abs_dt')

Implement filters. 

In [None]:
query = \
"""
SELECT * FROM echo_filtered; 
"""
filtered = pd.read_sql_query(query, con)
filtered.head()

In [None]:
# hospital admission ids match
set(most_recent_hadm['hadm_id'].unique()) - set(uniq['hadm_id'].unique())

In [None]:
most_recent_hadm_ids = filtered.loc[filtered.groupby('subject_id').apply(lambda x: x['admittime'].idxmax()), 'hadm_id'].unique()
most_recent_hadm = filtered.loc[filtered['hadm_id'].apply(lambda i: i in most_recent_hadm_ids)]

first_echo = most_recent_hadm.loc[most_recent_hadm.groupby('subject_id').apply(lambda x: x['ed_charttime'].idxmin())]

check_ids = set(first_echo['row_id'].unique())

first_echo.head()

In [None]:
for subject_id, group in most_recent_hadm.sort_values('ed_charttime').groupby('subject_id'):
    if subject_id == 16751:
        break
group

In [None]:
query = \
"""
SELECT * FROM echo_filtered; 
"""
uniq = pd.read_sql_query(query, con)
final_ids = set(uniq['row_id'])
print(uniq.shape)
uniq.head()

In [None]:
list(check_ids - final_ids)[:5]

In [None]:
filtered.loc[filtered['row_id'] == 64129]

In [None]:
filtered.loc[filtered['subject_id'] == 16751, ['row_id', 'subject_id', 'admittime', 'ed_charttime']]

In [None]:
filtered.loc[filtered['subject_id'] == 16751, 'ed_charttime'].idxmin()

In [None]:
uniq.loc[uniq['subject_id'] == 16751, ['row_id', 'subject_id', 'admittime', 'ed_charttime']]

Based on the master table, make some plots

In [4]:
query = \
"""
SELECT * FROM echo_filtered; 
"""
filtered = pd.read_sql_query(query, con)
filtered['mortality_30d'] = filtered['ea_days_after_discharge_death'] <= 30

filtered.head()

Unnamed: 0,row_id,icustay_id,hadm_id,subject_id,age_at_intime,gender,height,weight,bmi,ethnicity,...,fl_day1_balance_ml,fl_day2_input_ml,fl_day2_output_ml,fl_day2_balance_ml,fl_day3_input_ml,fl_day3_output_ml,fl_day3_balance_ml,most_recent_hadm,first_echo,mortality_30d
0,67805,280987,182104,36,25443 days 10:29:14,M,,,,WHITE,...,1116.0,508.0,1201.0,-693.0,,,,1,1,False
1,67118,216609,116009,62,25097 days 00:20:44,M,68.0,150.0,22.804931,PATIENT DECLINED TO ANSWER,...,2083.0,505.0,1525.0,-1020.0,335.5,730.0,-394.5,1,1,False
2,80320,281607,112086,75,28018 days 10:35:50,F,,,,WHITE,...,1630.0,12.0,1475.0,-1463.0,,,,1,1,False
3,80083,206222,190243,86,16521 days 11:51:54,F,,,,WHITE,...,1813.0,1006.0,1430.0,-424.0,,,,1,1,False
4,82745,232514,114585,115,27643 days 13:22:28,F,63.0,207.0,36.664399,WHITE,...,2255.0,1105.5,1466.0,-360.5,,,,1,1,False


Does LV systolic function impact 30-day mortality?

In [None]:
sns.barplot(data = filtered[['ea_lv_systolic', 'mortality_30d']], x = 'ea_lv_systolic', y = 'mortality_30d')
plt.ylabel('30-day mortality')

In [None]:
to_plot = filtered.groupby(['ea_lv_systolic', 'mortality_30d'])['row_id'].count().unstack()
to_plot.div(to_plot.sum(axis = 1), axis = 'index')

Test with Fisher. 

In [None]:
to_plot = filtered.groupby(['ea_lv_systolic', 'mortality_30d'])['row_id'].count().unstack()
import scipy as sp
import scipy.stats
to_plot.head()

In [None]:
sp.stats.fisher_exact(to_plot.loc[[3, 0],:])

In [None]:
(filtered['age_at_intime'] / np.timedelta64('1', 'Y')).hist()

Does age affect 30-day mortality? 

In [None]:
filtered['ex_congestive_heart_failure'].value_counts()

In [None]:
filtered.to_csv("../../data/generated/master_161208.csv")
filtered.head()

Does variation in fluid conditioned on LV systolic function impact 30-day mortality?

In [5]:
d = filtered[['ea_lv_systolic', 'fl_day1_balance_ml', 'mortality_30d']].copy()
d = d.loc[d['ea_lv_systolic'] != -50].dropna()
d['ea_lv_systolic'] = d['ea_lv_systolic'] + 1

In [30]:
import sklearn as sk
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.linear_model
import sklearn.metrics

encoder = sk.preprocessing.OneHotEncoder(sparse = False)

X = encoder.fit_transform(d[['ea_lv_systolic']])
X = np.hstack([X, d[['fl_day1_balance_ml']]])
y = d['mortality_30d']

poly = sk.preprocessing.PolynomialFeatures(2)
X = poly.fit_transform(X)

X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y)

model = sk.linear_model.LogisticRegressionCV()
model.fit(X_train, y_train)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [31]:
print(sk.metrics.classification_report(y_test, model.predict(X_test)))

             precision    recall  f1-score   support

      False       0.92      0.77      0.84       383
       True       0.19      0.43      0.26        46

avg / total       0.84      0.73      0.78       429



In [32]:
pd.Series(model.coef_[0])

0    -1.118906e-06
1    -7.297807e-09
2    -8.804808e-07
3    -1.638701e-07
4    -5.514856e-08
5    -1.210851e-08
6    -2.773400e-04
7    -7.297807e-09
8     0.000000e+00
9     0.000000e+00
10    0.000000e+00
11    0.000000e+00
12    1.117224e-04
13   -8.804808e-07
14    0.000000e+00
15    0.000000e+00
16    0.000000e+00
17   -1.889621e-04
18   -1.638701e-07
19    0.000000e+00
20    0.000000e+00
21   -1.579813e-04
22   -5.514856e-08
23    0.000000e+00
24   -8.134915e-05
25   -1.210851e-08
26    3.923015e-05
27    1.038453e-08
dtype: float64