# Defining the study cohort

In [1]:
import numpy as np
import pandas as pd
import psycopg2

import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.style
matplotlib.style.use('ggplot')
%matplotlib inline


In [2]:
# Settings used when opening the PostgreSQL connection
dbname = 'mimic'
sqluser = 'mimic'
schema_name = 'mimiciii'

# no cursor is open yet; the connection cell tests this before closing anything
cur = None

In [50]:
# Close any cursor/connection left open by a previous run of this cell
if cur:
    cur.close()
    con.close()

# Open a fresh connection and cursor, then point the session at the schema
con = psycopg2.connect(user=sqluser, dbname=dbname)
cur = con.cursor()
cur.execute('SET search_path to ' + schema_name)

Inclusion/exclusion criteria: 

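+ Age >= 18 (open question: measured at time of echo? In the table below, `age` appears to be measured at hospital admission, since its time-of-day component matches `admittime`.)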
+ Patient can only be included once: 1 ICU stay, 1 hospitalization
+ Time filter

In [51]:
# Pull every row of echo_icustay into a DataFrame and preview the first rows
query = """
SELECT * FROM echo_icustay;
"""
icu = pd.read_sql_query(sql=query, con=con)
icu.head()

Unnamed: 0,row_id,hadm_id,charttime,icustay_id,los_icu,icustay_seq,first_icu_stay,outtime,gender,subject_id,...,admittime,dischtime,ethnicity,age,diagnosis,time_to_echo,max_icustay_seq,time_filter,single_stay_filter,age_filter
0,76338,100003,2150-04-18 12:36:00,209281,2.0,1,Y,2150-04-19 14:12:52,M,54610,...,2150-04-17 15:34:00,2150-04-21 17:30:00,WHITE,21864 days 15:34:00,UPPER GI BLEED,0 days 21:02:00,1,True,True,True
1,68597,100006,2108-04-10 10:01:00,291788,5.0,1,Y,2108-04-11 15:18:03,F,9895,...,2108-04-06 15:49:00,2108-04-18 17:18:00,BLACK/AFRICAN AMERICAN,17850 days 15:49:00,COPD FLARE,3 days 18:12:00,1,False,True,True
2,77734,100009,2162-05-17 14:55:00,253656,2.0,1,Y,2162-05-19 22:05:14,M,533,...,2162-05-16 15:56:00,2162-05-21 13:37:00,WHITE,22187 days 15:56:00,CORONARY ARTERY DISEASE,0 days 22:59:00,1,True,True,True
3,76985,100012,2177-03-12 14:54:00,239289,4.0,1,Y,2177-03-18 00:42:15,M,60039,...,2177-03-12 11:48:00,2177-03-22 14:30:00,WHITE,24711 days 11:48:00,CORONARY ARTERY DISEASE,0 days 03:06:00,1,True,True,True
4,76984,100012,2177-03-14 12:46:00,239289,4.0,1,Y,2177-03-18 00:42:15,M,60039,...,2177-03-12 11:48:00,2177-03-22 14:30:00,WHITE,24711 days 11:48:00,CORONARY ARTERY DISEASE,2 days 00:58:00,1,False,True,True


Number of echos dropped by keeping only patients with a single ICU stay (defined across hospital admission IDs):

In [52]:
# Echos kept (True) vs. dropped (False) by the single-stay filter
icu.loc[:, 'single_stay_filter'].value_counts()

True     25867
False    13184
Name: single_stay_filter, dtype: int64

Number of echos dropped by keeping only echos within the -8 h to +48 h window (`time_filter`), among the echos that already passed the single-stay filter:

In [53]:
# Among rows passing the single-stay filter: echos kept (True) vs. dropped
# (False) by the time filter
icu[icu['single_stay_filter']]['time_filter'].value_counts()

True     14369
False    11498
Name: time_filter, dtype: int64

Number of echos per subject/icustay/hospital admission (which should be equivalent at this point): 

In [54]:
# Distribution of the number of echos per hospital admission, restricted to
# rows that pass the single-stay filter
(
    icu.loc[icu['single_stay_filter'], 'hadm_id']
    .value_counts()   # echos per admission
    .value_counts()   # admissions per echo count
    .sort_index()     # order by echo count
)

1     13717
2      3416
3       832
4       304
5       148
6        65
7        29
8        15
9        11
10        3
11        1
13        1
Name: hadm_id, dtype: int64

In [55]:
# Same distribution as above, now requiring both the single-stay filter and
# the time filter
(
    icu.loc[icu['single_stay_filter'] & icu['time_filter'], 'hadm_id']
    .value_counts()   # echos per admission
    .value_counts()   # admissions per echo count
    .sort_index()     # order by echo count
)

1    11174
2     1097
3      212
4       69
5       14
6        2
7        1
Name: hadm_id, dtype: int64

In [56]:
# Inspect every echo of one hospital admission, ordered by time to echo
icu[icu['hadm_id'] == 139187].sort_values(by='time_to_echo')

Unnamed: 0,row_id,hadm_id,charttime,icustay_id,los_icu,icustay_seq,first_icu_stay,outtime,gender,subject_id,...,admittime,dischtime,ethnicity,age,diagnosis,time_to_echo,max_icustay_seq,time_filter,single_stay_filter,age_filter
15417,74014,139187,2151-10-30 23:10:00,269991,4.0,1,Y,2151-11-04 00:12:15,F,2762,...,2151-10-31 01:31:00,2151-11-06 20:08:00,WHITE,32330 days 01:31:00,CARDIAC TAMPONADE,-1 days +21:39:00,1,True,True,True
15420,74054,139187,2151-10-31 01:59:00,269991,4.0,1,Y,2151-11-04 00:12:15,F,2762,...,2151-10-31 01:31:00,2151-11-06 20:08:00,WHITE,32330 days 01:31:00,CARDIAC TAMPONADE,0 days 00:28:00,1,True,True,True
15421,74031,139187,2151-10-31 13:04:00,269991,4.0,1,Y,2151-11-04 00:12:15,F,2762,...,2151-10-31 01:31:00,2151-11-06 20:08:00,WHITE,32330 days 01:31:00,CARDIAC TAMPONADE,0 days 11:33:00,1,True,True,True
15418,74030,139187,2151-11-01 01:40:00,269991,4.0,1,Y,2151-11-04 00:12:15,F,2762,...,2151-10-31 01:31:00,2151-11-06 20:08:00,WHITE,32330 days 01:31:00,CARDIAC TAMPONADE,1 days 00:09:00,1,True,True,True
15419,74015,139187,2151-11-01 05:50:00,269991,4.0,1,Y,2151-11-04 00:12:15,F,2762,...,2151-10-31 01:31:00,2151-11-06 20:08:00,WHITE,32330 days 01:31:00,CARDIAC TAMPONADE,1 days 04:19:00,1,True,True,True
15423,74013,139187,2151-11-01 09:38:00,269991,4.0,1,Y,2151-11-04 00:12:15,F,2762,...,2151-10-31 01:31:00,2151-11-06 20:08:00,WHITE,32330 days 01:31:00,CARDIAC TAMPONADE,1 days 08:07:00,1,True,True,True
15425,73887,139187,2151-11-01 14:04:00,269991,4.0,1,Y,2151-11-04 00:12:15,F,2762,...,2151-10-31 01:31:00,2151-11-06 20:08:00,WHITE,32330 days 01:31:00,CARDIAC TAMPONADE,1 days 12:33:00,1,True,True,True
15424,73986,139187,2151-11-02 09:03:00,269991,4.0,1,Y,2151-11-04 00:12:15,F,2762,...,2151-10-31 01:31:00,2151-11-06 20:08:00,WHITE,32330 days 01:31:00,CARDIAC TAMPONADE,2 days 07:32:00,1,False,True,True
15422,73985,139187,2151-11-03 11:06:00,269991,4.0,1,Y,2151-11-04 00:12:15,F,2762,...,2151-10-31 01:31:00,2151-11-06 20:08:00,WHITE,32330 days 01:31:00,CARDIAC TAMPONADE,3 days 09:35:00,1,False,True,True


In [57]:
# Number of distinct hospital admissions left after both filters.
# Series.nunique() is the pandas built-in for a distinct count and ignores
# missing values, which a Python set could count more than once.
icu.loc[icu['single_stay_filter'] & icu['time_filter'], 'hadm_id'].nunique()

12569

In [58]:
# Number of distinct ICU stays left after both filters.
# Series.nunique() is the pandas built-in for a distinct count and ignores
# missing values, which a Python set could count more than once.
icu.loc[icu['single_stay_filter'] & icu['time_filter'], 'icustay_id'].nunique()

12569

In [59]:
# Number of distinct subjects left after both filters.
# Series.nunique() is the pandas built-in for a distinct count and ignores
# missing values, which a Python set could count more than once.
icu.loc[icu['single_stay_filter'] & icu['time_filter'], 'subject_id'].nunique()

12569

We can either select the first echo, or select icustays where there was only 1 echo. The cell below takes the second option and counts admissions with exactly one echo:

In [60]:
# Echos that pass both the single-stay filter and the time filter
icu_ = icu[icu['single_stay_filter'] & icu['time_filter']]
# Count admissions with exactly one such echo; keep=False flags every row of
# a repeated hadm_id, so negating it leaves only the non-repeated ones
len(icu_[~icu_['hadm_id'].duplicated(keep=False)])

11174

And finally filter for age: 

In [62]:
# Age-filter outcome among admissions with exactly one remaining echo;
# keep=False discards every row whose hadm_id occurs more than once
icu_.drop_duplicates(subset='hadm_id', keep=False)['age_filter'].value_counts()

True     10984
False      190
Name: age_filter, dtype: int64

Leaving us with 10,984 echos (~11,000).