In [1]:
import numpy as np
import pandas as pd
import psycopg2

import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.style
# Apply the 'ggplot' stylesheet to matplotlib figures.
matplotlib.style.use('ggplot')
%matplotlib inline

In [2]:
# Settings used to connect to the PostgreSQL database.
sqluser, dbname = 'mimic', 'mimic'
schema_name = 'mimiciii'

# No cursor exists yet; the connection cell checks this before closing anything.
cur = None

In [69]:
# (Re)connect to the database. If an earlier run of this cell left a cursor
# open, close it and its connection first so sessions are not leaked.
if cur:
    cur.close()
    con.close()

con = psycopg2.connect(dbname = dbname, user = sqluser)
cur = con.cursor()
cur.execute('SET search_path to ' + schema_name)
# psycopg2 starts a transaction implicitly on the first execute. A SET issued
# inside a transaction is undone if that transaction is rolled back (e.g. after
# a failed query), so commit now to keep the search_path for the whole session.
con.commit()

In [73]:
# Pull the whole icu_features table into a DataFrame and preview it.
query = """
select * from icu_features;
"""
res = pd.read_sql_query(sql=query, con=con)
res.head()

Unnamed: 0,icustay_id,hadm_id,subject_id,age,gender,height,weight,bmi,ethnicity,insurance,...,ea_tv_pulm_htn,ea_tv_tr,ea_lv_cavity,ea_lv_diastolic,ea_lv_systolic,ea_lv_wall,ea_rv_cavity,ea_rv_diastolic_fluid,ea_rv_systolic,ea_rv_wall
0,200001,152234,55973,22290 days 19:06:12,F,167.851667,27.669135,9.820741,ASIAN - ASIAN INDIAN,Medicare,...,,,,,,,,,,
1,200003,163557,27513,17625 days 19:50:04,M,,78.224998,,WHITE,Private,...,1.0,0.0,0.0,0.0,2.0,0.0,0.0,,0.0,
2,200006,189514,10950,19736 days 11:28:14,M,165.1,82.400002,30.229647,OTHER,Medicaid,...,,,,,,,,,,
3,200007,129310,20707,15818 days 10:03:37,M,177.8,126.0,39.857223,WHITE,Private,...,,,,,,,,,,
4,200009,129607,29904,17353 days 10:34:32,F,160.02,85.833331,33.520264,WHITE,Private,...,,0.0,,,0.0,,0.0,,0.0,


## Check lab values

In [9]:
# Per lab column: how many rows have a value (True) vs. are missing (False).
lab_columns = [name for name in res.columns if 'lab' in name]
res[lab_columns].notnull().apply(pd.Series.value_counts)

Unnamed: 0,lab_albumin,lab_bicarbonate,lab_ckmb,lab_creatinine,lab_crp,lab_egfr,lab_hematocrit,lab_inr,lab_lactate,lab_platelet,lab_ntprobnp,lab_ph,lab_tropi,lab_tropt,lab_wbc
False,47515,8265,56046,9810,61049,61532.0,5511,20476,39227,6094,60801,29346,60579,50715,6260
True,14017,53267,5486,51722,483,,56021,41056,22305,55438,731,32186,953,10817,55272


In [10]:
# Rows with (True) / without (False) a creatinine_last value.
res['creatinine_last'].notnull().value_counts()

True     37270
False    24262
Name: creatinine_last, dtype: int64

In [11]:
# Rows with (True) / without (False) a creatinine_max value.
res['creatinine_max'].notnull().value_counts()

True     37270
False    24262
Name: creatinine_max, dtype: int64

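The number of people with creatinine last/max values should always be at least as large as the number of people with first-day creatinine values, but this is not the case: only 37,270 rows have `creatinine_last`/`creatinine_max`, whereas 51,722 have `lab_creatinine`.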

In [12]:
# Read the labs table and look at the first few rows.
query = """
select * from labs;
"""
res = pd.read_sql_query(sql=query, con=con)
res.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,label,valuenum
0,3,145834,211552,2101-10-23 03:45:00,TROPI,18.6
1,3,145834,211552,2101-10-23 03:45:00,HEMATOCRIT,30.9
2,3,145834,211552,2101-10-23 03:45:00,HEMOGLOBIN,10.5
3,3,145834,211552,2101-10-23 03:45:00,INR,1.5
4,3,145834,211552,2101-10-23 03:45:00,PLATELET,133.0


Check the first-day lab and secondary-outcome scripts.

In [13]:
# Labs charted between ICU admission (intime) and 24 hours after it,
# with the window bounds returned alongside each measurement.
query = """
SELECT ls.*, ic.intime, ic.intime + interval '24' hour as endtime 
FROM labs ls
INNER JOIN icustays ic
    ON ic.icustay_id = ls.icustay_id
WHERE ls.charttime BETWEEN ic.intime and ic.intime + interval '24' hour
"""
res = pd.read_sql_query(sql=query, con=con)
res.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,label,valuenum,intime,endtime
0,3,145834,211552,2101-10-20 19:26:00,HEMATOCRIT,24.9,2101-10-20 19:10:11,2101-10-21 19:10:11
1,3,145834,211552,2101-10-20 19:26:00,HEMOGLOBIN,7.8,2101-10-20 19:10:11,2101-10-21 19:10:11
2,3,145834,211552,2101-10-20 19:26:00,INR,1.7,2101-10-20 19:10:11,2101-10-21 19:10:11
3,3,145834,211552,2101-10-20 19:26:00,PLATELET,190.0,2101-10-20 19:10:11,2101-10-21 19:10:11
4,3,145834,211552,2101-10-20 19:26:00,WBC,11.3,2101-10-20 19:10:11,2101-10-21 19:10:11


In [14]:
# Number of distinct ICU stays that have a CREATININE row in `res`.
is_creatinine = res['label'].eq('CREATININE')
res.loc[is_creatinine, 'icustay_id'].nunique(dropna=False)

51722

Secondary outcomes: we want the max lab, and we want the last lab. 

To select max lab:

In [20]:
# Maximum creatinine value per ICU stay, computed in SQL.
query = """
SELECT ls.icustay_id, ls.label
    ,MAX(valuenum) AS max_valuenum
FROM labs ls 
WHERE label = 'CREATININE'
GROUP BY ls.icustay_id, label
"""

res = pd.read_sql_query(sql=query, con=con)
res.head()

Unnamed: 0,icustay_id,label,max_valuenum
0,200001,CREATININE,4.7
1,200003,CREATININE,1.0
2,200006,CREATININE,1.1
3,200007,CREATININE,0.9
4,200009,CREATININE,0.5


In [21]:
# Number of distinct ICU stays in the result.
res['icustay_id'].nunique(dropna=False)

53775

And to select the last lab, in Python:

In [28]:
# Every creatinine measurement from the labs table.
query = """
SELECT * from labs
WHERE label = 'CREATININE'
"""

labs = pd.read_sql_query(sql=query, con=con)
labs.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,label,valuenum
0,3,145834,211552,2101-10-20 19:59:00,CREATININE,2.5
1,3,145834,211552,2101-10-20 19:26:00,CREATININE,2.4
2,3,145834,211552,2101-10-20 16:40:00,CREATININE,3.2
3,3,145834,211552,2101-10-22 04:00:00,CREATININE,1.9
4,3,145834,211552,2101-10-22 21:15:00,CREATININE,1.7


In [31]:
# Last creatinine per ICU stay: order newest-first, keep the first row of each
# stay, then show the result ordered by stay id.
(
    labs
    .sort_values('charttime', ascending=False)
    .groupby('icustay_id')
    .head(1)
    .sort_values('icustay_id')
    .head()
)

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,label,valuenum
540136,55973,152234,200001,2181-12-04 05:58:00,CREATININE,3.5
388153,27513,163557,200003,2199-08-22 03:32:00,CREATININE,0.6
162817,10950,189514,200006,2159-09-05 08:30:00,CREATININE,0.8
306418,20707,129310,200007,2109-02-20 07:05:00,CREATININE,0.8
433598,29904,129607,200009,2189-12-05 06:55:00,CREATININE,0.5


In [26]:
# Same "last creatinine per stay" selection done in SQL: DISTINCT ON keeps the
# first row per icustay_id under the ORDER BY (latest charttime first).
query = """
SELECT DISTINCT ON (icustay_id) 
    ls.icustay_id, ls.valuenum, ls.charttime
FROM labs ls
WHERE label = 'CREATININE'
ORDER BY icustay_id, charttime DESC
LIMIT 10;
"""

res = pd.read_sql_query(sql=query, con=con)
res.head()

Unnamed: 0,icustay_id,valuenum,charttime
0,200001,3.5,2181-12-04 05:58:00
1,200003,0.6,2199-08-22 03:32:00
2,200006,0.8,2159-09-05 08:30:00
3,200007,0.8,2109-02-20 07:05:00
4,200009,0.5,2189-12-05 06:55:00


Check lab values again.

In [34]:
# Re-read icu_features to re-check the lab columns.
query = """
select * from icu_features;
"""
res = pd.read_sql_query(sql=query, con=con)
res.head()

Unnamed: 0,icustay_id,hadm_id,subject_id,age,gender,height,weight,bmi,ethnicity,insurance,...,ea_tv_pulm_htn,ea_tv_tr,ea_lv_cavity,ea_lv_diastolic,ea_lv_systolic,ea_lv_wall,ea_rv_cavity,ea_rv_diastolic_fluid,ea_rv_systolic,ea_rv_wall
0,200001,152234,55973,22290 days 19:06:12,F,170.18,27.669135,9.553853,ASIAN - ASIAN INDIAN,Medicare,...,,,,,,,,,,
1,200003,163557,27513,17625 days 19:50:04,M,,78.224998,,WHITE,Private,...,1.0,0.0,0.0,0.0,2.0,0.0,0.0,,0.0,
2,200006,189514,10950,19736 days 11:28:14,M,,82.400002,,OTHER,Medicaid,...,,,,,,,,,,
3,200007,129310,20707,15818 days 10:03:37,M,177.8,126.0,39.857223,WHITE,Private,...,,,,,,,,,,
4,200009,129607,29904,17353 days 10:34:32,F,160.02,85.833331,33.520264,WHITE,Private,...,,0.0,,,0.0,,0.0,,0.0,


In [35]:
# Per lab column: how many rows have a value (True) vs. are missing (False).
lab_columns = [name for name in res.columns if 'lab' in name]
res[lab_columns].notnull().apply(pd.Series.value_counts)

Unnamed: 0,lab_albumin,lab_bicarbonate,lab_ckmb,lab_creatinine,lab_crp,lab_egfr,lab_hematocrit,lab_inr,lab_lactate,lab_platelet,lab_ntprobnp,lab_ph,lab_tropi,lab_tropt,lab_wbc
False,47515,8265,56046,9810,61049,61532.0,5511,20476,39227,6094,60801,29346,60579,50715,6260
True,14017,53267,5486,51722,483,,56021,41056,22305,55438,731,32186,953,10817,55272


In [36]:
# Rows with (True) / without (False) a creatinine_last value.
res['creatinine_last'].notnull().value_counts()

True     53775
False     7757
Name: creatinine_last, dtype: int64

In [37]:
# Rows with (True) / without (False) a creatinine_max value.
res['creatinine_max'].notnull().value_counts()

True     53775
False     7757
Name: creatinine_max, dtype: int64

Ok. That works now. Pull out prescriptions. 

In [60]:
# Distinct prescription names whose concatenated name fields match any of the
# listed terms (case-insensitive regex); saved to inotropes.csv.
query = """
select distinct ps.drug, ps.drug_name_poe, ps.drug_name_generic from prescriptions ps
where concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*epinephrine.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*dopamine.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*dopexamine.*'
"""
res = pd.read_sql_query(sql=query, con=con)
res.to_csv("inotropes.csv")

In [61]:
# Distinct prescription names whose concatenated name fields match any of the
# listed diuretic terms (case-insensitive regex); saved to diuretics.csv.
query = """
select distinct ps.drug, ps.drug_name_poe, ps.drug_name_generic from prescriptions ps
where concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*furosemide.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*lasix.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*bumetanide.*' OR
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*bumex.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*ethacrynic.*' OR
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*edecrin.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*torsemide.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*demadex.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*chlorothiazide.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*diuril.*' OR
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*metolazone.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*zaroxolyn.*'
"""
res = pd.read_sql_query(sql=query, con=con)
res.to_csv("diuretics.csv")

In [67]:
# Distinct prescription names whose concatenated name fields match any of the
# listed terms (case-insensitive regex); saved to weak_diuretics.csv and previewed.
query = """
select distinct ps.drug, ps.drug_name_poe, ps.drug_name_generic from prescriptions ps
where concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*hydrochlorothiazide.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*microzide.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*bendroflumethiazide.*' OR
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*chlorthalidone.*' OR
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*acetazolamide.*' OR
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*diamox.*' OR
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*amiloride.*' OR
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*triamterene.*' OR
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*dyrenium.*' OR
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*spironolactone.*' OR
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*aldactone.*' OR
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*eplerenone.*' OR
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*inspra.*' 
"""
res = pd.read_sql_query(sql=query, con=con)
res.to_csv("weak_diuretics.csv")
res.head()

Unnamed: 0,drug,drug_name_poe,drug_name_generic
0,AcetaZOLAMIDE,AcetaZOLAMIDE,AcetaZOLAMIDE
1,AcetaZOLAMIDE,AcetaZOLAMIDE,AcetaZOLAMIDE S.R.
2,AcetaZOLAMIDE,AcetaZOLAMIDE,AcetaZOLAMIDE Sodium
3,AcetaZOLAMIDE Sodium,,
4,AcetaZOLamide,AcetaZOLamide,AcetaZOLamide


In [70]:
# Distinct prescription names whose concatenated name fields match any of the
# listed steroid terms (case-insensitive regex); saved to steroids.csv and previewed.
query = """
select distinct ps.drug, ps.drug_name_poe, ps.drug_name_generic from prescriptions ps
where concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*hydrocortisone.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*hydrocort.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*cortef.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*solucortef.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*methylprednisolone.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*solumedrol.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*amethapred.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*depomedrol.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*prednisone.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*deltasone.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*prednisolone.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*dexamethasone.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*decadron.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*fludrocortisone.*' OR 
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*florinef.*'
"""
res = pd.read_sql_query(sql=query, con=con)
res.to_csv("steroids.csv")
res.head()

Unnamed: 0,drug,drug_name_poe,drug_name_generic
0,Anusol-HC,Anusol-HC,Hydrocortisone (Rectal) 2.5% Cream
1,Anusol-HC Suppository,Anusol-HC Suppository,Hydrocortisone Acetate Suppository
2,Cortef,Cortef,Cortef
3,Cortef,Cortef,Hydrocortisone
4,De,De,Dexamethasone Sod Phosphate


In [71]:
# Distinct prescription names that look like inhaled vasodilators; saved to
# inhaled_vasodilators.csv and previewed.
#
# Two fixes relative to the earlier version of this query:
#  * The abbreviation 'NO' is now compared against each name field separately.
#    Comparing the concatenation of the three fields to 'NO' missed rows where
#    more than one field holds 'NO' (the concatenation is then 'NONO...').
#  * The epoprostenol brand name is spelled "Veletri"; the pattern accepts both
#    that and the previous 'velitri' spelling so nothing matched before is lost.
query = """
select distinct ps.drug, ps.drug_name_poe, ps.drug_name_generic from prescriptions ps
where concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*nitric oxide.*' OR
'NO' IN (ps.drug, ps.drug_name_poe, ps.drug_name_generic) OR
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*epoprostenol.*' OR
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*flolan.*' OR
concat(ps.drug, ps.drug_name_poe, ps.drug_name_generic) ~* '.*vel[ei]tri.*'
"""
res = pd.read_sql_query(query, con)
res.to_csv("inhaled_vasodilators.csv")
res.head()

Unnamed: 0,drug,drug_name_poe,drug_name_generic
0,Epoprostenol,,
1,Epoprostenol Na,,
2,Sterile Diluent for Flolan,,


# Lab values

In [74]:
# Reload icu_features before summarising: when the notebook is run top to
# bottom, `res` still holds the prescriptions result from the previous cell,
# which has no lab columns to describe.
res = pd.read_sql_query("select * from icu_features;", con)
res[[c for c in res.columns if 'lab' in c]].describe()

Unnamed: 0,lab_albumin,lab_bicarbonate,lab_ckmb,lab_creatinine,lab_crp,lab_hematocrit,lab_inr,lab_lactate,lab_platelet,lab_ntprobnp,lab_ph,lab_tropi,lab_tropt,lab_wbc
count,14017.0,53267.0,5486.0,51722.0,483.0,56021.0,41056.0,22305.0,55438.0,731.0,32186.0,953.0,10817.0,55272.0
mean,3.035746,23.89474,6.856833,1.42674,75.27853,33.192976,1.503353,2.321195,224.720449,9287.860465,7.370334,7.932247,0.821191,12.173168
std,0.66525,4.500554,5.232243,1.541556,81.8617,7.249758,0.848735,1.904945,113.307401,12461.587347,0.073654,11.161581,2.243329,8.956761
min,1.0,5.0,0.1,0.1,0.12,8.65,0.6,0.3,6.0,11.5,6.74,0.1,0.01,0.1
25%,2.6,21.0,2.5,0.7,9.53,28.25,1.133333,1.275,151.0,1256.5,7.335,0.7,0.04,7.9
50%,3.0,24.0,6.0,0.933333,40.0,31.7,1.3,1.8,207.5,4232.0,7.376667,2.6,0.11,10.85
75%,3.5,26.0,9.9,1.4,123.3,36.325,1.5,2.666667,277.0,12027.0,7.415,10.2,0.45,14.75
max,6.3,53.0,65.8,29.966667,299.9,70.9,32.4,26.95,1714.0,68886.0,7.695,49.8,26.8,619.725


In [77]:
# Rows missing (True) / having (False) a height value.
res.height.isna().value_counts()

False    36995
True     24537
Name: height, dtype: int64

In [78]:
# Rows missing (True) / having (False) a weight value.
res.weight.isna().value_counts()

False    52770
True      8762
Name: weight, dtype: int64

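Note: max(inr) = 12. This does not match the `lab_inr` maximum of 32.4 in the summary table above — **TODO:** confirm whether 12 is meant as an upper cap for INR or refers to a different column.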