In [2]:
# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

# You only need to authenticate once per session.
auth.authenticate_user()

In [3]:
# Import libraries
from datetime import timedelta
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas_gbq

# below imports are used for pretty pandas dataframes and plots
from IPython.display import display, HTML
%matplotlib inline
plt.style.use('ggplot')

# Set up environment variables
project_id = 'genuine-box-350018'
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id
pandas_gbq.context.project = project_id

# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
    return pd.io.gbq.read_gbq(
        query,
        project_id=project_id,
        dialect="standard"
    )

if project_id == 'CHANGE-ME':
  raise ValueError('You must change project_id to your GCP project at the top of this cell.')

  # test it works
df = run_query("""
SELECT *
FROM `physionet-data.mimic_core.patients`
WHERE subject_id = 10012853
""")
assert df.shape[0] >= 1, 'unable to query MIMIC!'
display(df)


Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10012853,F,91,2175,2014 - 2016,NaT


**OUTPUTEVENTS**

Known itemids for variables from outputevents table as per https://github.com/USC-Melady/Benchmarking_DL_MIMICIII/blob/master/Codes/mimic3_mvcv/config/99plusf.csv.

In [None]:
output_known_itemids = [226571, 226573, 226575, 226576, 226579, 226627, 226583, 226580, 227701, 226582, 226588, 226589, 226599, 226626, 226633, 227510, 227511]

Finding itemids for variables listed in the PLAN, taken from Supplementary Info, (https://github.com/suvdzul/Reproduce-RNN-paper/blob/main/PLAN.md), that don't have their itemids from outputevents table in mimic_icu.

In [None]:
output_unknown =run_query(f'''
SELECT itemid, label, unitname
FROM `physionet-data.mimic_icu.d_items`
WHERE LOWER(label) LIKE '%foley%' 
OR LOWER(label) LIKE '%void%' 
OR LOWER(label) LIKE '%condom cath%' 
''')
display(output_unknown)

In [None]:
# choosing relevant unknown item ids for inputevents
output_unknown_chosen = [226559, 226560, 226561]
output_str_unknown = ','.join([str(s) for s in output_unknown_chosen])

unknown_output_chosen = run_query(f'''
SELECT itemid, label, unitname
FROM `physionet-data.mimic_icu.d_items`
WHERE itemid IN ({output_str_unknown})
''')
display(unknown_output_chosen)

Unnamed: 0,itemid,label,unitname
0,226559,Foley,mL
1,226560,Void,mL
2,226561,Condom Cath,mL


In [None]:
# combining known and unknown together
output_itemids = output_known_itemids + output_unknown_chosen
output_itemids = ','.join([str(s) for s in output_itemids])

Compiling data from outputevents while applying exclusions/inclusions:

Exclude N/A charttime or value
Only include data for first 48h after admission

In [None]:
output_data = run_query(f'''
SELECT fco.subject_id, fco.hadm_id, fco.stay_id, fco.intime
, op.itemid, it.label, op.charttime, op.value, op.valueuom
, CASE WHEN DATE_DIFF(op.charttime,  CAST(fco.intime AS date), hour) > 48 THEN 1 ELSE 0 END AS exclude_after48h
, CASE WHEN op.charttime IS NULL THEN 1 ELSE 0 END AS exclude_null_time
, CASE WHEN op.value IS NULL THEN 1 ELSE 0 END AS exclude_null_value
FROM `physionet-data.mimic_icu.outputevents` op
INNER JOIN `genuine-box-350018.rnn_dataset.final-cohort-table` fco
ON fco.stay_id = op.stay_id
LEFT JOIN `physionet-data.mimic_icu.d_items` it
ON it.itemid = op.itemid
WHERE op.itemid IN ({output_itemids}) 
''')
display(output_data)


  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes


Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,charttime,value,valueuom,exclude_after48h,exclude_null_time,exclude_null_value
0,10003700,28623837,30600691,2165-04-24 05:43:00+00:00,226559,Foley,2165-04-24 05:40:00,300.0,ml,0,0,0
1,10009035,28324362,38507547,2161-04-27 10:38:12+00:00,226588,Chest Tube #1,2161-04-27 12:00:00,5.0,ml,0,0,0
2,10009035,28324362,38507547,2161-04-27 10:38:12+00:00,226627,OR Urine,2161-04-27 12:25:00,720.0,ml,0,0,0
3,10009035,28324362,38507547,2161-04-27 10:38:12+00:00,226559,Foley,2161-04-27 13:00:00,400.0,ml,0,0,0
4,10009035,28324362,38507547,2161-04-27 10:38:12+00:00,226588,Chest Tube #1,2161-04-27 13:00:00,75.0,ml,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1290731,19999068,21606769,30143796,2161-08-24 05:26:00+00:00,226559,Foley,2161-08-30 12:00:00,180.0,ml,1,0,0
1290732,19999068,21606769,30143796,2161-08-24 05:26:00+00:00,226559,Foley,2161-08-30 14:00:00,320.0,ml,1,0,0
1290733,19999068,21606769,30143796,2161-08-24 05:26:00+00:00,226559,Foley,2161-08-30 15:00:00,120.0,ml,1,0,0
1290734,19999068,21606769,30143796,2161-08-24 05:26:00+00:00,226559,Foley,2161-08-30 18:00:00,400.0,ml,1,0,0


In [None]:
pandas_gbq.to_gbq(output_data,'rnn_dataset.output-table', project_id=project_id, if_exists='replace')

1it [00:56, 56.36s/it]


In [4]:
output_data = run_query(f'''
SELECT *
FROM `genuine-box-350018.rnn_dataset.output-table` 
''')
display(output_data)

  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes


Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,charttime,value,valueuom,exclude_after48h,exclude_null_time,exclude_null_value
0,10009035,28324362,38507547,2161-04-27 10:38:12+00:00,226627,OR Urine,2161-04-27 12:25:00+00:00,720.0,ml,0,0,0
1,10026161,24614671,39625056,2133-11-11 12:46:18+00:00,226626,OR EBL,2133-11-11 18:53:00+00:00,100.0,ml,0,0,0
2,10026161,24614671,39625056,2133-11-11 12:46:18+00:00,226627,OR Urine,2133-11-11 18:53:00+00:00,70.0,ml,0,0,0
3,10026161,24614671,39625056,2133-11-11 12:46:18+00:00,226582,Ostomy (output),2133-11-12 18:00:00+00:00,0.0,ml,0,0,0
4,10026161,24614671,39625056,2133-11-11 12:46:18+00:00,226582,Ostomy (output),2133-11-12 21:00:00+00:00,0.0,ml,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1290731,19906669,25207997,34049525,2140-10-31 09:57:51+00:00,226599,Jackson Pratt #1,2140-11-03 06:00:00+00:00,150.0,ml,1,0,0
1290732,19906669,25207997,34049525,2140-10-31 09:57:51+00:00,226599,Jackson Pratt #1,2140-11-03 11:00:00+00:00,50.0,ml,1,0,0
1290733,19906669,25207997,34049525,2140-10-31 09:57:51+00:00,226599,Jackson Pratt #1,2140-11-03 15:00:00+00:00,0.0,ml,1,0,0
1290734,19935888,25634582,39209887,2141-08-12 12:47:58+00:00,226599,Jackson Pratt #1,2141-08-12 12:00:00+00:00,230.0,ml,0,0,0


Check how many records are excluded

In [None]:
print(output_data['exclude_after48h'].sum())

786978


In [None]:
print(output_data['exclude_null_time'].sum())

0


In [None]:
print(output_data['exclude_null_value'].sum())

0


In [5]:
# subsetting the dataset and removing excluded records 
output_data=output_data[(output_data.exclude_after48h==0)&(output_data.exclude_null_time==0)&(output_data.exclude_null_value==0)][['subject_id','hadm_id', 'stay_id', 'intime', 'itemid', 'label', 'charttime', 'value', 'valueuom']]

According to PLAN, only 3 covariates have multiple itemids:
* Gastric Tube = [226571,226573,226575,226576]
* Fecal bag = [226580, 227701]
* TF Residual = [227510, 227511]

In [None]:
# For variables with duplicate codes - count occurence to see which label/measurement to use
output_dup_itemids = [226571,226573,226575,226576,226580,227701,227510,227511]
for x in output_dup_itemids:
  print(x)
  print(len(output_data[output_data['itemid'] == x]))


226571
1686
226573
337
226575
2712
226576
3892
226580
252
227701
486
227510
1043
227511
147


Analyzing Gastric tube - since there are 4 itemids, I will analyze 2 by 2 

In [11]:
output_data[(output_data['itemid']==226571)|(output_data['itemid']==226576)]

Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,charttime,value,valueuom
37,10120959,22645182,32040322,2182-02-11 01:41:00+00:00,226571,Emesis,2182-02-12 14:00:00+00:00,50.0,ml
40,10128902,27901643,37156219,2175-05-30 18:16:04+00:00,226571,Emesis,2175-05-30 22:00:00+00:00,900.0,ml
41,10128902,27901643,37156219,2175-05-30 18:16:04+00:00,226571,Emesis,2175-05-30 23:34:00+00:00,300.0,ml
61,10174759,25186243,36139734,2173-04-18 12:53:00+00:00,226571,Emesis,2173-04-19 12:54:00+00:00,250.0,ml
113,10253803,29365017,31615601,2140-12-26 13:20:26+00:00,226571,Emesis,2140-12-27 14:15:00+00:00,60.0,ml
...,...,...,...,...,...,...,...,...,...
1175684,19914028,21080533,30521860,2146-05-13 08:37:47+00:00,226576,Oral Gastric,2146-05-14 08:00:00+00:00,150.0,ml
1175685,19914028,21080533,30521860,2146-05-13 08:37:47+00:00,226576,Oral Gastric,2146-05-14 18:00:00+00:00,600.0,ml
1175686,19914028,21080533,30521860,2146-05-13 08:37:47+00:00,226576,Oral Gastric,2146-05-15 00:00:00+00:00,600.0,ml
1175692,19970491,25338284,30244200,2129-05-17 17:57:50+00:00,226576,Oral Gastric,2129-05-18 04:25:00+00:00,400.0,ml


In [12]:
output_data[(output_data['itemid']==226573)|(output_data['itemid']==226575)]

Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,charttime,value,valueuom
206,10381011,20375394,30593294,2159-11-04 21:05:00+00:00,226573,Gastric Tube,2159-11-05 20:00:00+00:00,0.0,ml
207,10381011,20375394,30593294,2159-11-04 21:05:00+00:00,226573,Gastric Tube,2159-11-06 00:00:00+00:00,0.0,ml
305,10641795,22739609,32089974,2146-04-28 00:58:00+00:00,226573,Gastric Tube,2146-04-28 01:48:00+00:00,600.0,ml
361,10781524,27068788,33136793,2119-08-05 10:34:04+00:00,226573,Gastric Tube,2119-08-06 03:44:00+00:00,60.0,ml
519,11006152,27052050,30139579,2143-09-25 09:20:51+00:00,226573,Gastric Tube,2143-09-25 19:04:00+00:00,100.0,ml
...,...,...,...,...,...,...,...,...,...
1144245,19666969,25334580,39897666,2151-02-18 21:50:00+00:00,226575,Nasogastric,2151-02-19 12:31:00+00:00,200.0,ml
1144256,19894936,24553760,37241550,2124-12-26 12:31:39+00:00,226575,Nasogastric,2124-12-27 07:00:00+00:00,200.0,ml
1144257,19899642,27603399,35617945,2127-07-01 04:21:55+00:00,226575,Nasogastric,2127-07-01 18:19:00+00:00,200.0,ml
1144265,19924210,24346726,39740212,2158-09-19 14:21:57+00:00,226575,Nasogastric,2158-09-20 05:06:00+00:00,150.0,ml


For Gastric tube all UoM is in ml - just need to replace the label with the majority label, which is Oral Gastric (itemid=226576).

In [None]:
output_data.loc[(output_data['itemid']==226571)|(output_data['itemid']==226573)|(output_data['itemid']==226575)|(output_data['itemid']==226576), 'label'] = 'Oral Gastric'

Analyzing Fecal bag's multiple itemids.

In [None]:
output_data[(output_data['itemid']==226580)|(output_data['itemid']==227701)]

Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,charttime,value,valueuom
23,10043122,25076224,39371028,2167-09-15 04:33:44+00:00,226580,Fecal Bag,2167-09-15 23:48:00+00:00,300.0,ml
24,10043122,25076224,39371028,2167-09-15 04:33:44+00:00,226580,Fecal Bag,2167-09-16 01:00:00+00:00,600.0,ml
25,10043122,25076224,39371028,2167-09-15 04:33:44+00:00,226580,Fecal Bag,2167-09-16 06:00:00+00:00,300.0,ml
45,10159535,22517101,36344115,2194-06-04 02:28:21+00:00,227701,Drainage Bag,2194-06-04 06:19:00+00:00,50.0,ml
46,10159535,22517101,36344115,2194-06-04 02:28:21+00:00,227701,Drainage Bag,2194-06-04 09:00:00+00:00,75.0,ml
...,...,...,...,...,...,...,...,...,...
42664,19285161,26484506,32494329,2153-02-25 21:02:44+00:00,227701,Drainage Bag,2153-02-26 11:09:00+00:00,110.0,ml
42665,19285161,26484506,32494329,2153-02-25 21:02:44+00:00,227701,Drainage Bag,2153-02-26 18:00:00+00:00,20.0,ml
42710,19438229,25379540,31582530,2153-06-12 18:53:08+00:00,227701,Drainage Bag,2153-06-13 06:57:00+00:00,100.0,ml
42733,19482688,25795011,39113715,2175-05-29 00:14:00+00:00,226580,Fecal Bag,2175-05-30 09:00:00+00:00,150.0,ml


Also since all UoM is in ml, no need to convert - just need to replace the label with the majority label (itemid=227701), which is Drainage Bag.

In [None]:
output_data.loc[output_data['itemid']==226580, 'label'] = 'Drainage Bag'


Analyzing TF Residual's multiple itemid:

In [None]:
output_data[(output_data['itemid']==227510)|(output_data['itemid']==227511)]

Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,charttime,value,valueuom
318,10712217,21561906,38908441,2177-11-17 02:47:00+00:00,227511,TF Residual Output,2177-11-17 15:03:00+00:00,20.0,ml
319,10712217,21561906,38908441,2177-11-17 02:47:00+00:00,227511,TF Residual Output,2177-11-17 20:00:00+00:00,15.0,ml
320,10712217,21561906,38908441,2177-11-17 02:47:00+00:00,227511,TF Residual Output,2177-11-18 01:00:00+00:00,40.0,ml
321,10712217,21561906,38908441,2177-11-17 02:47:00+00:00,227511,TF Residual Output,2177-11-18 15:51:00+00:00,150.0,ml
724,11400494,25827583,39351152,2124-10-13 19:23:00+00:00,227511,TF Residual Output,2124-10-14 20:00:00+00:00,5.0,ml
...,...,...,...,...,...,...,...,...,...
1166767,19497741,22929448,39915779,2179-08-02 01:21:00+00:00,227510,TF Residual,2179-08-03 00:00:00+00:00,0.0,ml
1166768,19497741,22929448,39915779,2179-08-02 01:21:00+00:00,227510,TF Residual,2179-08-03 04:00:00+00:00,0.0,ml
1166787,19747328,25894336,30092827,2151-11-24 18:27:37+00:00,227510,TF Residual,2151-11-26 00:54:00+00:00,0.0,ml
1166790,19752825,25712095,36481446,2112-01-06 22:19:00+00:00,227510,TF Residual,2112-01-07 20:19:00+00:00,0.0,ml


Again, since all UoM is in ml, no need to convert - just need to replace the label with the majority label (itemid=227510), which is TF Residual.

In [None]:
output_data.loc[output_data['itemid']==227511, 'label'] = 'TF Residual'


Creating time series data

In [None]:
# Calculating time difference of each record of outputevents from their admission
output_data['timediff'] = (output_data['charttime'] - output_data['intime']).dt.total_seconds()/60/60

In [None]:
# Creating hour variable for time series
output_data['hour'] = np.ceil(output_data['timediff']).astype(int)

In [None]:
# I will call the label column as covariates 
output_data['covariate'] = output_data['label']

In [None]:
# Keeping only the relevant columns
output_data = output_data[['subject_id', 'hadm_id', 'stay_id', 'hour', 'covariate', 'value']]

Aggregating - I am not sure how each outputevents variables need to be aggregated, it makes sense for me to average them.

In [None]:
output_data2=output_data.groupby(['subject_id','hadm_id','stay_id','hour', 'covariate'], as_index=False)['value'].mean()

In [None]:
display(output_data2)

Unnamed: 0,subject_id,hadm_id,stay_id,hour,covariate,value
0,10000980,26913865,39765666,1,Foley,450.0
1,10000980,26913865,39765666,1,Pre-Admission,400.0
2,10000980,26913865,39765666,3,Foley,600.0
3,10000980,26913865,39765666,5,Foley,800.0
4,10000980,26913865,39765666,6,Foley,1000.0
...,...,...,...,...,...,...
486425,19999987,23865745,36195440,18,Oral Gastric,400.0
486426,19999987,23865745,36195440,20,Foley,75.0
486427,19999987,23865745,36195440,22,Foley,45.0
486428,19999987,23865745,36195440,24,Foley,60.0


In [None]:
pandas_gbq.to_gbq(output_data2,'rnn_dataset.timeseries-output-table', project_id=project_id)

1it [00:16, 16.06s/it]
