In [13]:
# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

# You only need to authenticate once per session.
auth.authenticate_user()

In [14]:
# Import libraries
from datetime import timedelta
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas_gbq

# below imports are used for pretty pandas dataframes and plots
from IPython.display import display, HTML
%matplotlib inline
plt.style.use('ggplot')

# Set up environment variables
project_id = 'genuine-box-350018'
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id
pandas_gbq.context.project = project_id

# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
    return pd.io.gbq.read_gbq(
        query,
        project_id=project_id,
        dialect="standard"
    )

if project_id == 'CHANGE-ME':
  raise ValueError('You must change project_id to your GCP project at the top of this cell.')

  # test it works
df = run_query("""
SELECT *
FROM `physionet-data.mimic_core.patients`
WHERE subject_id = 10012853
""")
assert df.shape[0] >= 1, 'unable to query MIMIC!'
display(df)


Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10012853,F,91,2175,2014 - 2016,NaT


**OUTPUTEVENTS**

Known itemids for variables from outputevents table as per https://github.com/USC-Melady/Benchmarking_DL_MIMICIII/blob/master/Codes/mimic3_mvcv/config/99plusf.csv.

In [15]:
output_known_itemids = [226571, 226573, 226575, 226576, 226579, 226627, 226583, 226580, 227701, 226582, 226588, 226589, 226599, 226626, 226633, 227510, 227511]

Finding itemids for variables listed in the PLAN, taken from Supplementary Info, (https://github.com/suvdzul/Reproduce-RNN-paper/blob/main/PLAN.md), that don't have their itemids from outputevents table in mimic_icu.

In [None]:
output_unknown =run_query(f'''
SELECT itemid, label, unitname
FROM `physionet-data.mimic_icu.d_items`
WHERE LOWER(label) LIKE '%foley%' 
OR LOWER(label) LIKE '%void%' 
OR LOWER(label) LIKE '%condom cath%' 
''')
display(output_unknown)

In [16]:
# choosing relevant unknown item ids for inputevents
output_unknown_chosen = [226559, 226560, 226561]
output_str_unknown = ','.join([str(s) for s in output_unknown_chosen])

unknown_output_chosen = run_query(f'''
SELECT itemid, label, unitname
FROM `physionet-data.mimic_icu.d_items`
WHERE itemid IN ({output_str_unknown})
''')
display(unknown_output_chosen)

Unnamed: 0,itemid,label,unitname
0,226559,Foley,mL
1,226560,Void,mL
2,226561,Condom Cath,mL


In [17]:
# combining known and unknown together
output_itemids = output_known_itemids + output_unknown_chosen
output_itemids = ','.join([str(s) for s in output_itemids])

Compiling data from outputevents while applying exclusions/inclusions:

- Exclude N/A charttime or value or valueuom
- Exclude 0 values
- Only include data for first 48h after admission

In [18]:
output_data = run_query(f'''
SELECT fco.subject_id, fco.hadm_id, fco.stay_id, fco.intime
, op.itemid, it.label, op.charttime, op.value, op.valueuom
, CASE WHEN DATE_DIFF(op.charttime,  CAST(fco.intime AS date), hour) > 48 THEN 1 ELSE 0 END AS exclude_after48h
, CASE WHEN op.charttime IS NULL THEN 1 ELSE 0 END AS exclude_null_time
, CASE WHEN op.value IS NULL THEN 1 ELSE 0 END AS exclude_null_value
, CASE WHEN op.value = 0 THEN 1 ELSE 0 END AS exclude_zero_value
, CASE WHEN op.valueuom IS NULL THEN 1 ELSE 0 END AS exclude_null_valueuom
FROM `physionet-data.mimic_icu.outputevents` op
INNER JOIN `genuine-box-350018.rnn_dataset.final-cohort-table` fco
ON fco.stay_id = op.stay_id
LEFT JOIN `physionet-data.mimic_icu.d_items` it
ON it.itemid = op.itemid
WHERE op.itemid IN ({output_itemids}) 
''')
display(output_data)


  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes


Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,charttime,value,valueuom,exclude_after48h,exclude_null_time,exclude_null_value,exclude_zero_value,exclude_null_valueuom
0,10003700,28623837,30600691,2165-04-24 05:43:00+00:00,226559,Foley,2165-04-24 05:40:00,300.0,ml,0,0,0,0,0
1,10009035,28324362,38507547,2161-04-27 10:38:12+00:00,226588,Chest Tube #1,2161-04-27 12:00:00,5.0,ml,0,0,0,0,0
2,10009035,28324362,38507547,2161-04-27 10:38:12+00:00,226627,OR Urine,2161-04-27 12:25:00,720.0,ml,0,0,0,0,0
3,10009035,28324362,38507547,2161-04-27 10:38:12+00:00,226559,Foley,2161-04-27 13:00:00,400.0,ml,0,0,0,0,0
4,10009035,28324362,38507547,2161-04-27 10:38:12+00:00,226588,Chest Tube #1,2161-04-27 13:00:00,75.0,ml,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1290731,19999068,21606769,30143796,2161-08-24 05:26:00+00:00,226559,Foley,2161-08-30 12:00:00,180.0,ml,1,0,0,0,0
1290732,19999068,21606769,30143796,2161-08-24 05:26:00+00:00,226559,Foley,2161-08-30 14:00:00,320.0,ml,1,0,0,0,0
1290733,19999068,21606769,30143796,2161-08-24 05:26:00+00:00,226559,Foley,2161-08-30 15:00:00,120.0,ml,1,0,0,0,0
1290734,19999068,21606769,30143796,2161-08-24 05:26:00+00:00,226559,Foley,2161-08-30 18:00:00,400.0,ml,1,0,0,0,0


In [19]:
pandas_gbq.to_gbq(output_data,'rnn_dataset.output-table', project_id=project_id, if_exists='replace')

1it [01:35, 95.71s/it]


Check how many records are excluded

In [20]:
print(output_data['exclude_after48h'].sum())

786978


In [21]:
print(output_data['exclude_null_time'].sum())

0


In [22]:
print(output_data['exclude_null_value'].sum())

0


In [23]:
print(output_data['exclude_zero_value'].sum())

43706


In [24]:
print(output_data['exclude_null_valueuom'].sum())

0


In [25]:
# subsetting the dataset and removing excluded records 
output_data=output_data[(output_data.exclude_after48h==0)&(output_data.exclude_null_time==0)&(output_data.exclude_null_value==0)&(output_data.exclude_null_valueuom==0)&(output_data.exclude_zero_value==0)][['subject_id','hadm_id', 'stay_id', 'intime', 'itemid', 'label', 'charttime', 'value', 'valueuom']]

In [26]:
display(output_data)

Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,charttime,value,valueuom
0,10003700,28623837,30600691,2165-04-24 05:43:00+00:00,226559,Foley,2165-04-24 05:40:00,300.0,ml
1,10009035,28324362,38507547,2161-04-27 10:38:12+00:00,226588,Chest Tube #1,2161-04-27 12:00:00,5.0,ml
2,10009035,28324362,38507547,2161-04-27 10:38:12+00:00,226627,OR Urine,2161-04-27 12:25:00,720.0,ml
3,10009035,28324362,38507547,2161-04-27 10:38:12+00:00,226559,Foley,2161-04-27 13:00:00,400.0,ml
4,10009035,28324362,38507547,2161-04-27 10:38:12+00:00,226588,Chest Tube #1,2161-04-27 13:00:00,75.0,ml
...,...,...,...,...,...,...,...,...,...
1290668,19999068,21606769,30143796,2161-08-24 05:26:00+00:00,226559,Foley,2161-08-25 18:44:00,260.0,ml
1290669,19999068,21606769,30143796,2161-08-24 05:26:00+00:00,226559,Foley,2161-08-25 19:00:00,90.0,ml
1290670,19999068,21606769,30143796,2161-08-24 05:26:00+00:00,226559,Foley,2161-08-25 21:00:00,275.0,ml
1290671,19999068,21606769,30143796,2161-08-24 05:26:00+00:00,226559,Foley,2161-08-25 22:00:00,125.0,ml


According to PLAN, only 3 covariates have multiple itemids:
* Gastric Tube = [226571,226573,226575,226576]
* Fecal bag = [226580, 227701]
* TF Residual = [227510, 227511]

In [27]:
# For variables with duplicate codes - count occurence to see which label/measurement to use
output_dup_itemids = [226571,226573,226575,226576,226580,227701,227510,227511]
for x in output_dup_itemids:
  print(x)
  print(len(output_data[output_data['itemid'] == x]))


226571
1674
226573
295
226575
2400
226576
3389
226580
248
227701
460
227510
678
227511
101


Analyzing Gastric tube - since there are 4 itemids, I will analyze 2 by 2 

In [28]:
output_data[(output_data['itemid']==226571)|(output_data['itemid']==226576)]

Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,charttime,value,valueuom
6,10009035,28324362,38507547,2161-04-27 10:38:12+00:00,226576,Oral Gastric,2161-04-27 14:00:00,100.0,ml
417,10038933,25129047,32166508,2148-09-10 13:19:00+00:00,226576,Oral Gastric,2148-09-11 05:19:00,500.0,ml
420,10038933,25129047,32166508,2148-09-10 13:19:00+00:00,226576,Oral Gastric,2148-09-11 08:00:00,125.0,ml
975,10089085,28910506,36182571,2117-06-01 20:37:23+00:00,226576,Oral Gastric,2117-06-02 08:00:00,250.0,ml
984,10089085,28910506,36182571,2117-06-01 20:37:23+00:00,226576,Oral Gastric,2117-06-02 16:00:00,50.0,ml
...,...,...,...,...,...,...,...,...,...
1288491,19935090,25668681,31678181,2115-08-05 12:35:00+00:00,226571,Emesis,2115-08-06 18:25:00,30.0,ml
1288492,19935090,25668681,31678181,2115-08-05 12:35:00+00:00,226571,Emesis,2115-08-06 20:00:00,150.0,ml
1288494,19935090,25668681,31678181,2115-08-05 12:35:00+00:00,226571,Emesis,2115-08-06 22:00:00,200.0,ml
1289414,19970491,25338284,30244200,2129-05-17 17:57:50+00:00,226576,Oral Gastric,2129-05-18 04:25:00,400.0,ml


In [29]:
output_data[(output_data['itemid']==226573)|(output_data['itemid']==226575)]

Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,charttime,value,valueuom
2020,10163605,26885321,33328050,2178-01-10 17:07:08+00:00,226575,Nasogastric,2178-01-11 05:24:00,200.0,ml
2343,10199711,29568306,35648103,2149-07-11 12:25:00+00:00,226575,Nasogastric,2149-07-11 16:58:00,100.0,ml
5194,10282931,23897542,38921444,2128-12-27 21:30:00+00:00,226575,Nasogastric,2128-12-28 08:30:00,150.0,ml
5205,10282931,23897542,38921444,2128-12-27 21:30:00+00:00,226575,Nasogastric,2128-12-28 18:00:00,250.0,ml
5692,10337364,21819527,30824925,2130-09-12 00:14:33+00:00,226575,Nasogastric,2130-09-12 07:25:00,30.0,ml
...,...,...,...,...,...,...,...,...,...
1284877,19666969,25334580,39897666,2151-02-18 21:50:00+00:00,226575,Nasogastric,2151-02-19 12:31:00,200.0,ml
1287826,19894936,24553760,37241550,2124-12-26 12:31:39+00:00,226575,Nasogastric,2124-12-27 07:00:00,200.0,ml
1287893,19899642,27603399,35617945,2127-07-01 04:21:55+00:00,226575,Nasogastric,2127-07-01 18:19:00,200.0,ml
1288432,19924210,24346726,39740212,2158-09-19 14:21:57+00:00,226575,Nasogastric,2158-09-20 05:06:00,150.0,ml


For Gastric tube all UoM is in ml - just need to replace the label with the majority label, which is Oral Gastric (itemid=226576) with 3389 occurrences.

In [30]:
output_data.loc[(output_data['itemid']==226571)|(output_data['itemid']==226573)|(output_data['itemid']==226575)|(output_data['itemid']==226576), 'label'] = 'Oral Gastric'

Analyzing Fecal bag's multiple itemids.

In [31]:
output_data[(output_data['itemid']==226580)|(output_data['itemid']==227701)]

Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,charttime,value,valueuom
552,10043122,25076224,39371028,2167-09-15 04:33:44+00:00,226580,Fecal Bag,2167-09-15 23:48:00,300.0,ml
554,10043122,25076224,39371028,2167-09-15 04:33:44+00:00,226580,Fecal Bag,2167-09-16 01:00:00,600.0,ml
558,10043122,25076224,39371028,2167-09-15 04:33:44+00:00,226580,Fecal Bag,2167-09-16 06:00:00,300.0,ml
1783,10159535,22517101,36344115,2194-06-04 02:28:21+00:00,227701,Drainage Bag,2194-06-04 06:19:00,50.0,ml
1786,10159535,22517101,36344115,2194-06-04 02:28:21+00:00,227701,Drainage Bag,2194-06-04 09:00:00,75.0,ml
...,...,...,...,...,...,...,...,...,...
1279864,19285161,26484506,32494329,2153-02-25 21:02:44+00:00,227701,Drainage Bag,2153-02-26 11:09:00,110.0,ml
1279870,19285161,26484506,32494329,2153-02-25 21:02:44+00:00,227701,Drainage Bag,2153-02-26 18:00:00,20.0,ml
1281612,19438229,25379540,31582530,2153-06-12 18:53:08+00:00,227701,Drainage Bag,2153-06-13 06:57:00,100.0,ml
1282607,19482688,25795011,39113715,2175-05-29 00:14:00+00:00,226580,Fecal Bag,2175-05-30 09:00:00,150.0,ml


Also since all UoM is in ml, no need to convert - just need to replace the label with the majority label (itemid=227701), which is Drainage Bag with 460 occurrences.

In [32]:
output_data.loc[output_data['itemid']==226580, 'label'] = 'Drainage Bag'


Analyzing TF Residual's multiple itemid:

In [33]:
output_data[(output_data['itemid']==227510)|(output_data['itemid']==227511)]

Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,charttime,value,valueuom
1767,10148533,26200962,38372401,2113-02-28 20:47:00+00:00,227510,TF Residual,2113-03-01 03:00:00,50.0,ml
6417,10398333,23217649,31460673,2165-03-22 22:15:00+00:00,227510,TF Residual,2165-03-24 00:29:00,5.0,ml
9123,10613952,23277993,33658263,2123-10-10 23:49:00+00:00,227510,TF Residual,2123-10-11 08:00:00,30.0,ml
9128,10613952,23277993,33658263,2123-10-10 23:49:00+00:00,227510,TF Residual,2123-10-11 12:00:00,10.0,ml
10111,10712217,21561906,38908441,2177-11-17 02:47:00+00:00,227511,TF Residual Output,2177-11-17 15:03:00,20.0,ml
...,...,...,...,...,...,...,...,...,...
1273538,18877980,28555997,36750672,2144-04-10 14:18:29+00:00,227511,TF Residual Output,2144-04-11 19:00:00,50.0,ml
1273544,18877980,28555997,36750672,2144-04-10 14:18:29+00:00,227510,TF Residual,2144-04-12 00:33:00,10.0,ml
1279422,19270543,21315786,37099881,2145-02-10 18:27:00+00:00,227510,TF Residual,2145-02-11 20:15:00,10.0,ml
1282699,19497741,22929448,39915779,2179-08-02 01:21:00+00:00,227510,TF Residual,2179-08-02 20:00:00,15.0,ml


Again, since all UoM is in ml, no need to convert - just need to replace the label with the majority label (itemid=227510), which is TF Residual with 678 occurrences.

In [34]:
output_data.loc[output_data['itemid']==227511, 'label'] = 'TF Residual'


Creating time series data

In [35]:
output_data['intime'] = pd.to_datetime(output_data['intime']).dt.tz_convert(None)


In [36]:
# Calculating time difference of each record of outputevents from their admission
output_data['timediff'] = (output_data['charttime'] - output_data['intime']).dt.total_seconds()/60/60

In [37]:
# Creating hour variable for time series
output_data['hour'] = np.ceil(output_data['timediff']).astype(int)

In [38]:
# I will call the label column as covariates 
output_data['covariate'] = output_data['label']

In [39]:
# Keeping only the relevant columns
output_data = output_data[['subject_id', 'hadm_id', 'stay_id', 'hour', 'covariate', 'value']]

Aggregating - I am not sure how each outputevents variables need to be aggregated, it makes sense for me to average them.

In [40]:
output_data2=output_data.groupby(['subject_id','hadm_id','stay_id','hour', 'covariate'], as_index=False)['value'].mean()

In [41]:
display(output_data2)

Unnamed: 0,subject_id,hadm_id,stay_id,hour,covariate,value
0,10000980,26913865,39765666,1,Foley,450.0
1,10000980,26913865,39765666,1,Pre-Admission,400.0
2,10000980,26913865,39765666,3,Foley,600.0
3,10000980,26913865,39765666,5,Foley,800.0
4,10000980,26913865,39765666,6,Foley,1000.0
...,...,...,...,...,...,...
471548,19999987,23865745,36195440,18,Oral Gastric,400.0
471549,19999987,23865745,36195440,20,Foley,75.0
471550,19999987,23865745,36195440,22,Foley,45.0
471551,19999987,23865745,36195440,24,Foley,60.0


In [43]:
pandas_gbq.to_gbq(output_data2,'rnn_dataset.timeseries-output-table', project_id=project_id, if_exists='replace')

1it [00:14, 14.09s/it]
