In [1]:
# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

# You only need to authenticate once per session.
auth.authenticate_user()

In [2]:
# Import libraries
from datetime import timedelta
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas_gbq

# below imports are used for pretty pandas dataframes and plots
from IPython.display import display, HTML
%matplotlib inline
plt.style.use('ggplot')

# Set up environment variables
project_id = 'genuine-box-350018'
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id
pandas_gbq.context.project = project_id

# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
    return pd.io.gbq.read_gbq(
        query,
        project_id=project_id,
        dialect="standard"
    )

if project_id == 'CHANGE-ME':
  raise ValueError('You must change project_id to your GCP project at the top of this cell.')

  # test it works
df = run_query("""
SELECT *
FROM `physionet-data.mimic_core.patients`
WHERE subject_id = 10012853
""")
assert df.shape[0] >= 1, 'unable to query MIMIC!'
display(df)


Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10012853,F,91,2175,2014 - 2016,NaT


**INPUTEVENTS**

Known itemids for variables from inputevents table as per  https://github.com/USC-Melady/Benchmarking_DL_MIMICIII/blob/master/Codes/mimic3_mvcv/config/99plusf.csv.

In [3]:
input_known_itemids = [220864, 220970, 221385, 221456, 221668, 221749, 221794, 221828, 221906, 222011, 222056, 223258, 
                       223262, 225154, 225166, 225168, 225799, 225823, 225828, 225834, 225943, 225944, 226089, 226364, 
                       226372, 226452, 226453, 227522, 222011, 227523, 227524 ]

Finding itemids for variables listed in the PLAN, taken from Supplementary Info, (https://github.com/suvdzul/Reproduce-RNN-paper/blob/main/PLAN.md), that don't have their itemids from inputevents table in mimic_icu.

In [None]:
input_unknown =run_query(f'''
SELECT itemid, label, unitname
FROM `physionet-data.mimic_icu.d_items`
WHERE LOWER(label) LIKE '%dextrose%' 
OR LOWER(label) LIKE '%insulin%' 
OR LOWER(label) LIKE '%heparin sodium%' 
OR LOWER(label) LIKE '%metoprolol%' 
''')
display(input_unknown)

Unnamed: 0,itemid,label,unitname
0,225947,Dextrose PN,grams
1,223257,Insulin - 70/30,units
2,223258,Insulin - Regular,units
3,223259,Insulin - NPH,units
4,223260,Insulin - Glargine,units
5,223261,Insulin - Humalog 75/25,units
6,223262,Insulin - Humalog,units
7,225152,Heparin Sodium,units
8,225974,Metoprolol,mg
9,225975,Heparin Sodium (Prophylaxis),dose


In [4]:
# choosing relevant unknown item ids for inputevents
input_unknown_chosen = [220949, 223260, 223261, 225152, 225974]
input_str_unknown = ','.join([str(s) for s in input_unknown_chosen])

unknown_input_chosen = run_query(f'''
SELECT itemid, label, unitname
FROM `physionet-data.mimic_icu.d_items`
WHERE itemid IN ({input_str_unknown})
''')
display(unknown_input_chosen)

Unnamed: 0,itemid,label,unitname
0,223260,Insulin - Glargine,units
1,223261,Insulin - Humalog 75/25,units
2,225152,Heparin Sodium,units
3,225974,Metoprolol,mg
4,220949,Dextrose 5%,mL


In [5]:
# combining known and unknown together
input_itemids = input_known_itemids + input_unknown_chosen
input_itemids = ','.join([str(s) for s in input_itemids])

Compiling data from inputevents while applying exclusions/inclusions:
- Exclude N/A starttime or amount or amountuom
- Exclude zero amount
- Only include data for first 48h after admission

In [8]:
input_data = run_query(f'''
SELECT fco.subject_id, fco.hadm_id, fco.stay_id, fco.intime
, inp.itemid, it.label, inp.starttime, inp.amount, inp.amountuom
, CASE WHEN DATE_DIFF(inp.starttime,  CAST(fco.intime AS date), hour) > 48 THEN 1 ELSE 0 END AS exclude_after48h
, CASE WHEN inp.starttime IS NULL THEN 1 ELSE 0 END AS exclude_null_time
, CASE WHEN inp.amount IS NULL THEN 1 ELSE 0 END AS exclude_null_amount
, CASE WHEN inp.amount = 0 THEN 1 ELSE 0 END AS exclude_zero_amount
, CASE WHEN inp.amountuom IS NULL THEN 1 ELSE 0 END AS exclude_null_amountuom
FROM `physionet-data.mimic_icu.inputevents` inp
INNER JOIN `genuine-box-350018.rnn_dataset.final-cohort-table` fco
ON fco.stay_id = inp.stay_id
LEFT JOIN `physionet-data.mimic_icu.d_items` it
ON it.itemid = inp.itemid
WHERE inp.itemid IN ({input_itemids}) 
''')
display(input_data)

  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes


Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,starttime,amount,amountuom,exclude_after48h,exclude_null_time,exclude_null_amount,exclude_zero_amount,exclude_null_amountuom
0,15062911,28321065,33682028,2179-03-29 20:26:26+00:00,223258,Insulin - Regular,2179-04-12 14:51:00,2.645912,units,1,0,0,0,0
1,16003514,25763947,39430726,2172-12-18 06:20:14+00:00,223258,Insulin - Regular,2172-12-24 18:00:00,32.416667,units,1,0,0,0,0
2,17165725,29460006,36350237,2138-01-08 20:11:33+00:00,223258,Insulin - Regular,2138-01-26 16:57:00,5.720864,units,1,0,0,0,0
3,14348484,22857510,36612865,2169-11-24 02:50:57+00:00,223258,Insulin - Regular,2169-11-29 20:02:00,9.515790,units,1,0,0,0,0
4,14022343,27452708,31891318,2199-07-22 20:04:20+00:00,223258,Insulin - Regular,2199-07-31 23:26:00,9.992941,units,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1449454,18111896,25726133,30676468,2132-11-19 13:46:43+00:00,226372,OR Cell Saver Intake,2132-11-23 00:00:00,1000.000000,ml,1,0,0,0,0
1449455,11505705,26612177,30469936,2149-12-05 04:56:00+00:00,226364,OR Crystalloid Intake,2149-12-06 15:00:00,3050.000000,ml,0,0,0,0,0
1449456,10118315,24099382,34550442,2111-06-11 14:31:19+00:00,226364,OR Crystalloid Intake,2111-06-11 15:30:00,3000.000000,ml,0,0,0,0,0
1449457,17835047,24928041,36057803,2175-11-11 19:01:52+00:00,226364,OR Crystalloid Intake,2175-11-11 19:10:00,10000.000000,ml,0,0,0,0,0


In [10]:
pandas_gbq.to_gbq(input_data,'rnn_dataset.input-table', project_id=project_id, if_exists='replace')

1it [01:05, 65.18s/it]


Checking how many records are excluded

In [11]:
print(input_data['exclude_after48h'].sum())

860522


In [12]:
print(input_data['exclude_null_time'].sum())

0


In [13]:
print(input_data['exclude_null_amount'].sum())

0


In [14]:
print(input_data['exclude_zero_amount'].sum())

3988


In [15]:
print(input_data['exclude_null_amountuom'].sum())

0


In [16]:
# subsetting the dataset and removing excluded records 
input_data=input_data[(input_data.exclude_after48h==0)&(input_data.exclude_null_time==0)&(input_data.exclude_null_amount==0)&(input_data.exclude_null_amountuom==0)&(input_data.exclude_zero_amount==0)][['subject_id','hadm_id', 'stay_id', 'intime', 'itemid', 'label', 'starttime', 'amount', 'amountuom']]

In [17]:
display(input_data)

Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,starttime,amount,amountuom
21,17549093,22317404,34917578,2180-09-19 11:19:49+00:00,223258,Insulin - Regular,2180-09-21 00:00:00,18.136975,units
31,14191549,20914648,32996470,2130-10-12 06:24:00+00:00,223258,Insulin - Regular,2130-10-12 17:04:00,22.959840,units
119,19320785,27273082,30180072,2152-10-03 20:16:41+00:00,223258,Insulin - Regular,2152-10-04 20:05:00,4.065323,units
121,11356031,28547759,37990311,2188-10-29 11:53:43+00:00,225152,Heparin Sodium,2188-10-30 01:12:00,6258.442647,units
124,18267805,20796767,31748589,2192-09-24 18:48:46+00:00,225152,Heparin Sodium,2192-09-25 18:20:00,5999.999834,units
...,...,...,...,...,...,...,...,...,...
1449452,12176219,21610721,34535596,2113-03-25 12:00:59+00:00,226372,OR Cell Saver Intake,2113-03-25 15:20:00,500.000000,ml
1449453,11196689,23676524,39682828,2125-06-03 10:08:17+00:00,226372,OR Cell Saver Intake,2125-06-03 11:43:00,250.000000,ml
1449455,11505705,26612177,30469936,2149-12-05 04:56:00+00:00,226364,OR Crystalloid Intake,2149-12-06 15:00:00,3050.000000,ml
1449456,10118315,24099382,34550442,2111-06-11 14:31:19+00:00,226364,OR Crystalloid Intake,2111-06-11 15:30:00,3000.000000,ml


According to PLAN, only 2 covariates have multiple itemids:
* Insulin - Regular = [223258, 223262]
* Magnesium Sulfate (Bolus) = [222011, 227523, 227524]

In [18]:
# For variables with duplicate codes - count occurence to see which label/measurement to use
input_dup_itemids = [223258, 223262, 222011, 227523, 227524]
for x in input_dup_itemids:
  print(x)
  print(len(input_data[input_data['itemid'] == x]))


223258
45578
223262
7095
222011
11627
227523
8079
227524
23


Analyzing Insulin - Regular duplicate itemids:


In [19]:
input_data[(input_data['itemid'] == 223258) | (input_data['itemid'] == 223262)]

Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,starttime,amount,amountuom
21,17549093,22317404,34917578,2180-09-19 11:19:49+00:00,223258,Insulin - Regular,2180-09-21 00:00:00,18.136975,units
31,14191549,20914648,32996470,2130-10-12 06:24:00+00:00,223258,Insulin - Regular,2130-10-12 17:04:00,22.959840,units
119,19320785,27273082,30180072,2152-10-03 20:16:41+00:00,223258,Insulin - Regular,2152-10-04 20:05:00,4.065323,units
24501,17257913,28666809,33484351,2149-06-02 17:25:00+00:00,223258,Insulin - Regular,2149-06-02 23:00:00,5.835911,units
24507,14010701,21001960,33843685,2138-01-09 11:46:12+00:00,223258,Insulin - Regular,2138-01-10 00:15:00,6.499904,units
...,...,...,...,...,...,...,...,...,...
1311646,11334897,22688036,37186667,2168-03-09 04:26:15+00:00,223262,Insulin - Humalog,2168-03-09 06:12:00,14.000000,units
1311649,11337495,20374655,39033634,2136-10-18 00:13:00+00:00,223262,Insulin - Humalog,2136-10-19 06:00:00,2.000000,units
1311651,14671796,29557424,33976461,2144-10-18 03:04:00+00:00,223262,Insulin - Humalog,2144-10-18 12:00:00,2.000000,units
1311653,19666098,23226557,31811849,2170-10-15 15:08:00+00:00,223262,Insulin - Humalog,2170-10-16 16:22:00,2.000000,units


For Insulin - Regular, itemid = 223262 has 7092 occurrences vs itemid = 223258 has 45578 occurrences.
Not sure why itemid = 223262 was included in the original study, b/c the label is "Insulin - Humalog 25/75". I guess I just need to convert the label into "Insulin - Regular" since UoM is the same anyways.

In [20]:
input_data.loc[input_data['itemid']==223262, 'label'] = 'Insulin - Regular'

Analyzing Magnesium Sulfate duplicate itemids:

In [21]:
input_data[(input_data['itemid'] == 222011) | (input_data['itemid'] == 227523) | (input_data['itemid'] == 227524)]

Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,starttime,amount,amountuom
128,10408562,29857554,38114103,2161-09-18 13:01:00+00:00,227524,Magnesium Sulfate (OB-GYN),2161-09-18 23:00:00,18.000001,grams
137,16835119,24918721,38365724,2189-08-21 21:08:19+00:00,227524,Magnesium Sulfate (OB-GYN),2189-08-22 12:37:00,11.066667,grams
261427,10819425,26557151,33748358,2135-11-20 12:16:23+00:00,222011,Magnesium Sulfate,2135-11-21 04:30:00,2.000000,grams
261428,10837867,27312376,33764351,2173-04-30 11:19:08+00:00,222011,Magnesium Sulfate,2173-05-01 15:00:00,2.000000,grams
261429,19112694,28299105,30654646,2185-09-05 23:34:41+00:00,222011,Magnesium Sulfate,2185-09-06 05:34:00,2.000000,grams
...,...,...,...,...,...,...,...,...,...
1447699,13238013,28563772,36445657,2162-05-31 16:52:51+00:00,227523,Magnesium Sulfate (Bolus),2162-05-31 18:57:00,49.999999,ml
1447702,13800523,23525668,35094960,2176-09-08 04:26:04+00:00,227523,Magnesium Sulfate (Bolus),2176-09-09 03:57:00,49.999999,ml
1447703,19640587,22643222,34216849,2164-02-02 11:06:06+00:00,227523,Magnesium Sulfate (Bolus),2164-02-03 04:23:00,49.999999,ml
1447704,15484528,29588888,33323197,2176-03-08 10:57:50+00:00,227523,Magnesium Sulfate (Bolus),2176-03-09 06:06:00,49.999999,ml


Since 222011 = Magnesium Sulfate has the highest occurence (11627) and UoM is grams, 227523 = Magnesium Sulfate (Bolus) has 8079 occurences and will be converted to grams since its UoM is ml, 227524 = Magnesium Sulfate (OB-GYN) has 23 occurrences and they don't need to be converted b/c UoM is grams. I think for Magnesium Sulfate 1g is approximately 2ml. Replace all label names by "Magnesium Sulfate". 

In [22]:
input_data.loc[(input_data['itemid']==227523)|(input_data['itemid']==227524), 'label'] = 'Magnesium Sulfate'
input_data.loc[(input_data['itemid']==227523), 'amountuom'] = 'grams'
input_data.loc[(input_data['itemid']==227523), 'amount'] = input_data.amount/2

Creating time series data - time interval is hourly

In [30]:
# removing timestamp from intime
input_data['intime'] = pd.to_datetime(input_data['intime']).dt.tz_convert(None)

In [31]:
# Calculating time difference of each record of inputevents from their admission
input_data['timediff'] = (input_data['starttime'] - input_data['intime']).dt.total_seconds()/60/60

In [32]:
# Creating hour variable for time series
input_data['hour'] = np.ceil(input_data['timediff']).astype(int)

In [33]:
# I will call the label column as covariates 
input_data['covariate'] = input_data['label']

In [34]:
# Keeping only the relevant columns
input_data = input_data[['subject_id', 'hadm_id', 'stay_id', 'hour', 'covariate', 'amount']]

Aggregating - I am not sure how each inputevents variables need to be aggregated, it makes sense for me to average them.

In [35]:
input_data2=input_data.groupby(['subject_id','hadm_id','stay_id','hour', 'covariate'], as_index=False)['amount'].mean()

In [36]:
display(input_data2)

Unnamed: 0,subject_id,hadm_id,stay_id,hour,covariate,amount
0,10000980,26913865,39765666,-1,Dextrose 5%,200.000000
1,10000980,26913865,39765666,3,Dextrose 5%,50.000000
2,10000980,26913865,39765666,4,Dextrose 5%,100.000000
3,10000980,26913865,39765666,5,Dextrose 5%,69.477793
4,10000980,26913865,39765666,5,Furosemide (Lasix),40.000003
...,...,...,...,...,...,...
493600,19999987,23865745,36195440,17,Solution,33.789555
493601,19999987,23865745,36195440,20,Solution,6.505005
493602,19999987,23865745,36195440,21,Solution,7.553390
493603,19999987,23865745,36195440,22,Solution,50.000000


In [39]:
pandas_gbq.to_gbq(input_data2,'rnn_dataset.timeseries-input-table', project_id=project_id, if_exists='replace')

1it [00:11, 11.76s/it]
