In [1]:
# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

# You only need to authenticate once per session.
auth.authenticate_user()

In [2]:
# Import libraries
from datetime import timedelta
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas_gbq

# below imports are used for pretty pandas dataframes and plots
from IPython.display import display, HTML
%matplotlib inline
plt.style.use('ggplot')

# Set up environment variables
project_id = 'genuine-box-350018'
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id
pandas_gbq.context.project = project_id

# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
    return pd.io.gbq.read_gbq(
        query,
        project_id=project_id,
        dialect="standard"
    )

if project_id == 'CHANGE-ME':
  raise ValueError('You must change project_id to your GCP project at the top of this cell.')

  # test it works
df = run_query("""
SELECT *
FROM `physionet-data.mimic_core.patients`
WHERE subject_id = 10012853
""")
assert df.shape[0] >= 1, 'unable to query MIMIC!'
display(df)


Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10012853,F,91,2175,2014 - 2016,NaT


**INPUTEVENTS**

Known itemids for variables from inputevents table as per  https://github.com/USC-Melady/Benchmarking_DL_MIMICIII/blob/master/Codes/mimic3_mvcv/config/99plusf.csv.

In [None]:
input_known_itemids = [220864, 220970, 221385, 221456, 221668, 221749, 221794, 221828, 221906, 222011, 222056, 223258, 
                       223262, 225154, 225166, 225168, 225799, 225823, 225828, 225834, 225943, 225944, 226089, 226364, 
                       226372, 226452, 226453, 227522, 222011, 227523, 227524 ]

Finding itemids for variables listed in the PLAN, taken from Supplementary Info, (https://github.com/suvdzul/Reproduce-RNN-paper/blob/main/PLAN.md), that don't have their itemids from inputevents table in mimic_icu.

In [None]:
input_unknown =run_query(f'''
SELECT itemid, label, unitname
FROM `physionet-data.mimic_icu.d_items`
WHERE LOWER(label) LIKE '%dextrose%' 
OR LOWER(label) LIKE '%insulin%' 
OR LOWER(label) LIKE '%heparin sodium%' 
OR LOWER(label) LIKE '%metoprolol%' 
''')
display(input_unknown)

Unnamed: 0,itemid,label,unitname
0,225947,Dextrose PN,grams
1,223257,Insulin - 70/30,units
2,223258,Insulin - Regular,units
3,223259,Insulin - NPH,units
4,223260,Insulin - Glargine,units
5,223261,Insulin - Humalog 75/25,units
6,223262,Insulin - Humalog,units
7,225152,Heparin Sodium,units
8,225974,Metoprolol,mg
9,225975,Heparin Sodium (Prophylaxis),dose


In [None]:
# choosing relevant unknown item ids for inputevents
input_unknown_chosen = [220949, 223260, 223261, 225152, 225974]
input_str_unknown = ','.join([str(s) for s in input_unknown_chosen])

unknown_input_chosen = run_query(f'''
SELECT itemid, label, unitname
FROM `physionet-data.mimic_icu.d_items`
WHERE itemid IN ({input_str_unknown})
''')
display(unknown_input_chosen)

Unnamed: 0,itemid,label,unitname
0,223260,Insulin - Glargine,units
1,223261,Insulin - Humalog 75/25,units
2,225152,Heparin Sodium,units
3,225974,Metoprolol,mg
4,220949,Dextrose 5%,mL


In [None]:
# combining known and unknown together
input_itemids = input_known_itemids + input_unknown_chosen
input_itemids = ','.join([str(s) for s in input_itemids])

Compiling data from inputevents while applying exclusions/inclusions:
- Exclude N/A starttime or amount 
- Only include data for first 48h after admission

In [None]:
input_data = run_query(f'''
SELECT fco.subject_id, fco.hadm_id, fco.stay_id, fco.intime
, inp.itemid, it.label, inp.starttime, inp.amount, inp.amountuom
, CASE WHEN DATE_DIFF(inp.starttime,  CAST(fco.intime AS date), hour) > 48 THEN 1 ELSE 0 END AS exclude_after48h
, CASE WHEN inp.starttime IS NULL THEN 1 ELSE 0 END AS exclude_null_time
, CASE WHEN inp.amount IS NULL THEN 1 ELSE 0 END AS exclude_null_amount
FROM `physionet-data.mimic_icu.inputevents` inp
INNER JOIN `genuine-box-350018.rnn_dataset.final-cohort-table` fco
ON fco.stay_id = inp.stay_id
LEFT JOIN `physionet-data.mimic_icu.d_items` it
ON it.itemid = inp.itemid
WHERE inp.itemid IN ({input_itemids}) 
''')
display(input_data)

  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes


Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,starttime,amount,amountuom,exclude_after48h,exclude_null_time,exclude_null_amount
0,14808763,27285064,31419499,2181-04-22 14:49:45+00:00,223258,Insulin - Regular,2181-05-17 17:37:00,23.550000,units,1,0,0
1,16646694,27548876,38839165,2177-04-17 09:02:41+00:00,223258,Insulin - Regular,2177-04-18 18:18:00,9.400359,units,0,0,0
2,16442766,24793343,32880944,2176-10-08 02:35:00+00:00,223258,Insulin - Regular,2176-10-25 16:58:00,16.187500,units,1,0,0
3,17503907,25133113,31608691,2154-03-29 20:17:00+00:00,223258,Insulin - Regular,2154-04-08 18:00:00,84.527781,units,1,0,0
4,13526177,21051213,35552300,2141-04-27 22:20:00+00:00,223258,Insulin - Regular,2141-05-09 17:06:00,89.855993,units,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1449454,10994222,20850356,39561740,2135-11-02 15:03:45+00:00,226364,OR Crystalloid Intake,2135-11-02 15:12:00,2000.000000,ml,0,0,0
1449455,11466804,28549449,36986506,2117-12-21 10:35:00+00:00,226364,OR Crystalloid Intake,2117-12-21 16:45:00,4000.000000,ml,0,0,0
1449456,19123001,24009253,34036243,2146-08-21 14:27:57+00:00,226364,OR Crystalloid Intake,2146-08-21 20:15:00,2000.000000,ml,0,0,0
1449457,17921248,29394605,33613030,2130-08-29 22:28:20+00:00,226364,OR Crystalloid Intake,2130-08-30 02:34:00,2500.000000,ml,0,0,0


In [None]:
pandas_gbq.to_gbq(input_data,'rnn_dataset.input-table', project_id=project_id)

1it [01:53, 113.11s/it]


In [3]:
input_data = run_query(f'''
SELECT *
FROM `genuine-box-350018.rnn_dataset.input-table` fco
''')
display(input_data)

  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes


Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,starttime,amount,amountuom,exclude_after48h,exclude_null_time,exclude_null_amount
0,16646694,27548876,38839165,2177-04-17 09:02:41+00:00,223258,Insulin - Regular,2177-04-18 18:18:00+00:00,9.400359,units,0,0,0
1,17342214,28100246,32724815,2135-09-26 22:37:01+00:00,223258,Insulin - Regular,2135-09-27 18:00:00+00:00,20.000000,units,0,0,0
2,14191549,20914648,32996470,2130-10-12 06:24:00+00:00,223258,Insulin - Regular,2130-10-13 16:05:00+00:00,2.568633,units,0,0,0
3,11356031,28547759,37990311,2188-10-29 11:53:43+00:00,223258,Insulin - Regular,2188-10-30 01:12:00+00:00,23.643004,units,0,0,0
4,18319326,20891516,38495563,2180-05-08 17:37:13+00:00,227524,Magnesium Sulfate (OB-GYN),2180-05-08 21:34:00+00:00,19.668002,grams,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1449454,13126146,23971756,38370066,2182-03-31 00:09:10+00:00,227522,KCL (Bolus),2182-04-09 04:08:00+00:00,99.999998,ml,1,0,0
1449455,16122451,25659249,35134015,2139-08-25 13:11:00+00:00,227522,KCL (Bolus),2139-08-27 05:00:00+00:00,99.999998,ml,1,0,0
1449456,12662773,20446083,32238735,2145-05-03 20:54:38+00:00,227522,KCL (Bolus),2145-06-11 04:29:00+00:00,99.999998,ml,1,0,0
1449457,13336511,28936109,34956862,2160-09-20 06:25:00+00:00,227522,KCL (Bolus),2160-09-27 17:00:00+00:00,99.999998,ml,1,0,0


Checking how many records are excluded

In [9]:
print(input_data['exclude_after48h'].sum())

860522


In [10]:
print(input_data['exclude_null_time'].sum())

0


In [11]:
print(input_data['exclude_null_amount'].sum())

0


In [4]:
# subsetting the dataset and removing excluded records 
input_data=input_data[(input_data.exclude_after48h==0)&(input_data.exclude_null_time==0)&(input_data.exclude_null_amount==0)][['subject_id','hadm_id', 'stay_id', 'intime', 'itemid', 'label', 'starttime', 'amount', 'amountuom']]

According to PLAN, only 2 covariates have multiple itemids:
* Insulin - Regular = [223258, 223262]
* Magnesium Sulfate (Bolus) = [222011, 227523, 227524]

In [5]:
# For variables with duplicate codes - count occurence to see which itemid/measurement to use
input_dup_itemids = [223258, 223262, 222011, 227523, 227524]
for x in input_dup_itemids:
  print(x)
  print(len(input_data[input_data['itemid'] == x]))


223258
99080
223262
22507
222011
21788
227523
15313
227524
24


Analyzing Insulin - Regular duplicate itemids:


In [None]:
input_data[(input_data['itemid'] == 223258) | (input_data['itemid'] == 223262)]

Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,starttime,amount,amountuom,exclude_after48h,exclude_null_time,exclude_null_amount
0,14808763,27285064,31419499,2181-04-22 14:49:45+00:00,223258,Insulin - Regular,2181-05-17 17:37:00,23.550000,units,1,0,0
1,16646694,27548876,38839165,2177-04-17 09:02:41+00:00,223258,Insulin - Regular,2177-04-18 18:18:00,9.400359,units,0,0,0
2,16442766,24793343,32880944,2176-10-08 02:35:00+00:00,223258,Insulin - Regular,2176-10-25 16:58:00,16.187500,units,1,0,0
3,17503907,25133113,31608691,2154-03-29 20:17:00+00:00,223258,Insulin - Regular,2154-04-08 18:00:00,84.527781,units,1,0,0
4,13526177,21051213,35552300,2141-04-27 22:20:00+00:00,223258,Insulin - Regular,2141-05-09 17:06:00,89.855993,units,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1312034,18232850,23664218,31217827,2162-02-19 17:14:51+00:00,223262,Insulin - Humalog,2162-03-02 17:00:00,2.000000,units,1,0,0
1312035,14224981,27119701,30462482,2166-02-10 17:19:39+00:00,223262,Insulin - Humalog,2166-02-11 19:37:00,20.000000,units,0,0,0
1312036,10359479,28433266,34349287,2148-05-15 20:08:00+00:00,223262,Insulin - Humalog,2148-05-19 08:31:00,6.000000,units,1,0,0
1312037,16783434,25077442,38529336,2169-02-06 19:07:08+00:00,223262,Insulin - Humalog,2169-02-14 06:07:00,4.000000,units,1,0,0


For Insulin - Regular, not sure why itemid = 223262 was included in the original study, b/c the label is "Insulin - Humalog 25/75". I guess I just need to convert the label into "Insulin - Regular" since UoM is the same anyways.

In [5]:
input_data.loc[input_data['itemid']==223262, 'label'] = 'Insulin - Regular'

Analyzing Magnesium Sulfate duplicate itemids:

In [None]:
input_data[(input_data['itemid'] == 222011) | (input_data['itemid'] == 227523) | (input_data['itemid'] == 227524)]

Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,starttime,amount,amountuom,exclude_after48h,exclude_null_time,exclude_null_amount
137,10546500,20839562,38712665,2169-10-22 23:46:37+00:00,227524,Magnesium Sulfate (OB-GYN),2169-10-23 06:44:00,2.000000,grams,0,0,0
140,18319326,20891516,38495563,2180-05-08 17:37:13+00:00,227524,Magnesium Sulfate (OB-GYN),2180-05-08 21:34:00,19.668002,grams,0,0,0
144,15960369,27513772,32455570,2158-12-21 03:38:00+00:00,227524,Magnesium Sulfate (OB-GYN),2158-12-21 04:16:00,17.500000,grams,0,0,0
260326,10819425,26557151,33748358,2135-11-20 12:16:23+00:00,222011,Magnesium Sulfate,2135-11-21 19:30:00,2.000000,grams,0,0,0
260327,16255318,24607734,35069090,2146-03-21 04:41:39+00:00,222011,Magnesium Sulfate,2146-03-23 05:01:00,2.000000,grams,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1447700,14259397,28383475,30073532,2140-10-26 17:26:42+00:00,227523,Magnesium Sulfate (Bolus),2140-10-26 20:15:00,49.999999,ml,0,0,0
1447701,13316810,22329391,34349398,2151-08-09 10:01:31+00:00,227523,Magnesium Sulfate (Bolus),2151-08-09 13:41:00,49.999999,ml,0,0,0
1447702,12236081,26716986,34996554,2178-11-30 00:27:14+00:00,227523,Magnesium Sulfate (Bolus),2178-11-30 04:17:00,49.999999,ml,0,0,0
1447703,17539265,23550723,37531469,2182-06-07 11:17:00+00:00,227523,Magnesium Sulfate (Bolus),2182-06-13 18:46:00,49.999999,ml,1,0,0


Since 222011 = Magnesium Sulfate has the highest occurence and UoM is grams, 227523 = Magnesium Sulfate (Bolus) will be converted to grams since its UoM is ml, 227524 = Magnesium Sulfate (OB-GYN) doesn't need to be converted b/c UoM is grams. I think for Magnesium Sulfate 1g is approximately 2ml. Replace all label names by "Magnesium Sulfate". 

In [6]:
input_data.loc[(input_data['itemid']==227523)|(input_data['itemid']==227524), 'label'] = 'Magnesium Sulfate'
input_data.loc[(input_data['itemid']==227523), 'amountuom'] = 'grams'
input_data.loc[(input_data['itemid']==227523), 'amount'] = input_data.amount/2

Creating time series data - time interval is hourly

In [7]:
# Calculating time difference of each record of inputevents from their admission
input_data['timediff'] = (input_data['starttime'] - input_data['intime']).dt.total_seconds()/60/60

In [8]:
# Creating hour variable for time series
input_data['hour'] = np.ceil(input_data['timediff']).astype(int)

In [9]:
# I will call the label column as covariates 
input_data['covariate'] = input_data['label']

In [12]:
# Keeping only the relevant columns
input_data = input_data[['subject_id', 'hadm_id', 'stay_id', 'hour', 'covariate', 'amount']]

Aggregating - I am not sure how each inputevents variables need to be aggregated, it makes sense for me to average them.

In [32]:
input_data2=input_data.groupby(['subject_id','hadm_id','stay_id','hour', 'covariate'], as_index=False)['amount'].mean()

In [34]:
display(input_data2)

Unnamed: 0,subject_id,hadm_id,stay_id,hour,covariate,amount
0,10000980,26913865,39765666,-1,Dextrose 5%,200.000000
1,10000980,26913865,39765666,3,Dextrose 5%,50.000000
2,10000980,26913865,39765666,4,Dextrose 5%,100.000000
3,10000980,26913865,39765666,5,Dextrose 5%,69.477793
4,10000980,26913865,39765666,5,Furosemide (Lasix),40.000003
...,...,...,...,...,...,...
494622,19999987,23865745,36195440,17,Solution,33.789555
494623,19999987,23865745,36195440,20,Solution,6.505005
494624,19999987,23865745,36195440,21,Solution,7.553390
494625,19999987,23865745,36195440,22,Solution,50.000000


In [35]:
pandas_gbq.to_gbq(input_data2,'rnn_dataset.timeseries-input-table', project_id=project_id)

1it [00:17, 17.18s/it]
