In [1]:
# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

# You only need to authenticate once per session.
auth.authenticate_user()

In [2]:
# Import libraries
from datetime import timedelta
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas_gbq

# below imports are used for pretty pandas dataframes and plots
from IPython.display import display, HTML
%matplotlib inline
plt.style.use('ggplot')

# Set up environment variables
project_id = 'genuine-box-350018'
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id
pandas_gbq.context.project = project_id

# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
    return pd.io.gbq.read_gbq(
        query,
        project_id=project_id,
        dialect="standard"
    )

if project_id == 'CHANGE-ME':
  raise ValueError('You must change project_id to your GCP project at the top of this cell.')

  # test it works
df = run_query("""
SELECT *
FROM `physionet-data.mimic_core.patients`
WHERE subject_id = 10012853
""")
assert df.shape[0] >= 1, 'unable to query MIMIC!'
display(df)


Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10012853,F,91,2175,2014 - 2016,NaT


**PRESCRIPTIONS**

All drug names for variables from prescriptions table are known as per https://github.com/USC-Melady/Benchmarking_DL_MIMICIII/blob/master/Codes/mimic3_mvcv/config/99plusf.csv.

In [3]:
drugs = ["ASA325", "ASA81", "BISA10R", "BISA5", "DOCU100", "HEPBASE", "INHRIV", "INSULIN", "KCL40I",
                "MICROK10", "MAG2PM", "METO25", "METO50", "NACLFLUSH", "NS1000", "PANT40", "PANT40I"]

These drug codes don't exist in prescriptions table... I will just use their names as per Supplementary Info.

In [5]:
drugs = ['Aspirin', 'Bisacodyl', 'Docusate Sodium', 'D5W', 'Humulin-R Insulin', 'Potassium Chloride', 'Magnesium Sulfate', 'Metoprolol Tartrate', 
        'Sodium Chloride 0.9% Flush', 'Pantoprazole']


In [6]:
drug = run_query(f'''
SELECT fco.subject_id, fco.hadm_id, fco.stay_id, fco.intime
, dr.drug, dr.starttime, dr.dose_val_rx, dr.dose_unit_rx
, CASE WHEN DATE_DIFF(dr.starttime,  CAST(fco.intime AS date), hour) > 48 THEN 1 ELSE 0 END AS exclude_after48h
, CASE WHEN dr.starttime IS NULL THEN 1 ELSE 0 END AS exclude_null_time
, CASE WHEN dr.dose_val_rx IS NULL THEN 1 ELSE 0 END AS exclude_null_dose
, CASE WHEN dr.dose_val_rx = '0' THEN 1 ELSE 0 END AS exclude_zero_dose
, CASE WHEN dr.dose_unit_rx IS NULL THEN 1 ELSE 0 END AS exclude_null_unit
FROM `physionet-data.mimic_hosp.prescriptions` dr
INNER JOIN `genuine-box-350018.rnn_dataset.final-cohort-table` fco
ON fco.hadm_id = dr.hadm_id
WHERE LOWER(dr.drug) IN ('aspirin', 'bisacodyl', 'docusate sodium', 'd5w', 'humulin-r insulin', 'potassium chloride', 'magnesium sulfate', 'metoprolol tartrate', 
        'sodium chloride 0.9% flush', 'pantoprazole')
''')
display(drug)

  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes


Unnamed: 0,subject_id,hadm_id,stay_id,intime,drug,starttime,dose_val_rx,dose_unit_rx,exclude_after48h,exclude_null_time,exclude_null_dose,exclude_zero_dose,exclude_null_unit
0,14673382,22246381,33599448,2140-05-19 15:18:00+00:00,Potassium Chloride,2140-05-19 21:00:00,20-40,mEq,0,0,0,0,0
1,12662773,20446083,32238735,2145-05-03 20:54:38+00:00,Metoprolol Tartrate,2145-05-22 13:00:00,7.5,mg,1,0,0,0,0
2,11579351,20838061,39633000,2120-05-18 23:07:00+00:00,Potassium Chloride,2120-05-21 11:00:00,20-40,mEq,1,0,0,0,0
3,19134199,29400785,39949005,2187-07-23 12:51:55+00:00,Potassium Chloride,2187-07-23 22:00:00,20-40,mEq,0,0,0,0,0
4,18562803,23970176,34179883,2144-07-10 14:32:51+00:00,D5W,2144-07-18 09:00:00,3000,mL,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
274417,10008454,20291550,31959184,2110-11-30 17:11:36+00:00,Metoprolol Tartrate,2110-12-03 22:00:00,5-10,mg,1,0,0,0,0
274418,12262068,21108691,30299949,2166-10-19 06:20:03+00:00,Metoprolol Tartrate,2166-10-20 06:00:00,5-10,mg,0,0,0,0,0
274419,17944362,29450847,36781177,2117-07-26 11:49:43+00:00,Metoprolol Tartrate,2117-07-29 21:00:00,5-10,mg,1,0,0,0,0
274420,13398773,22178217,35428141,2189-10-27 09:55:00+00:00,Bisacodyl,2189-10-29 10:00:00,5-10,mg,1,0,0,0,0


In [7]:
drug.loc[drug['drug']=='aspirin','drug']='Aspirin'

In [8]:
pandas_gbq.to_gbq(drug,'rnn_dataset.prescription-table', project_id=project_id, if_exists='replace')

1it [00:15, 15.94s/it]


In [9]:
drug = run_query(f'''
SELECT *
FROM `genuine-box-350018.rnn_dataset.prescription-table`
''')
display(drug)

  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes


Unnamed: 0,subject_id,hadm_id,stay_id,intime,drug,starttime,dose_val_rx,dose_unit_rx,exclude_after48h,exclude_null_time,exclude_null_dose,exclude_zero_dose,exclude_null_unit
0,10386303,23372208,30920322,2181-01-29 03:35:00+00:00,Potassium Chloride,2181-01-29 05:00:00+00:00,,,0,0,1,0,1
1,19611909,26061152,37946973,2160-11-20 18:54:00+00:00,Potassium Chloride,2160-11-21 04:00:00+00:00,,,0,0,1,0,1
2,19611909,26061152,37946973,2160-11-20 18:54:00+00:00,Magnesium Sulfate,2160-11-21 04:00:00+00:00,,,0,0,1,0,1
3,11075865,28595464,37475623,2186-06-05 22:16:56+00:00,Potassium Chloride,2186-06-06 00:00:00+00:00,,,0,0,1,0,1
4,10532326,23582907,34172423,2162-09-17 22:44:00+00:00,Metoprolol Tartrate,2162-09-18 01:00:00+00:00,,,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
274417,16435470,22252925,37935070,2187-07-26 07:01:27+00:00,Potassium Chloride,2187-07-27 15:00:00+00:00,20,mEq,0,0,0,0,0
274418,11028696,24857884,31469055,2177-07-13 11:23:51+00:00,Potassium Chloride,2177-07-15 11:00:00+00:00,20,mEq,1,0,0,0,0
274419,15170887,29502188,34726446,2139-12-13 21:07:08+00:00,Potassium Chloride,2139-12-03 17:00:00+00:00,30,mmol,0,0,0,0,0
274420,11010228,24850230,36283577,2124-01-23 21:48:04+00:00,Potassium Chloride,2124-01-25 11:00:00+00:00,15,mmol,1,0,0,0,0


In [10]:
print(drug['exclude_after48h'].sum())

135524


In [11]:
print(drug['exclude_null_time'].sum())

116


In [12]:
print(drug['exclude_null_dose'].sum())

84


In [13]:
print(drug['exclude_zero_dose'].sum())

10


In [14]:
print(drug['exclude_null_unit'].sum())

84


In [15]:
# subsetting the dataset and removing excluded records 
drug=drug[(drug.exclude_after48h==0)&(drug.exclude_null_time==0)&(drug.exclude_null_dose==0)&(drug.exclude_null_unit==0)&(drug.exclude_zero_dose==0)][['subject_id','hadm_id', 'stay_id', 'intime', 'drug', 'starttime', 'dose_val_rx', 'dose_unit_rx']]

In [16]:
display(drug)

Unnamed: 0,subject_id,hadm_id,stay_id,intime,drug,starttime,dose_val_rx,dose_unit_rx
84,18068553,21442729,31297449,2162-06-05 07:28:00+00:00,Magnesium Sulfate,2162-06-05 13:00:00+00:00,2,gm
85,18068553,21442729,31297449,2162-06-05 07:28:00+00:00,Magnesium Sulfate,2162-06-05 13:00:00+00:00,2-4,gm
86,18068553,21442729,31297449,2162-06-05 07:28:00+00:00,Magnesium Sulfate,2162-06-05 13:00:00+00:00,4,gm
88,16815463,27022067,30022281,2123-04-04 15:28:00+00:00,Magnesium Sulfate,2123-04-05 01:00:00+00:00,2,gm
90,16815463,27022067,30022281,2123-04-04 15:28:00+00:00,Magnesium Sulfate,2123-04-05 01:00:00+00:00,2-4,gm
...,...,...,...,...,...,...,...,...
274415,16435470,22252925,37935070,2187-07-26 07:01:27+00:00,Potassium Chloride,2187-07-26 22:00:00+00:00,40-60,mEq
274416,16435470,22252925,37935070,2187-07-26 07:01:27+00:00,Potassium Chloride,2187-07-26 11:00:00+00:00,20-40,mEq
274417,16435470,22252925,37935070,2187-07-26 07:01:27+00:00,Potassium Chloride,2187-07-27 15:00:00+00:00,20,mEq
274419,15170887,29502188,34726446,2139-12-13 21:07:08+00:00,Potassium Chloride,2139-12-03 17:00:00+00:00,30,mmol


Creating time series data

In [17]:
# Calculating time difference of each record of prescriptions from their admission
drug['timediff'] = (drug['starttime'] - drug['intime']).dt.total_seconds()/60/60

In [18]:
# Creating hour variable for time series
drug['hour'] = np.ceil(drug['timediff']).astype(int)

In [19]:
# I will call the label column as covariates 
drug['covariate'] = drug['drug']

In [20]:
drug['dose_val_rx']=drug['dose_val_rx'].astype(str)

In [21]:
drug['dose'] = [pd.to_numeric(x.split("-")[1]) if '-' in x else pd.to_numeric(x) for x in drug['dose_val_rx']]

In [22]:
# Keeping only the relevant columns
drug = drug[['subject_id', 'hadm_id', 'stay_id', 'hour', 'covariate', 'dose']]

Aggregating by sum

In [23]:
drug2=drug.groupby(['subject_id','hadm_id','stay_id','hour', 'covariate'], as_index=False)['dose'].sum()

In [24]:
display(drug2)

Unnamed: 0,subject_id,hadm_id,stay_id,hour,covariate,dose
0,10000980,26913865,39765666,3,Aspirin,81.0
1,10000980,26913865,39765666,12,Metoprolol Tartrate,12.5
2,10001217,24597018,37067082,-42,Bisacodyl,20.0
3,10001217,24597018,37067082,-42,Docusate Sodium,100.0
4,10001725,25563031,31205490,0,Docusate Sodium,100.0
...,...,...,...,...,...,...
99907,19999840,21033226,38978960,-47,Aspirin,381.0
99908,19999840,21033226,38978960,-3,Metoprolol Tartrate,5.0
99909,19999840,21033226,38978960,20,Potassium Chloride,120.0
99910,19999840,21033226,38978960,30,Bisacodyl,20.0


In [25]:
pandas_gbq.to_gbq(drug2,'rnn_dataset.timeseries-pres-table', project_id=project_id, if_exists='replace')

1it [00:04,  4.46s/it]
