In [1]:
# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

# You only need to authenticate once per session.
auth.authenticate_user()

In [2]:
# Import libraries
from datetime import timedelta
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas_gbq

# below imports are used for pretty pandas dataframes and plots
from IPython.display import display, HTML
%matplotlib inline
plt.style.use('ggplot')

# Set up environment variables
project_id = 'genuine-box-350018'
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id
pandas_gbq.context.project = project_id

# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
    return pd.io.gbq.read_gbq(
        query,
        project_id=project_id,
        dialect="standard"
    )

if project_id == 'CHANGE-ME':
  raise ValueError('You must change project_id to your GCP project at the top of this cell.')

  # test it works
df = run_query("""
SELECT *
FROM `physionet-data.mimic_core.patients`
WHERE subject_id = 10012853
""")
assert df.shape[0] >= 1, 'unable to query MIMIC!'
display(df)


Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10012853,F,91,2175,2014 - 2016,NaT


**PRESCRIPTIONS**

All drug names for variables from prescriptions table are known as per https://github.com/USC-Melady/Benchmarking_DL_MIMICIII/blob/master/Codes/mimic3_mvcv/config/99plusf.csv.

In [6]:
drugs = ["ASA325", "ASA81", "BISA10R", "BISA5", "DOCU100", "HEPBASE", "INHRIV", "INSULIN", "KCL40I",
                "MICROK10", "MAG2PM", "METO25", "METO50", "NACLFLUSH", "NS1000", "PANT40", "PANT40I"]
drugs = ', '.join(str(x) for x in drugs)

These drug codes don't exist in prescriptions table... I will just use their names as per Supplementary Info.

In [11]:
drugs = ['Aspirin', 'Bisacodyl', 'Docusate Sodium', 'D5W', 'Humulin-R Insulin', 'Potassium Chloride', 'Magnesium Sulfate', 'Metoprolol Tartrate', 
        'Sodium Chloride 0.9% Flush', 'Pantoprazole']
drugs = ', '.join(str(x) for x in drugs)

In [18]:
drug = run_query(f'''
SELECT fco.subject_id, fco.hadm_id, fco.stay_id, fco.intime
, dr.drug, dr.starttime, dr.dose_val_rx, dr.dose_unit_rx
, CASE WHEN DATE_DIFF(dr.starttime,  CAST(fco.intime AS date), hour) > 48 THEN 1 ELSE 0 END AS exclude_after48h
, CASE WHEN dr.starttime IS NULL THEN 1 ELSE 0 END AS exclude_null_time
, CASE WHEN dr.dose_val_rx IS NULL THEN 1 ELSE 0 END AS exclude_null_dose
, CASE WHEN dr.dose_val_rx = '0' THEN 1 ELSE 0 END AS exclude_zero_dose
, CASE WHEN dr.dose_unit_rx IS NULL THEN 1 ELSE 0 END AS exclude_null_unit
FROM `physionet-data.mimic_hosp.prescriptions` dr
INNER JOIN `genuine-box-350018.rnn_dataset.final-cohort-table` fco
ON fco.hadm_id = dr.hadm_id
WHERE LOWER(dr.drug) IN ('aspirin', 'bisacodyl', 'docusate sodium', 'd5w', 'humulin-r insulin', 'potassium chloride', 'magnesium sulfate', 'metoprolol tartrate', 
        'sodium chloride 0.9% flush', 'pantoprazole')
''')
display(drug)

  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes


Unnamed: 0,subject_id,hadm_id,stay_id,intime,drug,starttime,dose_val_rx,dose_unit_rx,exclude_after48h,exclude_null_time,exclude_null_dose,exclude_zero_dose,exclude_null_unit
0,10131771,29307511,32851340,2117-08-29 02:53:58+00:00,Metoprolol Tartrate,2117-08-28 23:00:00,2.5 - 5,mg,0,0,0,0,0
1,12790812,29404160,37694225,2178-09-27 20:33:00+00:00,Potassium Chloride,2178-09-29 18:00:00,40-60,mEq,1,0,0,0,0
2,15612422,20131988,38671826,2147-05-04 10:08:03+00:00,Potassium Chloride,2147-05-04 20:00:00,40-60,mEq,0,0,0,0,0
3,16622813,26382064,32922138,2180-11-09 20:46:31+00:00,Potassium Chloride,2180-11-12 00:00:00,40-60,mEq,1,0,0,0,0
4,12875569,21017384,35371396,2184-04-13 21:06:05+00:00,Potassium Chloride,2184-04-14 00:00:00,20-40,mEq,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
274417,13135946,27380004,30690102,2142-01-06 22:00:03+00:00,Bisacodyl,2142-01-10 13:00:00,5-10,mg,1,0,0,0,0
274418,18231053,20661797,35802031,2156-07-10 10:08:54+00:00,Bisacodyl,2156-07-14 09:00:00,5-10,mg,1,0,0,0,0
274419,13104348,27693989,37158575,2186-09-11 18:36:15+00:00,Bisacodyl,2186-09-14 13:00:00,5-10,mg,1,0,0,0,0
274420,19770977,22869142,30976967,2149-12-21 11:52:23+00:00,Bisacodyl,2149-12-01 15:00:00,5-10,mg,0,0,0,0,0


In [19]:
pandas_gbq.to_gbq(drug,'rnn_dataset.prescription-table', project_id=project_id, if_exists='replace')

1it [00:22, 22.99s/it]


In [33]:
drug = run_query(f'''
SELECT *
FROM `genuine-box-350018.rnn_dataset.prescription-table`
''')

  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes


In [12]:
print(drug['exclude_after48h'].sum())

135524


In [13]:
print(drug['exclude_null_time'].sum())

116


In [14]:
print(drug['exclude_null_dose'].sum())

84


In [15]:
print(drug['exclude_zero_dose'].sum())

10


In [16]:
print(drug['exclude_null_unit'].sum())

84


In [34]:
# subsetting the dataset and removing excluded records 
drug=drug[(drug.exclude_after48h==0)&(drug.exclude_null_time==0)&(drug.exclude_null_dose==0)&(drug.exclude_null_unit==0)&(drug.exclude_zero_dose==0)][['subject_id','hadm_id', 'stay_id', 'intime', 'drug', 'starttime', 'dose_val_rx', 'dose_unit_rx']]

In [35]:
display(drug)

Unnamed: 0,subject_id,hadm_id,stay_id,intime,drug,starttime,dose_val_rx,dose_unit_rx
84,18068553,21442729,31297449,2162-06-05 07:28:00+00:00,Magnesium Sulfate,2162-06-05 13:00:00+00:00,4,gm
85,18068553,21442729,31297449,2162-06-05 07:28:00+00:00,Magnesium Sulfate,2162-06-05 13:00:00+00:00,2-4,gm
86,18068553,21442729,31297449,2162-06-05 07:28:00+00:00,Magnesium Sulfate,2162-06-05 13:00:00+00:00,2,gm
88,16815463,27022067,30022281,2123-04-04 15:28:00+00:00,Magnesium Sulfate,2123-04-05 01:00:00+00:00,4,gm
90,16815463,27022067,30022281,2123-04-04 15:28:00+00:00,Magnesium Sulfate,2123-04-05 01:00:00+00:00,2-4,gm
...,...,...,...,...,...,...,...,...
274415,16435470,22252925,37935070,2187-07-26 07:01:27+00:00,Potassium Chloride,2187-07-26 11:00:00+00:00,40,mEq
274416,16435470,22252925,37935070,2187-07-26 07:01:27+00:00,Potassium Chloride,2187-07-26 22:00:00+00:00,60,mEq
274417,16435470,22252925,37935070,2187-07-26 07:01:27+00:00,Potassium Chloride,2187-07-27 15:00:00+00:00,60,mEq
274419,15170887,29502188,34726446,2139-12-13 21:07:08+00:00,Potassium Chloride,2139-12-03 17:00:00+00:00,30,mmol


Creating time series data

In [36]:
# Calculating time difference of each record of prescriptions from their admission
drug['timediff'] = (drug['starttime'] - drug['intime']).dt.total_seconds()/60/60

In [37]:
# Creating hour variable for time series
drug['hour'] = np.ceil(drug['timediff']).astype(int)

In [38]:
# I will call the label column as covariates 
drug['covariate'] = drug['drug']

In [39]:
drug['dose_val_rx']=drug['dose_val_rx'].astype(str)

In [40]:
drug['dose'] = [pd.to_numeric(x.split("-")[1]) if '-' in x else pd.to_numeric(x) for x in drug['dose_val_rx']]

In [41]:
# Keeping only the relevant columns
drug = drug[['subject_id', 'hadm_id', 'stay_id', 'hour', 'covariate', 'dose']]

Aggregating by sum

In [42]:
drug2=drug.groupby(['subject_id','hadm_id','stay_id','hour', 'covariate'], as_index=False)['dose'].sum()

In [43]:
pandas_gbq.to_gbq(drug2,'rnn_dataset.timeseries-pres-table', project_id=project_id, if_exists='replace')

1it [00:05,  5.02s/it]
