In [None]:
# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

# You only need to authenticate once per session.
# Runs the Colab user-authentication flow; the BigQuery calls later in this
# notebook presumably rely on the credentials it sets up (TODO confirm).
auth.authenticate_user()

In [None]:
# Import libraries
from datetime import timedelta
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas_gbq

# below imports are used for pretty pandas dataframes and plots
from IPython.display import display, HTML
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use the 'ggplot' style sheet for the plots drawn after this point.
plt.style.use('ggplot')

# GCP project used for every BigQuery call in this notebook.
project_id = 'genuine-box-350018'

# Make the project visible to pandas-gbq and, through the
# GOOGLE_CLOUD_PROJECT environment variable, to the rest of the process.
pandas_gbq.context.project = project_id
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id

# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
    """Run a standard-SQL BigQuery query and return the result as a DataFrame.

    Args:
        query: SQL text to execute.
        project_id: GCP project the query is run under; defaults to the
            notebook-level ``project_id``.

    Returns:
        The query result as returned by ``pandas_gbq.read_gbq``.
    """
    # Call pandas-gbq directly: the pandas wrapper (pd.read_gbq / pd.io.gbq)
    # is deprecated in favour of pandas_gbq.read_gbq.
    return pandas_gbq.read_gbq(
        query,
        project_id=project_id,
        dialect="standard",
    )

# Stop early if the placeholder project id was never replaced.
if project_id == 'CHANGE-ME':
    raise ValueError('You must change project_id to your GCP project at the top of this cell.')

# Smoke test: fetch one patient row to confirm that MIMIC can be queried.
df = run_query("""
SELECT *
FROM `physionet-data.mimic_core.patients`
WHERE subject_id = 10012853
""")
assert len(df) >= 1, 'unable to query MIMIC!'
display(df)


Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10012853,F,91,2175,2014 - 2016,NaT


**LABEVENTS**

Known itemids for variables from labevents table as per https://github.com/USC-Melady/Benchmarking_DL_MIMICIII/blob/master/Codes/mimic3_mvcv/config/99plusf.csv.

In [None]:
# labevents itemids that are already known for the covariates of interest.
lab_known_itemids = [
    51221, 51265, 51222, 51249, 51248, 51250,
    51279, 51277, 50902, 50868, 50912, 50931,
    50960, 50893, 50970, 51237, 51274, 51275,
    51244, 51254, 51256, 51146, 51200, 50820,
    50802, 50804, 50818, 51491, 51498, 50813,
    50861, 50878, 50863, 50862,
]

Finding itemids for the variables listed in the PLAN (taken from the Supplementary Info, https://github.com/suvdzul/Reproduce-RNN-paper/blob/main/PLAN.md) that do not yet have a known itemid, by searching the lab item dictionary (`d_labitems`) in mimic_hosp.

In [None]:
# Search the lab item dictionary for labels that mention any covariate which
# still lacks an itemid. (Plain string: the query has no placeholders, so the
# f-prefix was unnecessary.)
lab_unknown = run_query('''
SELECT itemid, label
FROM `physionet-data.mimic_hosp.d_labitems` 
WHERE LOWER(label) LIKE '%white blood cells%'
OR LOWER(label) LIKE '%potassium%' 
OR LOWER(label) LIKE '%sodium%' 
OR LOWER(label) LIKE '%bicarbonate%' 
OR LOWER(label) LIKE '%urea nitrogen%' 
OR LOWER(label) LIKE '%bilirubin%' 
OR LOWER(label) LIKE '%po2%'
''')
# Show every matching row, but lift the row limit for this display only.
# Setting display.max_rows globally would also un-truncate the displays of
# the much larger frames later in the notebook.
with pd.option_context('display.max_rows', None):
    display(lab_unknown)

Unnamed: 0,itemid,label
0,50803,"Calculated Bicarbonate, Whole Blood"
1,50821,pO2
2,50822,"Potassium, Whole Blood"
3,52452,"Potassium, Whole Blood"
4,50824,"Sodium, Whole Blood"
5,52455,"Sodium, Whole Blood"
6,52039,Calculated Bicarbonate
7,52042,pO2
8,52046,"Potassium, Urine"
9,52047,"Sodium, Urine"


In [None]:
# Itemids picked as relevant for the covariates that had no known itemid.
lab_unknown_chosen = [
    52042, 50821, 50885, 51006, 52647, 50882, 50983,
    52623, 50833, 50971, 52610, 51755, 51756, 51301,
]
lab_str_unknown = ','.join(map(str, lab_unknown_chosen))

# Look the chosen itemids up again to check the labels they carry.
unknown_lab_chosen = run_query(f'''
SELECT itemid, label
FROM `physionet-data.mimic_hosp.d_labitems`
WHERE itemid IN ({lab_str_unknown})
''')
display(unknown_lab_chosen)

Unnamed: 0,itemid,label
0,50821,pO2
1,52042,pO2
2,50833,Potassium
3,50882,Bicarbonate
4,50885,"Bilirubin, Total"
5,50971,Potassium
6,52610,Potassium
7,50983,Sodium
8,52623,Sodium
9,51006,Urea Nitrogen


In [None]:
# Comma-separated string of every itemid (known ones first, then the chosen
# unknown ones), ready to be placed inside a SQL IN (...) clause.
lab_itemids = ','.join(
    str(itemid) for itemid in [*lab_known_itemids, *lab_unknown_chosen]
)

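Compiling data from labevents while flagging records for the following exclusions/inclusions (the flagged records are removed in a later step):
- Exclude N/A charttime or value/valuenum
- Exclude N/A valueuom
- Exclude records where both value and valuenum are 0
- Only include data for first 48h after admission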

In [None]:
# Pull every lab event with one of the selected itemids for the admissions in
# the cohort table, and attach 0/1 exclusion flags (rows are only flagged
# here; they are filtered out in a later cell).
#
# exclude_after48h compares charttime with the full intime. The earlier
# CAST(intime AS date) truncated intime to midnight, so the window was
# measured from the start of the admission day and labs inside the real
# 48h window could be flagged as excluded.
# NOTE(review): only the upper bound is checked, so labs charted before
# intime (negative offsets) are not flagged - confirm this is intended.
lab_data = run_query(f'''
SELECT fco.subject_id, fco.hadm_id, fco.stay_id, fco.intime
, lab.itemid, it.label, lab.charttime, lab.value, lab.valuenum, lab.valueuom
, CASE WHEN DATETIME_DIFF(lab.charttime, CAST(fco.intime AS DATETIME), SECOND) > 48 * 3600 THEN 1 ELSE 0 END AS exclude_after48h
, CASE WHEN lab.charttime IS NULL THEN 1 ELSE 0 END AS exclude_null_time
, CASE WHEN (lab.value IS NULL AND lab.valuenum IS NULL) THEN 1 ELSE 0 END AS exclude_null_value
, CASE WHEN (lab.value='0' AND lab.valuenum=0) THEN 1 ELSE 0 END AS exclude_zero_none_value
, CASE WHEN lab.valueuom is NULL THEN 1 ELSE 0 END AS exclude_null_valueuom
FROM `physionet-data.mimic_hosp.labevents` lab
INNER JOIN `genuine-box-350018.rnn_dataset.final-cohort-table` fco
ON fco.hadm_id = lab.hadm_id
LEFT JOIN `physionet-data.mimic_hosp.d_labitems` it
ON it.itemid = lab.itemid
WHERE lab.itemid IN ({lab_itemids}) 
''')
display(lab_data.head())

  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_cl

Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,charttime,value,valuenum,valueuom,exclude_after48h,exclude_null_time,exclude_null_value,exclude_zero_none_value,exclude_null_valueuom
0,11067215,23574374,33085406,2168-04-29 15:14:33+00:00,51237,INR(PT),2168-04-22 05:45:00,1.9,1.9,,0,0,0,0,1
1,11226194,24922551,36319779,2175-12-27 02:13:39+00:00,51237,INR(PT),2175-12-29 05:17:00,1.1,1.1,,1,0,0,0,1
2,11539251,21399974,38365376,2179-04-22 14:13:14+00:00,51237,INR(PT),2179-04-24 02:46:00,1.3,1.3,,1,0,0,0,1
3,11955806,29300748,34127493,2170-12-23 17:17:01+00:00,51237,INR(PT),2170-12-23 14:07:00,1.1,1.1,,0,0,0,0,1
4,13241710,22990223,37121997,2153-08-14 19:25:41+00:00,51237,INR(PT),2153-08-14 18:11:00,1.1,1.1,,0,0,0,0,1


In [None]:
# Write the flagged lab rows to BigQuery, replacing the table if it exists.
pandas_gbq.to_gbq(
    lab_data,
    destination_table='rnn_dataset.lab-table',
    project_id=project_id,
    if_exists='replace',
)

1it [05:01, 301.22s/it]


In [None]:
# Inspect all lab rows of one example subject.
example_subject_id = 14848066
lab_data.loc[lab_data['subject_id'].eq(example_subject_id)]

Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,charttime,value,valuenum,valueuom,exclude_after48h,exclude_null_time,exclude_null_value,exclude_zero_none_value,exclude_null_valueuom
5195,14848066,20683707,35293872,2120-08-07 22:45:54+00:00,51237,INR(PT),2120-08-24 04:07:00,,,,1,0,1,0,1
5706,14848066,20683707,35293872,2120-08-07 22:45:54+00:00,51498,Specific Gravity,2120-08-28 14:37:00,1.010,1.01,,1,0,0,0,0
6901,14848066,20683707,35293872,2120-08-07 22:45:54+00:00,51200,Eosinophils,2120-08-22 14:37:00,0,0.00,%,1,0,0,1,0
6956,14848066,20683707,35293872,2120-08-07 22:45:54+00:00,51146,Basophils,2120-08-15 00:57:00,0,0.00,%,1,0,0,1,0
9520,14848066,20683707,35293872,2120-08-07 22:45:54+00:00,51254,Monocytes,2120-08-25 03:54:00,0,0.00,%,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6309611,14848066,20683707,35293872,2120-08-07 22:45:54+00:00,50821,pO2,2120-08-10 04:55:00,75,75.00,mm Hg,1,0,0,0,0
6314298,14848066,20683707,35293872,2120-08-07 22:45:54+00:00,50821,pO2,2120-08-08 00:14:00,45,45.00,mm Hg,0,0,0,0,0
6316442,14848066,20683707,35293872,2120-08-07 22:45:54+00:00,50818,pCO2,2120-08-14 02:42:00,39,39.00,mm Hg,1,0,0,0,0
6321759,14848066,20683707,35293872,2120-08-07 22:45:54+00:00,50820,pH,2120-08-24 09:46:00,7.41,7.41,units,1,0,0,0,0


Check how many records are excluded

In [None]:
# Number of rows flagged as falling outside the 48h window.
n_excluded_after48h = lab_data['exclude_after48h'].sum()
print(n_excluded_after48h)

4152560


In [None]:
# Number of rows flagged for a missing charttime.
n_excluded_null_time = lab_data['exclude_null_time'].sum()
print(n_excluded_null_time)

0


In [None]:
# Number of rows flagged because both value and valuenum are missing.
n_excluded_null_value = lab_data['exclude_null_value'].sum()
print(n_excluded_null_value)

8374


In [None]:
# Number of rows flagged for a missing unit of measurement.
n_excluded_null_valueuom = lab_data['exclude_null_valueuom'].sum()
print(n_excluded_null_valueuom)

136832


In [None]:
# Number of rows flagged because value is '0' and valuenum is 0.
n_excluded_zero_value = lab_data['exclude_zero_none_value'].sum()
print(n_excluded_zero_value)

54752


In [None]:
# subsetting the dataset and removing excluded records
# A row is kept only when every exclusion flag is 0.
exclusion_flags = [
    'exclude_after48h',
    'exclude_null_time',
    'exclude_null_value',
    'exclude_null_valueuom',
    'exclude_zero_none_value',
]
kept_columns = [
    'subject_id', 'hadm_id', 'stay_id', 'intime', 'itemid',
    'label', 'charttime', 'value', 'valuenum', 'valueuom',
]
is_kept = lab_data[exclusion_flags].eq(0).all(axis=1)
# NOTE(review): rows without a unit are dropped as well; in the sample output
# of the query cell the INR(PT) rows have no unit - confirm this is intended.
# .copy() makes the result an independent frame, so the columns added to it in
# later cells do not trigger SettingWithCopyWarning.
lab_data = lab_data.loc[is_kept, kept_columns].copy()

According to PLAN, 5 covariates from labevents have multiple itemids:
* White Blood Cells = [51755, 51756, 51301]
* Potassium = [50833, 50971, 52610]
* Sodium = [50983, 52623]
* Urea Nitrogen = [51006, 52647]
* PO2 = [52042, 50821]


In [None]:
# For variables with duplicate codes - count occurrences to see which label/measurement to use
lab_dup_itemids = [
    51755, 51756, 51301,   # White Blood Cells
    50833, 50971, 52610,   # Potassium
    50983, 52623,          # Sodium
    51006, 52647,          # Urea Nitrogen
    52042, 50821,          # PO2
]
# Count the rows per itemid once, then report each candidate (0 when absent).
rows_per_itemid = lab_data['itemid'].value_counts()
for itemid in lab_dup_itemids:
    print(itemid)
    print(rows_per_itemid.get(itemid, 0))


51755
0
51756
0
51301
70856
50833
0
50971
79636
52610
0
50983
78029
52623
0
51006
74412
52647
0
52042
0
50821
65481


Every covariate with multiple itemids has 100% of its observations under a single itemid, so there are no duplicates and no conversion or replacement is needed.

Creating time series data - time interval is hourly

In [None]:
# Removing the timezone from intime (values are kept in UTC) so that it can be
# subtracted from charttime in the next step.
# utc=True also accepts an already timezone-naive column (read as UTC), so
# re-running this cell does not raise on the second pass.
lab_data['intime'] = pd.to_datetime(lab_data['intime'], utc=True).dt.tz_convert(None)

In [None]:
# Hours elapsed between intime and each lab record's charttime
# (negative when the lab was charted before intime).
elapsed = lab_data['charttime'].sub(lab_data['intime'])
lab_data['timediff'] = elapsed.dt.total_seconds() / 60 / 60

In [None]:
# Hour bucket for the time series: elapsed hours rounded up to a whole hour.
hours_rounded_up = np.ceil(lab_data['timediff'])
lab_data['hour'] = hours_rounded_up.astype(int)

In [None]:
# The lab label doubles as the covariate name from here on.
lab_data = lab_data.assign(covariate=lab_data['label'])

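There are still records where `valuenum` is 0 and the `value` string is itself a zero (e.g. `0.0`); the cell below lists them, together with any record whose `value` is missing.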

In [None]:
# Rows whose value is missing or whose numeric value is exactly 0.
has_missing_value = lab_data['value'].isna()
has_zero_valuenum = lab_data['valuenum'].eq(0)
lab_data[has_missing_value | has_zero_valuenum]

Unnamed: 0,subject_id,hadm_id,stay_id,intime,itemid,label,charttime,value,valuenum,valueuom,timediff,hour,covariate
261020,19390654,25268824,36561397,2111-04-24 20:03:45,51146,Basophils,2111-04-25 04:05:00,0.0,0.0,%,8.020833,9,Basophils
950518,14499007,24975279,32406577,2180-10-30 10:55:37,50912,Creatinine,2180-10-27 15:15:00,0.0,0.0,mg/dL,-67.676944,-67,Creatinine
1476306,10948115,28748915,32686693,2148-03-13 05:10:00,50970,Phosphate,2148-03-13 14:03:00,0.0,0.0,mg/dL,8.883333,9,Phosphate
1697504,14199761,29993089,30184363,2135-07-01 05:22:33,50885,"Bilirubin, Total",2135-07-01 14:02:00,0.0,0.0,mg/dL,8.6575,9,"Bilirubin, Total"
1698192,11378149,29235017,31597960,2125-03-31 09:36:54,50885,"Bilirubin, Total",2125-03-28 03:10:00,0.0,0.0,mg/dL,-78.448333,-78,"Bilirubin, Total"
1950045,17789287,25119406,38913968,2124-09-22 22:49:46,50885,"Bilirubin, Total",2124-09-23 04:46:00,0.0,0.0,mg/dL,5.937222,6,"Bilirubin, Total"
2034326,11666315,23911356,34279325,2171-10-30 08:35:00,51146,Basophils,2171-10-31 02:50:00,0.0,0.0,%,18.25,19,Basophils
2364266,14423216,25240724,35715575,2149-01-01 16:50:07,51301,White Blood Cells,2148-12-25 09:12:00,0.0,0.0,K/uL,-175.635278,-175,White Blood Cells
2539777,16470848,27313327,39317251,2195-09-11 19:49:09,51146,Basophils,2195-09-11 20:06:00,0.0,0.0,%,0.280833,1,Basophils
2540766,11666315,23911356,34279325,2171-10-30 08:35:00,51200,Eosinophils,2171-10-31 02:50:00,0.0,0.0,%,18.25,19,Eosinophils


Remove these and we should be OK to use valuenum only...

In [None]:
# Excluding these additional 0 values and keeping relevant columns
timeseries_columns = ['subject_id', 'hadm_id', 'stay_id', 'hour', 'covariate', 'valuenum']
# A row is unusable when its value is missing or its numeric value is 0.
is_unusable = lab_data['value'].isna() | lab_data['valuenum'].eq(0)
lab_data2 = lab_data.loc[~is_unusable, timeseries_columns]

In [None]:
# Show the cleaned frame: ids, hour bucket, covariate and valuenum.
display(lab_data2)

Unnamed: 0,subject_id,hadm_id,stay_id,hour,covariate,valuenum
5538,14353753,28959343,34466833,-41,Specific Gravity,1.023
5541,13210618,28886630,36754056,-26,Specific Gravity,1.009
5543,19585396,27682039,36167665,-43,Specific Gravity,1.015
5544,11374776,23440405,34927785,3,Specific Gravity,1.026
5552,12620262,20350913,33258240,14,Specific Gravity,1.026
...,...,...,...,...,...,...
6330374,18943220,27827180,36039755,1,Lactate,2.200
6330375,19654837,24963930,31165420,-96,Lactate,1.200
6330376,11917664,27344800,37501394,13,Lactate,7.700
6330379,10706411,22795641,31395458,16,Lactate,5.300


Aggregating - I am not sure how each labevents variable needs to be aggregated. Averaging would make sense to me; the code below keeps the minimum and maximum of each covariate within each hour instead.

In [None]:
# Collapse repeated measurements of the same covariate within one hour of a
# stay into their minimum and maximum.
# The aggregations are given by name: passing the builtins min/max to .agg is
# deprecated in recent pandas. The resulting column labels are unchanged.
lab_data3 = (
    lab_data2
    .groupby(['subject_id', 'hadm_id', 'stay_id', 'hour', 'covariate'], as_index=False)
    .agg({'valuenum': ['min', 'max']})
)

In [None]:
# Show the aggregated frame; its columns are still two-level at this point.
display(lab_data3)

Unnamed: 0_level_0,subject_id,hadm_id,stay_id,hour,covariate,valuenum,valuenum
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,min,max
0,10000980,26913865,39765666,12,PT,14.50,14.50
1,10000980,26913865,39765666,12,PTT,69.60,69.60
2,10000980,26913865,39765666,22,Anion Gap,17.00,17.00
3,10000980,26913865,39765666,22,Bicarbonate,23.00,23.00
4,10000980,26913865,39765666,22,"Calcium, Total",9.20,9.20
...,...,...,...,...,...,...,...
2072410,19999987,23865745,36195440,7,Base Excess,1.00,1.00
2072411,19999987,23865745,36195440,7,Calculated Total CO2,28.00,28.00
2072412,19999987,23865745,36195440,7,pCO2,44.00,44.00
2072413,19999987,23865745,36195440,7,pH,7.39,7.39


In [None]:
# Flatten the two-level columns produced by the aggregation into single names
# such as 'valuenum_min'. Empty levels are skipped, so the grouping keys keep
# their plain names instead of gaining a trailing underscore.
# The MultiIndex check makes the cell safe to re-run: joining the characters
# of already-flat names would mangle them.
if isinstance(lab_data3.columns, pd.MultiIndex):
    lab_data3.columns = [
        '_'.join(level for level in column if level)
        for column in lab_data3.columns
    ]
display(lab_data3)

Unnamed: 0,subject_id_,hadm_id_,stay_id_,hour_,covariate_,valuenum_min,valuenum_max
0,10000980,26913865,39765666,12,PT,14.50,14.50
1,10000980,26913865,39765666,12,PTT,69.60,69.60
2,10000980,26913865,39765666,22,Anion Gap,17.00,17.00
3,10000980,26913865,39765666,22,Bicarbonate,23.00,23.00
4,10000980,26913865,39765666,22,"Calcium, Total",9.20,9.20
...,...,...,...,...,...,...,...
2072410,19999987,23865745,36195440,7,Base Excess,1.00,1.00
2072411,19999987,23865745,36195440,7,Calculated Total CO2,28.00,28.00
2072412,19999987,23865745,36195440,7,pCO2,44.00,44.00
2072413,19999987,23865745,36195440,7,pH,7.39,7.39


In [None]:
# Drop the trailing underscore from the grouping-key column names.
lab_data3 = lab_data3.rename(
    columns={
        'subject_id_': 'subject_id',
        'hadm_id_': 'hadm_id',
        'stay_id_': 'stay_id',
        'hour_': 'hour',
        'covariate_': 'covariate',
    }
)
display(lab_data3)

Unnamed: 0,subject_id,hadm_id,stay_id,hour,covariate,valuenum_min,valuenum_max
0,10000980,26913865,39765666,12,PT,14.50,14.50
1,10000980,26913865,39765666,12,PTT,69.60,69.60
2,10000980,26913865,39765666,22,Anion Gap,17.00,17.00
3,10000980,26913865,39765666,22,Bicarbonate,23.00,23.00
4,10000980,26913865,39765666,22,"Calcium, Total",9.20,9.20
...,...,...,...,...,...,...,...
2072410,19999987,23865745,36195440,7,Base Excess,1.00,1.00
2072411,19999987,23865745,36195440,7,Calculated Total CO2,28.00,28.00
2072412,19999987,23865745,36195440,7,pCO2,44.00,44.00
2072413,19999987,23865745,36195440,7,pH,7.39,7.39


In [None]:
# Write the hourly lab time series to BigQuery, replacing the table if it exists.
pandas_gbq.to_gbq(
    lab_data3,
    destination_table='rnn_dataset.timeseries-lab-table',
    project_id=project_id,
    if_exists='replace',
)

1it [00:41, 41.98s/it]


In [None]:
# Read the uploaded time-series table back from BigQuery and show it.
timeseries_lab_query = '''
SELECT *
FROM `genuine-box-350018.rnn_dataset.timeseries-lab-table`
'''
lab_data_final = run_query(timeseries_lab_query)
display(lab_data_final)

Unnamed: 0,subject_id,hadm_id,stay_id,hour,covariate,valuenum_min,valuenum_max
0,10000980,26913865,39765666,12,PT,14.50,14.50
1,10000980,26913865,39765666,12,PTT,69.60,69.60
2,10000980,26913865,39765666,22,Anion Gap,17.00,17.00
3,10000980,26913865,39765666,22,Bicarbonate,23.00,23.00
4,10000980,26913865,39765666,22,"Calcium, Total",9.20,9.20
...,...,...,...,...,...,...,...
2072410,19999987,23865745,36195440,7,Base Excess,1.00,1.00
2072411,19999987,23865745,36195440,7,Calculated Total CO2,28.00,28.00
2072412,19999987,23865745,36195440,7,pCO2,44.00,44.00
2072413,19999987,23865745,36195440,7,pH,7.39,7.39
