In [3]:
# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

# You only need to authenticate once per session.
auth.authenticate_user()

In [None]:
# Installing required package to export my table from colab to BigQuery
pip install pandas-gbq -U

In [15]:
# Import libraries
from datetime import timedelta
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas_gbq

# below imports are used for pretty pandas dataframes and plots
from IPython.display import display, HTML
%matplotlib inline
plt.style.use('ggplot')

# Set up environment variables
project_id = 'genuine-box-350018'
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id
pandas_gbq.context.project = project_id


# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
    return pd.io.gbq.read_gbq(
        query,
        project_id=project_id,
        dialect="standard"
    )

if project_id == 'CHANGE-ME':
  raise ValueError('You must change project_id to your GCP project at the top of this cell.')

  # test it works
df = run_query("""
SELECT *
FROM `physionet-data.mimic_core.patients`
WHERE subject_id = 10012853
""")
assert df.shape[0] >= 1, 'unable to query MIMIC!'
display(df)


Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10012853,F,91,2175,2014 - 2016,NaT


**COHORT**

Creating cohort with admission year, death status, death time

In [22]:
df = run_query(f'''
SELECT 	
	ie.subject_id
	, ie.stay_id
  , ie.hadm_id
	, ie.intime
  , pa.anchor_year_group
  , pa.anchor_year
  , pa.anchor_age
	, DATETIME_DIFF(ie.intime, DATETIME(pa.anchor_year, 1, 1, 0, 0, 0), YEAR) + pa.anchor_age AS age
-- Calculating actual year of admission by taking median of anchor_year_group + difference between age and anchor_age
  , CAST((CAST(SUBSTRING(pa.anchor_year_group,1,4) AS int) + CAST(SUBSTRING(pa.anchor_year_group,8,11) AS int))/2 AS int) + 
  DATETIME_DIFF(ie.intime, DATETIME(pa.anchor_year, 1, 1, 0, 0, 0), YEAR) AS year
  , adm.deathtime
  , (CASE WHEN adm.deathtime IS NOT NULL THEN 1 ELSE 0 END) AS death_status
-- Calculating actual time of death (how many hours after admission)
  , (CASE WHEN adm.deathtime IS NOT NULL THEN DATE_DIFF(adm.deathtime, ie.intime, hour) ELSE NULL END) AS death_hour
FROM `physionet-data.mimic_icu.icustays` ie 
INNER JOIN `physionet-data.mimic_core.patients` pa
ON ie.subject_id = pa.subject_id
INNER JOIN `physionet-data.mimic_core.admissions` adm
ON ie.subject_id = adm.subject_id
''')
display(df)


Unnamed: 0,subject_id,stay_id,hadm_id,intime,anchor_year_group,anchor_year,anchor_age,age,year,deathtime,death_status,death_hour
0,19137716,32541787,28917192,2115-03-25 17:17:00,2008 - 2010,2111,18,22,2013,NaT,0,
1,19077387,30880402,28114904,2112-11-19 03:54:00,2017 - 2019,2112,18,18,2018,NaT,0,
2,11155072,39132918,22807589,2113-02-07 19:37:04,2011 - 2013,2113,18,18,2012,NaT,0,
3,11155072,32764467,22807589,2113-02-19 21:34:50,2011 - 2013,2113,18,18,2012,NaT,0,
4,17945455,31529823,21555018,2113-08-08 00:51:00,2014 - 2016,2113,18,18,2015,NaT,0,
...,...,...,...,...,...,...,...,...,...,...,...,...
421553,12585757,39005168,20912625,2206-01-26 10:31:00,2011 - 2013,2204,91,93,2014,NaT,0,
421554,12585757,39005168,20912625,2206-01-26 10:31:00,2011 - 2013,2204,91,93,2014,NaT,0,
421555,12585757,39005168,20912625,2206-01-26 10:31:00,2011 - 2013,2204,91,93,2014,NaT,0,
421556,12585757,39005168,20912625,2206-01-26 10:31:00,2011 - 2013,2204,91,93,2014,NaT,0,


In [23]:
pandas_gbq.to_gbq(df,'rnn_dataset.cohort-table', project_id = project_id)

1it [00:25, 25.80s/it]


**FINAL COHORT**

Adding exclusions:
- include first ICU stays only
- include admissions between 2008 and 2012
- include patients who were alive for the first 48h after admission

In [24]:
admlist = run_query(f'''
SELECT co.subject_id, co.stay_id, co.hadm_id, co.intime, co.year, co.deathtime, co.death_status
, CASE WHEN row_number() OVER (PARTITION BY co.subject_id ORDER BY co.intime) > 1 THEN 1 ELSE 0 END as exclude_stay
, CASE WHEN (co.year < 2008 OR co.year > 2012) THEN 1 ELSE 0 END AS exclude_admityear
, CASE WHEN death_hour < 48 THEN 1 ELSE 0 END AS exclude_death_under48h
FROM `genuine-box-350018.rnn_dataset.cohort-table` co
''')
display(admlist)

  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes
  bqstorage_client=bqstorage_client, dtypes=dtypes


Unnamed: 0,subject_id,stay_id,hadm_id,intime,year,deathtime,death_status,exclude_stay,exclude_admityear,exclude_death_under48h
0,10012438,33768181,22764825,2178-06-07 21:57:00+00:00,2018,NaT,0,0,1,0
1,10012438,33768181,22764825,2178-06-07 21:57:00+00:00,2018,NaT,0,1,1,0
2,10034317,36228864,20827960,2159-08-26 12:59:45+00:00,2018,NaT,0,0,1,0
3,10034317,33536886,20827960,2159-08-30 12:35:14+00:00,2018,NaT,0,1,1,0
4,10112984,31093528,20721573,2160-08-14 10:00:07+00:00,2018,NaT,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
421553,19969137,31637227,20917922,2143-03-20 03:52:00+00:00,2018,NaT,0,1,1,0
421554,19970892,38532939,25899573,2116-06-14 16:11:17+00:00,2017,NaT,0,0,1,0
421555,19970892,38532939,25899573,2116-06-14 16:11:17+00:00,2017,NaT,0,1,1,0
421556,19987071,33385977,22919294,2113-12-27 20:31:00+00:00,2009,NaT,0,0,0,0


In [25]:
final_cohort = admlist[(admlist.exclude_stay==0)&(admlist.exclude_admityear==0)&(admlist.exclude_death_under48h==0)][['subject_id', 'stay_id', 'hadm_id', 'intime', 'year','deathtime','death_status']]
display(final_cohort)

Unnamed: 0,subject_id,stay_id,hadm_id,intime,year,deathtime,death_status
6,10157718,30185469,24943658,2164-02-21 23:58:01+00:00,2009,NaT,0
70,10249925,32593030,28164030,2122-12-29 23:50:00+00:00,2009,NaT,0
72,10311956,36577532,25628815,2123-03-06 23:58:05+00:00,2009,NaT,0
74,10329846,35463345,28917992,2139-12-16 20:22:00+00:00,2009,NaT,0
82,10363830,39333021,27193050,2148-12-10 11:51:12+00:00,2012,NaT,0
...,...,...,...,...,...,...,...
421190,19565653,33904168,22270700,2151-03-29 12:48:27+00:00,2010,NaT,0
421468,19632296,31209930,22877763,2154-03-17 11:24:14+00:00,2012,NaT,0
421520,19806212,36539073,25284876,2133-06-21 12:12:51+00:00,2012,NaT,0
421524,19863982,35517551,26873581,2120-02-13 23:21:03+00:00,2010,NaT,0


In [26]:
pandas_gbq.to_gbq(final_cohort,'rnn_dataset.final-cohort-table', project_id = project_id)

1it [00:03,  3.50s/it]


In [31]:
# Counting unique stays to make sure there are no duplicates
unique_stays = len(set(final_cohort["stay_id"]))
print(unique_stays)

21894


In [32]:
# Counting unique admissions to make sure there are no duplicates
unique_admissions = len(set(final_cohort["hadm_id"]))
print(unique_admissions)

21894


In [33]:
# Total mortality
final_cohort["death_status"].sum()

1492