# Setup

In [3]:
import os 
import pandas as pd

# Google Cloud project used for the BigQuery calls below; replace with your own.
project_id = "upbeat-legacy-282508"
# Export it through the standard environment variable as well.
os.environ.update(GOOGLE_CLOUD_PROJECT=project_id)

In [4]:
# Colab-only step: sign in the notebook user (presumably so the BigQuery
# calls further down can use those credentials - confirm for non-Colab setups).
from google.colab import auth
auth.authenticate_user()
# Reached only if the call above returned without raising.
print('Authenticated')

Authenticated


In [5]:
def run_query(query):
    """Run a standard-SQL query on BigQuery and return the result.

    The query is executed in the project named by the module-level
    ``project_id``. The row count and the first rows are printed as a quick
    sanity check.

    Args:
        query: BigQuery SQL text (standard SQL dialect).

    Returns:
        pandas.DataFrame holding the full query result.
    """
    # Use the public pd.read_gbq entry point rather than pd.io.gbq, and do not
    # pass the deprecated `verbose` keyword: later pandas releases no longer
    # accept it and raise TypeError.
    df = pd.read_gbq(
        query,
        project_id=project_id,
        configuration={'query': {'useLegacySql': False}},
    )
    print(len(df), 'rows')
    print(df.head())
    return df

# Smoke test: fetch ten ICU stay identifiers to confirm that authentication
# and run_query work end to end.
example_sql = '''
SELECT
  subject_id,
  hadm_id,
  stay_id
FROM
  `physionet-data.mimic_icu.icustays`
LIMIT 10
'''
df = run_query(example_sql)
df

10 rows
   subject_id   hadm_id   stay_id
0    10122297  25825366  36349608
1    17168310  21560534  33376903
2    15703353  29272306  35111434
3    10308232  21297383  30153687
4    13307171  21128752  38043905


Unnamed: 0,subject_id,hadm_id,stay_id
0,10122297,25825366,36349608
1,17168310,21560534,33376903
2,15703353,29272306,35111434
3,10308232,21297383,30153687
4,13307171,21128752,38043905
5,13386137,25196108,35892179
6,15671682,29195317,32077948
7,11960199,27343339,32798313
8,19612002,24483791,39073700
9,18275181,28133399,35881435


# Cohort extraction

Cohort extraction in eICU is more complicated than in MIMIC-IV because eICU does not explicitly record invasive mechanical ventilation (MV) as an event. Furthermore, every event in eICU has only a chart time rather than a duration, which makes it hard to decide when a continuous event actually ends. There are therefore two main tasks in constructing the prolonged-MV cohort from eICU: first, identify the charted events/items that indicate invasive MV; second, decide the start and end time of the MV.

The SQL below is adapted from https://github.com/nus-mornin-lab/oxygenation_kc/blob/master/data-extraction/eICU/eicu_oxygen_therapy.sql to address these two tasks and extract the eICU cohort. Again, only the first ventilation session of each ICU stay is kept, and only if it lasts at least 24 hours; an additional flag marks sessions longer than 72 hours.

In [4]:
# BigQuery SQL text that defines the eICU invasive-ventilation cohort view.
#
# Stages, in the order the CTEs appear:
#   ventsettings0   - lower-cases text from nursecharting, respiratorycharting
#                     and treatment and labels each row with vent_type:
#                     0 = invasive, 1 = noninvasive, 2 = either, NULL = no match.
#                     Rows with charttime < -60 are dropped.
#   ventsettings    - one row per (icustay_id, charttime, vent_type).
#   vd0 / vd1 / vd2 - previous charttime and vent_type per stay; a gap of more
#                     than 24*60 between consecutive rows starts a new session
#                     (ventnum is the running count of session starts).
#   vd3             - keeps vent_type 0 and 2 only; vent_type_transit = 0 marks
#                     a type-0 row whose previous row was type 2.
#   vd_endtime      - session end: last charttime + 60, or unitdischargeoffset
#                     when activeUponDischarge is set and discharge is later.
#   vd_starttime    - session start: earliest row with vent_type_transit = 0.
#                     Sessions without such a row get a NULL start through the
#                     left join in vd_final and fail the final filter.
#   final select    - first session per stay (ventnum = 1) with vent_start >= 0
#                     and vent_duration >= 24; over72h = 1 when duration > 72.
# vent_duration is (vent_end - vent_start) / 60, i.e. the code treats the
# offsets as minutes and the duration as hours.
#
# NOTE(review): ventsettings can hold several rows for the same
#   (icustay_id, charttime) because it also groups by vent_type, yet the LAG
#   windows in vd0 order by charttime only. The order of tied rows is not fixed
#   by the query, so vent_type_lag (and therefore vent_start) may differ between
#   runs - TODO confirm and add a tie-breaker if so.
# NOTE(review): LIKE '%ett%' also matches the substring in "setting", so strings
#   such as 'adult con setting set rr' (listed under vent_type 2) hit the
#   vent_type 0 branch first - confirm this is intended.
# NOTE(review): the in-SQL comment above vd3 describes the start as the earliest
#   type 0 or 2 charttime, whereas vd_starttime only uses 2 -> 0 transitions -
#   confirm which definition is wanted.
sql_vent_cohort = '''

WITH respchart AS (
    SELECT *
    FROM `physionet-data.eicu_crd.respiratorycharting`
)

, nursechart AS (
    SELECT *
    FROM `physionet-data.eicu_crd.nursecharting`
)

, pat AS (
    SELECT *
    FROM `physionet-data.eicu_crd.patient`
)

-- paterns modified based on 
-- https://github.com/nus-mornin-lab/oxygenation_kc/blob/master/data-extraction/eICU/eicu_oxygen_therapy.sql

-- The categories are invasive ventilation,
-- noninvasive ventilation, and either/or.
, ventsettings0 AS (
    SELECT patientunitstayid AS icustay_id
        , charttime
        , CASE

            -- Invasive ventilation
            WHEN
                string IN (
                    'plateau pressure',
                    'postion at lip',
                    'position at lip',
                    'pressure control'
                )
                OR string LIKE '%set vt%'
                OR string LIKE '%sputum%'
                OR string LIKE '%rsbi%'
                OR string LIKE '%tube%'
                OR string LIKE '%ett%'
                OR string LIKE '%endotracheal%'
                OR string LIKE '%tracheal suctioning%'
                OR string LIKE '%tracheostomy%'
                OR string LIKE '%reintubation%' # re?
                OR string LIKE '%intubation%' # remove re?
                OR string LIKE '%assist controlled%'
                OR string LIKE '%volume controlled%'
                OR string LIKE '%pressure controlled%'
                OR string LIKE '%trach collar%'
            THEN 0

            -- Noninvasive ventilation
            WHEN
                string IN (
                    'bi-pap',
                    'ambubag'
                )
                OR string LIKE '%ipap%'
                OR string LIKE '%niv%'
                OR string LIKE '%epap%'
                OR string LIKE '%mask leak%'
                OR string LIKE '%volume assured%'
                OR string LIKE '%non-invasive ventilation%'
                OR string LIKE '%cpap%'
            THEN 1

            -- Either invasive or noninvasive ventilation:
            WHEN
                string IN (
                    'flowtrigger',
                    'peep',
                    'tv/kg ibw',
                    'mean airway pressure',
                    'peak insp. pressure',
                    'exhaled mv',
                    'exhaled tv (machine)',
                    'exhaled tv (patient)',
                    'flow sensitivity',
                    'peak flow',
                    'f total',
                    'pressure to trigger ps',
                    'adult con setting set rr',
                    'adult con setting set vt',
                    'vti',
                    'exhaled vt',
                    'adult con alarms hi press alarm',
                    'mve',
                    'respiratory phase',
                    'inspiratory pressure, set',
                    'a1: high exhaled vt',
                    'set fraction of inspired oxygen (fio2)',
                    'insp flow (l/min)',
                    'adult con setting spont exp vt',
                    'spont tv',
                    'pulse ox results vt',
                    'vt spontaneous (ml)',
                    'peak pressure',
                    'ltv1200',
                    'tc'
                )
                OR (
                    string LIKE '%vent%'
                    AND NOT string LIKE '%hyperventilat%'
                )
                OR string LIKE '%tidal%'
                OR string LIKE '%flow rate%'
                OR string LIKE '%minute volume%'
                OR string LIKE '%leak%'
                OR string LIKE '%pressure support%'
                OR string LIKE '%peep%'
                OR string LIKE '%tidal volume%'
            THEN 2

            ELSE NULL

        END AS vent_type
        , activeUponDischarge
    FROM (

        SELECT patientunitstayid
            , nursingChartOffset AS charttime
            , LOWER(nursingchartvalue) AS string
            , NULL AS activeUponDischarge
        FROM nursechart

        UNION ALL

        SELECT patientunitstayid
            , respchartoffset AS charttime
            , LOWER(respchartvaluelabel) AS string
            , NULL AS activeUponDischarge
        FROM respchart

        UNION ALL

        -- Oxygen device from respchart
        SELECT patientunitstayid
            , respchartoffset AS charttime
            , LOWER(respchartvalue) AS string
            , NULL AS activeUponDischarge
        FROM respchart
        WHERE LOWER(respchartvaluelabel) IN (
            'o2 device',
            'respiratory device',
            'ventilator type',
            'oxygen delivery method'
        )

        UNION ALL

        -- The treatment table also contains info on oxygen therapy.
        SELECT patientunitstayid
            , treatmentoffset AS charttime
            , LOWER(treatmentstring) AS string
            , activeUponDischarge
        FROM `physionet-data.eicu_crd.treatment`
    )
    WHERE charttime >= -60

)
-- 
-- Ensure charttime is unique
, ventsettings AS (
    SELECT icustay_id
        , charttime
        -- , MIN(vent_type) AS vent_type
        , vent_type
        , MAX(activeUponDischarge) AS activeUponDischarge
    FROM ventsettings0
    WHERE vent_type IS NOT NULL
    GROUP BY icustay_id, charttime, vent_type
    order by icustay_id, charttime, vent_type
)


, vd0 as
(
  select
    *
    -- this carries over the previous charttime which had an event
    , LAG(CHARTTIME, 1) OVER (partition by icustay_id order by charttime)
    as charttime_lag
    -- similarly, this is one for vent_type, to later identify first vent_type=0
    , LAG(vent_type, 1) OVER (partition by icustay_id order by charttime)
    as vent_type_lag
  from ventsettings
)
, vd1 as
(
  select
      icustay_id
      , charttime
      , vent_type
      , vent_type_lag
      , activeUponDischarge
    -- If the time since the last oxygen therapy event is more than 24 hours,
    -- we consider that ventilation had ended in between.
    -- That is, the next ventilation record corresponds to a new ventilation session.
      , CASE
        WHEN charttime - charttime_lag > 24*60 THEN 1
        WHEN charttime_lag IS NULL THEN 1 -- No lag can be computed for the very first record
        ELSE 0
      END AS newvent
      
  -- use the staging table with only oxygen therapy records from chart events
  FROM vd0
)
-- select * from vd1 # 3677741 rows
, vd2 as
(
  select vd1.*
  -- create a cumulative sum of the instances of new ventilation
  -- this results in a monotonic integer assigned to each instance of ventilation
  , SUM( newvent )
      OVER ( partition by icustay_id order by charttime )
    as ventnum
  from vd1
)

-- now we convert the charttime of the ventilator setting into durations
-- here we focus on invasive ventilation, which has vent_type=0
-- meanwhile, chartevents with vent_type=2 can be either invasive or non-invasive ventilation
-- so here we pick the stays with an invasive chart event as long as vent_type=0 is in the stay
-- and pick the earliest charttime (equals either 0 or 2) as the beginning time 
-- and find the end time as following
,vd3 AS (
  select icustay_id, charttime, vent_type, ventnum, activeUponDischarge
      , CASE
        WHEN vent_type - vent_type_lag = -2 THEN 0 #'non2in'
        WHEN vent_type - vent_type_lag = 2 THEN 1 #'in2non'
        WHEN vent_type - vent_type_lag = 0 THEN 2 #'non2non'
        WHEN vent_type_lag IS NULL THEN 2 #'non2non' -- No lag can be computed for the very first record
        ELSE 3
      END AS vent_type_transit
  from vd2
  where (vent_type=0) or (vent_type=2)
)



, vd_endtime AS 
(
    SELECT icustay_id
        , ventnum
        , CASE 
            WHEN (
                MAX(activeUponDischarge)
                -- vent_end cannot be later than the unit discharge time.
                -- However, unitdischargeoffset often seems too low.
                -- So, we only use it if it yields and extension of the
                -- ventilation time from ventsettings.
                AND MAX(charttime)+60 < MAX(pat.unitdischargeoffset)

            ) THEN MAX(pat.unitdischargeoffset)

            -- End time is currently a charting time
            -- Since these are usually recorded hourly, ventilation is actually longer.
            -- We therefore add 60 minutes to the last time.
            ELSE MAX(charttime)+60
        END AS vent_end
    FROM vd3
        LEFT JOIN pat 
        ON vd3.icustay_id = pat.patientunitstayid
    GROUP BY icustay_id, ventnum
)
, vd_starttime AS
(
    SELECT icustay_id
        , ventnum
        , MIN(charttime) AS vent_start
    FROM vd3
    WHERE vent_type_transit = 0 # 116765 final 22372
    # otherwise final count is 40749 (ventnum=1 and dur>24 and start>=0)
    GROUP BY icustay_id, ventnum
)
, vd_final AS 
(
    select e.icustay_id, e.ventnum, s.vent_start, e.vent_end
    from vd_endtime e
    left join vd_starttime s on e.icustay_id = s.icustay_id and e.ventnum = s.ventnum
)
, vd_dur AS
(
    select *
        , (vent_end - vent_start) / 60 AS vent_duration
    from vd_final
    order by icustay_id 
)
select icustay_id patientunitstayid, vent_start, vent_end, vent_duration
, case 
    when vent_duration > 72 then 1
    else 0
  end over72h
from vd_dur 
where ventnum=1 and vent_duration>=24 and vent_start>=0


'''

In [16]:
# Fully qualified id (project.dataset.name) of the cohort view.
dataset_id = 'default'  # change accordingly
view_id = ".".join([project_id, dataset_id, "vent_cohort_eicu_view"])

In [5]:


from google.cloud import bigquery

# Publish the cohort SQL as a BigQuery view so the feature queries can read it.
client = bigquery.Client()

view = bigquery.Table(view_id)
view.view_query = sql_vent_cohort

# A plain create_table() raises Conflict when the view already exists, which
# makes this cell fail on every re-run. exists_ok=True returns the stored
# object instead of raising ...
view = client.create_table(view, exists_ok=True)
# ... but it leaves a stored definition untouched, so push the current SQL
# explicitly to keep the view in sync with sql_vent_cohort.
view.view_query = sql_vent_cohort
view = client.update_table(view, ['view_query'])
print('Created view')

Created view


# Feature extraction

9 features: hr min/max, resp min/max, temp min/max, peep max, ph max, spo2 min

In [17]:
# Query text for eight of the nine features: per-stay extremes of pH, SpO2,
# heart rate, respiratory rate and temperature, taken from measurements charted
# between vent_start and vent_end of the cohort view.
#
# Each source is aggregated to one row per stay inside its own CTE before it is
# joined back to the cohort. Joining the raw blood-gas rows and the raw vital
# rows on the stay id alone would pair every blood-gas row with every vital row
# of that stay before aggregating; the min/max values are the same either way,
# but the intermediate result is needlessly large.
# The view yields one row per patientunitstayid (grouped per stay and session,
# then filtered to ventnum = 1), so no final GROUP BY is required.
sql_features = f'''
with bg_ft as (
    -- blood gas: highest pH inside the ventilation window
    select
        v.patientunitstayid
        , max(bg.pH) ph_max
    from `{view_id}` v
    inner join `physionet-data.eicu_crd_derived.pivoted_bg` bg
        on v.patientunitstayid = bg.patientunitstayid
    where bg.chartoffset >= v.vent_start and bg.chartoffset <= v.vent_end
    group by v.patientunitstayid
)

, vs_ft as (
    -- vital signs: extremes inside the ventilation window
    select
        v.patientunitstayid
        , min(vital.spo2) spo2_min
        , min(vital.heartrate) hr_min
        , max(vital.heartrate) hr_max
        , min(vital.respiratoryrate) resp_min
        , max(vital.respiratoryrate) resp_max
        , min(vital.temperature) temp_min
        , max(vital.temperature) temp_max
    from `{view_id}` v
    inner join `physionet-data.eicu_crd_derived.pivoted_vital` vital
        on v.patientunitstayid = vital.patientunitstayid
    where vital.chartoffset >= v.vent_start and vital.chartoffset <= v.vent_end
    group by v.patientunitstayid
)

select
    v.*
    , b.ph_max
    , s.spo2_min
    , s.hr_min
    , s.hr_max
    , s.resp_min
    , s.resp_max
    , s.temp_min
    , s.temp_max
from `{view_id}` v
left join bg_ft b on b.patientunitstayid = v.patientunitstayid
left join vs_ft s on s.patientunitstayid = v.patientunitstayid

'''

In [29]:
# Query text for the ninth feature, peep_max, per cohort stay.
# PEEP values come from two sources, both limited to charting offsets between
# vent_start and vent_end of the cohort view:
#   peep_ft1 - respiratorycharting rows whose lower-cased label equals 'peep'
#              (LIKE without wildcards); the text value is converted with
#              safe_cast and kept only when it is below 100.
#   peep_ft2 - the peep column of the derived pivoted_bg table.
# The two sets are stacked with UNION ALL and the maximum is taken per stay.
# The final left join keeps stays that have no PEEP value (peep_max is NULL).
# NOTE(review): the "< 100" cap is applied to peep_ft1 only, not to the
#   pivoted_bg values - confirm whether both sources should be capped.
sql_features2 = f'''
with 
resp as (
SELECT patientunitstayid,
respchartoffset
,lower(respchartvaluelabel) label
,respchartvalue
FROM `physionet-data.eicu_crd.respiratorycharting` 
)
, peep_tmp as (
select 
patientunitstayid
,respchartoffset chartoffset
,respchartvalue peep
from resp
where label like 'peep'
)
, peep_ft1 as (
  select 
  v.patientunitstayid, safe_cast(peep as float64) peep
  from `{view_id}` v
  left join peep_tmp p on v.patientunitstayid=p.patientunitstayid
  where p.chartoffset <= v.vent_end and p.chartoffset >= v.vent_start and safe_cast(peep as float64) < 100
)
, peep_ft2 as (
    SELECT 
    v.patientunitstayid, peep
    FROM `{view_id}` v
    left join `physionet-data.eicu_crd_derived.pivoted_bg` bg
    on v.patientunitstayid=bg.patientunitstayid
    where bg.chartoffset <= v.vent_end and bg.chartoffset >= v.vent_start
)
, peep_union as (
  select * from peep_ft1 
  
  union all

  select * from peep_ft2
)

select 
v.*
, max(peep) peep_max

from `{view_id}` v
left join peep_union p on p.patientunitstayid=v.patientunitstayid
group by patientunitstayid, vent_start, vent_end, vent_duration, over72h

'''

In [19]:
# Fetch the pH / SpO2 / heart-rate / respiratory-rate / temperature features;
# run_query also prints the row count and a preview.
df_ft1 = run_query(sql_features)

22326 rows
   patientunitstayid  vent_start  vent_end  ...  resp_max  temp_min  temp_max
0            3072790          18      1804  ...      16.0      35.7      36.6
1            2767867          24      3204  ...      17.0      37.9      37.9
2            3176933         121      2405  ...      17.0      35.6      37.8
3             271620         141      4158  ...      14.0      36.2      37.5
4            3183892         222      9121  ...      58.0      34.8      38.4

[5 rows x 13 columns]


In [30]:
# Fetch the peep_max feature for the same cohort view.
df_ft2 = run_query(sql_features2)

22369 rows
   patientunitstayid  vent_start  vent_end  vent_duration  over72h  peep_max
0             141233           4      1672      27.800000        0      55.0
1             156988        1329     29112     463.050000        1      56.0
2             161176         266     10138     164.533333        1      11.0
3             168134        1782      9577     129.916667        1       3.0
4             179701          79     24195     401.933333        1       9.0


In [31]:
# Attach peep_max to the other features. Rows are matched on every cohort
# column the two results share, and all rows of df_ft1 are kept.
# NOTE(review): the outputs recorded for the two queries show different row
#   counts (22326 vs 22369), so some df_ft1 rows may find no match and end up
#   with a missing peep_max - worth checking.
cols = ['patientunitstayid', 'vent_start', 'vent_end', 'vent_duration', 'over72h']
cohort_ft = df_ft1.merge(df_ft2, how='left', on=cols)
cohort_ft.shape[0]

22326

In [32]:
# Preview the merged feature table.
cohort_ft.head()

Unnamed: 0,patientunitstayid,vent_start,vent_end,vent_duration,over72h,ph_max,spo2_min,hr_min,hr_max,resp_min,resp_max,temp_min,temp_max,peep_max
0,3072790,18,1804,29.766667,0,,96.0,85.0,98.0,8.0,16.0,35.7,36.6,5.0
1,2767867,24,3204,53.0,0,,96.0,81.0,81.0,17.0,17.0,37.9,37.9,
2,3176933,121,2405,38.066667,0,7.42,95.0,67.0,120.0,8.0,17.0,35.6,37.8,5.0
3,271620,141,4158,66.95,0,,91.0,65.0,88.0,14.0,14.0,36.2,37.5,
4,3183892,222,9121,148.316667,1,7.43,83.0,66.0,163.0,12.0,58.0,34.8,38.4,


In [None]:
# Use the same column names as the MIMIC-IV dataset.
mimic_iv_names = {
    'hr_max': 'heart_rate_max',
    'hr_min': 'heart_rate_min',
    'resp_max': 'resp_rate_max',
    'resp_min': 'resp_rate_min',
}
cohort_ft.rename(columns=mimic_iv_names, inplace=True)

In [33]:
# Write the feature table to the working directory (row index omitted) ...
cohort_ft.to_csv('ft9_invasive_cohort_eicu.csv', index=False)
# ... and list the directory to confirm the file is there.
os.listdir('.')

['.config', 'ft9_invasive_cohort_eicu.csv', 'adc.json', 'sample_data']

In [34]:
# Colab-only helper: hand the CSV written above to the browser as a download.
from google.colab import files
files.download('ft9_invasive_cohort_eicu.csv') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [35]:
# Distinct stay ids as strings, ready to be comma-joined into a SQL IN (...) list.
icustay_ids = list(map(str, cohort_ft['patientunitstayid'].unique()))

In [36]:
# Demographics and unit/hospital discharge status for the cohort stays.
# The stay ids are spliced into the IN (...) list as a comma-separated string.
id_csv = ','.join(icustay_ids)
sql_demo_etc = '''
select 
patientunitstayid, patienthealthsystemstayid patient_id, age, gender, hospitalid, 
unitdischargeoffset,
unitdischargestatus, 
hospitaldischargeoffset,
hospitaldischargestatus
 from `physionet-data.eicu_crd.patient` 
where patientunitstayid in ({})
'''.format(id_csv)

In [37]:
# Fetch demographics and discharge status for the cohort stays.
cohort_demo = run_query(sql_demo_etc)

22326 rows
   patientunitstayid  ...  hospitaldischargestatus
0            3139462  ...                  Expired
1            3168101  ...                    Alive
2            3184917  ...                    Alive
3            3162539  ...                  Expired
4            1364364  ...                    Alive

[5 rows x 9 columns]


In [38]:
# Write the demographics / discharge table to CSV (row index omitted) and
# download it; `files` is the google.colab helper imported in an earlier cell.
cohort_demo.to_csv('ft9_invasive_cohort_eicu_demo_mort.csv', index=False)
files.download('ft9_invasive_cohort_eicu_demo_mort.csv') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>