# Setup

In [1]:
import os 
import pandas as pd

project_id="upbeat-legacy-282508" # need to specify accordingly
os.environ["GOOGLE_CLOUD_PROJECT"]=project_id

In [2]:
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

Authenticated


In [3]:
def run_query(query):
    df = pd.io.gbq.read_gbq(query, project_id=project_id, verbose=False, configuration={'query':{'useLegacySql': False}})
    print(len(df), 'rows')
    print(df.head())
    return df

# an example
df = run_query('''
SELECT
  subject_id,
  hadm_id,
  stay_id
FROM
  `physionet-data.mimic_icu.icustays`
LIMIT 10
''')

10 rows
   subject_id   hadm_id   stay_id
0    12776735  20817525  34547665
1    16256226  20013290  39289362
2    12974563  29618057  32563675
3    14609218  20606189  34947848
4    12687112  26132667  37445058


# Cohort extraction

Here procedure `itemid=225792` is used to identify invasive mechanical ventilation. First MV at each stay with duration longer than 24hr is used.

In [4]:
sql_vent_cohort = '''
with tmp as (
  select stay_id, starttime, endtime, itemid 
  , DATETIME_DIFF(endtime, starttime, MINUTE) as duration
  ,ROW_NUMBER() OVER (partition by stay_id order by starttime) as firstRow
  from `physionet-data.mimic_icu.procedureevents` 
  where itemid = 225792
) 
select
stay_id, starttime, endtime, duration
,CASE
      WHEN duration > 72*60 THEN 1
    ELSE
    0
  END
    AS over72h
from tmp
where duration >= 24 * 60 and firstRow=1
order by stay_id
'''

cohort_df = run_query(sql_vent_cohort)
print('num of first ventilation cases in an ICU with duration longer than 24h', len(cohort_df))

13630 rows
    stay_id           starttime             endtime  duration  over72h
0  30000213 2162-06-21 06:00:00 2162-06-22 08:54:00      1614        0
1  30002548 2111-08-17 17:40:00 2111-08-18 18:50:00      1510        0
2  30003749 2120-11-05 16:30:00 2120-11-07 03:45:00      2115        0
3  30004018 2158-12-27 22:22:00 2159-01-05 00:56:00     11674        1
4  30004391 2153-09-05 13:26:00 2153-09-07 17:00:00      3094        0
num of first ventilation cases in an ICU with duration longer than 24h 13630


We now need to create a view in bq for the cohort in order to continue with feature extraction.

In [5]:
dataset_id = 'default' # change accordingly
view_id = f"{project_id}.{dataset_id}.vent_cohort_mimiciv_view"

print(view_id)

upbeat-legacy-282508.default.vent_cohort_mimiciv_view


In [None]:
# for first-time use, need to uncomment and run the following to create the view 

# from google.cloud import bigquery

# client = bigquery.Client()

# view = bigquery.Table(view_id)
# view.view_query = sql_vent_cohort

# view = client.create_table(view)
# print('Created view')

# Feature extraction

In [6]:
sql_height = f'''
SELECT v.stay_id, he.height #v.starttime, v.endtime, he.* #, we.* 

FROM `{view_id}` v
left join `physionet-data.mimic_derived.height` he on v.stay_id  = he.stay_id  
where he.height is not null
order by he.height 

'''
height = run_query(sql_height)

10149 rows
    stay_id  height
0  39303452   122.0
1  35391202   122.0
2  34660011   122.0
3  31413365   122.0
4  36344634   122.0


In [7]:
sql_weight = f'''
with tmp as (
  SELECT v.stay_id, v.starttime, v.endtime,  min(we.starttime) as weight_time 
  FROM `{view_id}` v
  left join `physionet-data.mimic_derived.weight_durations` we on v.stay_id  = we.stay_id 
  where we.weight is not null # there are some missing values
  group by v.stay_id, v.starttime, v.endtime
  
) 
# select * from tmp
select t.stay_id, we.weight  from tmp t
left join `physionet-data.mimic_derived.weight_durations` we on t.stay_id  = we.stay_id and t.weight_time = we.starttime

'''
weight = run_query(sql_weight)

13482 rows
    stay_id  weight
0  30547062    88.0
1  31261018    63.8
2  31597662    95.0
3  31855208    62.7
4  32791762   146.2


In [8]:
sql_demographics= f'''
with tmp as (
  select v.stay_id , v.starttime , a.admission_location , a.insurance , a.language ,a.ethnicity ,a.marital_status, a.admittime ,ag.age, p.gender 
  FROM `{view_id}` v
  left join `physionet-data.mimic_icu.icustays` i on v.stay_id = i.stay_id 
  left join `physionet-data.mimic_core.admissions` a on i.subject_id = a.subject_id and i.hadm_id = a.hadm_id  
  left join `physionet-data.mimic_core.patients` p on i.subject_id =p.subject_id 
  left join `physionet-data.mimic_derived.age` ag on i.hadm_id = ag.hadm_id 
)

select 
stay_id ,admission_location ,insurance ,language ,ethnicity ,marital_status, gender, age
, DATETIME_DIFF(starttime, admittime, HOUR) as hours_in_hosp_before_intubation
FROM tmp
'''
demo = run_query(sql_demographics)

13630 rows
    stay_id admission_location  ... age hours_in_hosp_before_intubation
0  31091899     EMERGENCY ROOM  ...  21                               1
1  31379617     EMERGENCY ROOM  ...  95                               1
2  33482269     EMERGENCY ROOM  ...  95                               2
3  33521080     EMERGENCY ROOM  ...  92                               3
4  34637766     EMERGENCY ROOM  ...  90                               3

[5 rows x 9 columns]


In [9]:
HOUR_GAP = 24 

# DATETIME_DIFF(*,*, HOUR) renders the hour interval into integer, which could cause some entries to be ignored
# use DATETIME_DIFF(*,*, MINUTE) and compare with a new window_gap
WINDOW_GAP = HOUR_GAP * 60

In [10]:
sql_bg= f'''
with tmp as (
select v.stay_id ,v.starttime ,v.endtime , bg.charttime ,bg.totalco2 ,bg.lactate, bg.ph
FROM `{view_id}` v
left join `physionet-data.mimic_icu.icustays` i on v.stay_id = i.stay_id 
left join `physionet-data.mimic_derived.bg` bg on i.subject_id = bg.subject_id and i.hadm_id = bg.hadm_id 
where (bg.charttime >= v.starttime ) and (DATETIME_DIFF(bg.charttime, v.starttime, MINUTE) <= {WINDOW_GAP} ) 
)
select stay_id, 
max(totalco2) as co2_total_max,
min(totalco2) as co2_total_min,
max(ph) as ph_max,
min(ph) as ph_min,
max(lactate) as lactate_max,
min(lactate) as lactate_min,
from tmp
group by stay_id ,starttime , endtime
'''
bg = run_query(sql_bg)

12577 rows
    stay_id  co2_total_max  co2_total_min  ...  ph_min  lactate_max  lactate_min
0  39889812           28.0           16.0  ...    7.12         19.0          8.6
1  33791747           29.0           25.0  ...    7.30          4.5          3.9
2  37733778           32.0           25.0  ...    7.19          5.0          3.4
3  35198144           17.0           12.0  ...    7.09          6.8          4.3
4  31169797           34.0           26.0  ...    7.30         12.0          4.9

[5 rows x 7 columns]


In [11]:
sql_pfratio= f'''
with tmp as (
select v.stay_id ,v.starttime ,v.endtime ,bg.po2 ,bg.fio2 , bg.pao2fio2ratio, bg.charttime 
,ROW_NUMBER() OVER (partition by v.stay_id, v.starttime,v.endtime  order by bg.charttime DESC) as lastRow
FROM `{view_id}` v
left join `physionet-data.mimic_icu.icustays` i on v.stay_id = i.stay_id 
left join `physionet-data.mimic_derived.bg` bg on i.subject_id = bg.subject_id and i.hadm_id = bg.hadm_id 
where (bg.charttime >= v.starttime ) and (DATETIME_DIFF(bg.charttime, v.starttime, MINUTE) <= {WINDOW_GAP} ) 
)
select stay_id, pao2fio2ratio, #lastRow, starttime, endtime
from tmp 
where lastRow=1
'''
pf = run_query(sql_pfratio)

12577 rows
    stay_id  pao2fio2ratio
0  30030798          262.5
1  30081487           85.0
2  30117582          236.0
3  30213599          330.0
4  30275454          305.0


In [12]:
sql_vital = f'''
select v.stay_id

, MAX(case when itemid in (220045) and valuenum > 0 and valuenum < 300 then valuenum else null end) as heart_rate_max
, MIN(case when itemid in (220045) and valuenum > 0 and valuenum < 300 then valuenum else null end) as heart_rate_min

, MAX(case when itemid = 220181 and valuenum > 0 and valuenum < 300 then valuenum else null end) as mbp_ni_max
, MIN(case when itemid = 220181 and valuenum > 0 and valuenum < 300 then valuenum else null end) as mbp_ni_min

, MAX(case when itemid in (220052) and valuenum > 0 and valuenum < 300 then valuenum else null end) as mbp_arterial_max
, MIN(case when itemid in (220052) and valuenum > 0 and valuenum < 300 then valuenum else null end) as mbp_arterial_min

, MAX(case when itemid in (220210,224690) and valuenum > 0 and valuenum < 70 then valuenum else null end) as resp_rate_max
, MIN(case when itemid in (220210,224690) and valuenum > 0 and valuenum < 70 then valuenum else null end) as resp_rate_min

, MAX(case when itemid in (220277) and valuenum > 0 and valuenum <= 100 then valuenum else null end) as spo2_max
, MIN(case when itemid in (220277) and valuenum > 0 and valuenum <= 100 then valuenum else null end) as spo2_min

, ROUND(
    MAX(case when itemid in (223761) and valuenum > 70 and valuenum < 120 then (valuenum-32)/1.8 -- converted to degC in valuenum call
             when itemid in (223762) and valuenum > 10 and valuenum < 50  then valuenum else null end)
    , 2) as temp_max
, ROUND(
    MIN(case when itemid in (223761) and valuenum > 70 and valuenum < 120 then (valuenum-32)/1.8 -- converted to degC in valuenum call
             when itemid in (223762) and valuenum > 10 and valuenum < 50  then valuenum else null end)
    , 2) as temp_min

, MAX(case when ce.itemid in (225664,220621,226537) and ce.valuenum > 0 then ce.valuenum else null end) as glucose_max
, MIN(case when ce.itemid in (225664,220621,226537) and ce.valuenum > 0 then ce.valuenum else null end) as glucose_min
FROM `{view_id}` v
-- left join `physionet-data.mimic_icu.icustays` i on v.stay_id = i.stay_id
left join `physionet-data.mimic_icu.chartevents` ce on v.stay_id  = ce.stay_id 
where (ce.charttime >= v.starttime ) and (DATETIME_DIFF(ce.charttime, v.starttime, MINUTE) <= {WINDOW_GAP} ) 
 and ce.itemid in
  (
    220045, -- Heart Rate
    220181, -- Non Invasive Blood Pressure mean
    220052, -- Arterial Blood Pressure mean
    220210, -- Respiratory Rate
    224690, -- Respiratory Rate (Total)
    220277, -- SPO2, peripheral
    -- TEMPERATURE
    223762, -- "Temperature Celsius"
    223761,  -- "Temperature Fahrenheit"
    -- GLUCOSE
    225664, -- Glucose finger stick
    220621, -- Glucose (serum)
    226537 -- Glucose (whole blood)
)
group by v.stay_id
'''
vitals = run_query(sql_vital)

13564 rows
    stay_id  heart_rate_max  heart_rate_min  ...  temp_min  glucose_max  glucose_min
0  34465005            88.0            70.0  ...       NaN        153.0         90.0
1  34235348           106.0            78.0  ...     35.33        184.0         61.0
2  39397352            96.0            52.0  ...     34.80        124.0         87.0
3  38256253           102.0            60.0  ...     35.33        154.0        103.0
4  31677119           107.0            66.0  ...     36.17        155.0        116.0

[5 rows x 15 columns]


In [13]:
sql_med= f'''
with vaso as (
  SELECT c.stay_id, 
  count(v.starttime) vasopressin #, v.endtime 
  FROM `{view_id}` c 
  left join `physionet-data.mimic_derived.vasopressin` v on v.stay_id =c.stay_id  
  where v.starttime >= c.starttime and DATETIME_DIFF(v.starttime, c.starttime, MINUTE) <={WINDOW_GAP} 
  group by c.stay_id
) 
, epin as (
  SELECT c.stay_id, 
  count(v.starttime) epinephrine #, v.endtime 
  FROM `{view_id}` c 
  left join `physionet-data.mimic_derived.epinephrine` v on v.stay_id =c.stay_id  
  where v.starttime >= c.starttime and DATETIME_DIFF(v.starttime, c.starttime, MINUTE) <={WINDOW_GAP} 
  group by c.stay_id
) 

, dobu as (
  SELECT c.stay_id, 
  count(v.starttime) dobutamine #, v.endtime 
  FROM `{view_id}` c 
  left join `physionet-data.mimic_derived.dobutamine` v on v.stay_id =c.stay_id  
  where v.starttime >= c.starttime and DATETIME_DIFF(v.starttime, c.starttime, MINUTE) <={WINDOW_GAP} 
  group by c.stay_id
)
, nore as (
  SELECT c.stay_id, 
  count(v.starttime) norepinephrine #, v.endtime 
  FROM `{view_id}` c 
  left join `physionet-data.mimic_derived.norepinephrine` v on v.stay_id =c.stay_id  
  where v.starttime >= c.starttime and DATETIME_DIFF(v.starttime, c.starttime, MINUTE) <={WINDOW_GAP} 
  group by c.stay_id
)
, phen as (
  SELECT c.stay_id, 
  count(v.starttime) phenylephrine #, v.endtime 
  FROM `{view_id}` c 
  left join `physionet-data.mimic_derived.phenylephrine` v on v.stay_id =c.stay_id  
  where v.starttime >= c.starttime and DATETIME_DIFF(v.starttime, c.starttime, MINUTE) <={WINDOW_GAP} 
  group by c.stay_id
)
, dopa as (
  SELECT c.stay_id, 
  count(v.starttime) dopamine #, v.endtime 
  FROM `{view_id}` c 
  left join `physionet-data.mimic_derived.dopamine` v on v.stay_id =c.stay_id  
  where v.starttime >= c.starttime and DATETIME_DIFF(v.starttime, c.starttime, MINUTE) <={WINDOW_GAP} 
  group by c.stay_id
)

select 
c.stay_id
, case when vasopressin >0 then 1 else 0 end vasopressin
, case when epinephrine >0 then 1 else 0 end epinephrine
, case when dobutamine >0 then 1 else 0 end dobutamine
, case when norepinephrine >0 then 1 else 0 end norepinephrine
, case when phenylephrine >0 then 1 else 0 end phenylephrine
, case when dopamine >0 then 1 else 0 end dopamine
FROM `{view_id}` c
left join vaso on c.stay_id =vaso.stay_id
left join epin on c.stay_id =epin.stay_id
left join dobu on c.stay_id =dobu.stay_id
left join nore on c.stay_id =nore.stay_id
left join phen on c.stay_id =phen.stay_id
left join dopa on c.stay_id =dopa.stay_id
'''
med = run_query(sql_med)

13630 rows
    stay_id  vasopressin  epinephrine  ...  norepinephrine  phenylephrine  dopamine
0  30025200            0            0  ...               0              1         0
1  30095195            0            0  ...               0              0         0
2  30245448            0            0  ...               1              1         0
3  30343769            1            1  ...               1              0         0
4  30400199            0            0  ...               1              0         0

[5 rows x 7 columns]


In [14]:
sql_neuroblock=f'''
with tmp as (
select 
v.stay_id,
case when (n.starttime >= v.starttime ) and (DATETIME_DIFF(n.starttime, v.starttime, MINUTE) <={WINDOW_GAP} ) then 1 else 0 end as nb
FROM `{view_id}` v
LEFT JOIN `physionet-data.mimic_derived.neuroblock` n on v.stay_id = n.stay_id 
)
select stay_id, case when sum(nb)>0 then 1 else 0 end as neuroblocker
FROM tmp
GROUP BY stay_id 
'''
neuroblock = run_query(sql_neuroblock)

13630 rows
    stay_id  neuroblocker
0  30019367             1
1  30123819             0
2  30147362             0
3  30167170             0
4  30178464             0


In [15]:
sql_rrt=f'''
with tmp as (
  select 
  v.stay_id,
  case when (n.charttime >= v.starttime ) and (DATETIME_DIFF(n.charttime, v.starttime, MINUTE) <={WINDOW_GAP}  ) and (n.dialysis_present=1) then 1 else 0 end as dia
  FROM `{view_id}` v
  LEFT JOIN `physionet-data.mimic_derived.rrt` n on v.stay_id = n.stay_id 
)
select stay_id, case when sum(dia)>0 then 1 else 0 end as rrt
FROM tmp
GROUP BY stay_id 
'''
rrt = run_query(sql_rrt)

13630 rows
    stay_id  rrt
0  30025720    0
1  30106320    0
2  30112339    0
3  30139485    0
4  30299191    0


In [16]:
sql_vent_setting= f'''
with tmp as (SELECT 
vc.stay_id, vc.starttime ,vc.endtime , vs.charttime ,vs.fio2 , vs.peep ,vs.plateau_pressure 
FROM `{view_id}` vc
left join `physionet-data.mimic_icu.icustays` i on vc.stay_id = i.stay_id 
left join `physionet-data.mimic_derived.ventilator_setting` vs on i.subject_id = vs.subject_id 
where (vs.charttime >= vc.starttime ) and (DATETIME_DIFF(vs.charttime, vc.starttime, MINUTE) <={WINDOW_GAP} )
)
select stay_id,
max(fio2) as fio2_max,
min(fio2) as fio2_min,
max(peep) as peep_max,
min(peep) as peep_min,
max(plateau_pressure) as plateau_pressure_max,
min(plateau_pressure) as plateau_pressure_min,

from tmp
group by stay_id
'''
vent = run_query(sql_vent_setting)

13622 rows
    stay_id  fio2_max  ...  plateau_pressure_max  plateau_pressure_min
0  32455752     100.0  ...                  36.0                  25.0
1  30962143     100.0  ...                  38.0                  34.0
2  39476125     100.0  ...                  36.0                  32.0
3  31368984     100.0  ...                  10.0                  10.0
4  39076122     100.0  ...                  45.0                  30.0

[5 rows x 7 columns]


In [17]:
sql_rhythm=f'''
with tmp as (
select v.stay_id , r.charttime ,r.heart_rhythm , ROW_NUMBER() OVER (partition by v.stay_id order by r.charttime DESC) as lastRow
  from `{view_id}` v 
  left join `physionet-data.mimic_icu.icustays` i on v.stay_id = i.stay_id 
  left join `physionet-data.mimic_derived.rhythm` r on i.subject_id = r.subject_id 
  where (r.charttime >= v.starttime ) and (DATETIME_DIFF(r.charttime, v.starttime, MINUTE) <={WINDOW_GAP}  )
) 
select stay_id , case when 
 (heart_rhythm = 'SR (Sinus Rhythm)' ) or
 (heart_rhythm = 'ST (Sinus Tachycardia)' ) or
 (heart_rhythm = 'SB (Sinus Bradycardia)' )
then 1 else 0 end as sinus_rhythm
from tmp
where lastRow=1
'''
rhythm = run_query(sql_rhythm)

13613 rows
    stay_id  sinus_rhythm
0  30106115             1
1  30143207             1
2  30204754             0
3  30262829             0
4  30277942             1


In [None]:
# scores

In [18]:
sql_apsiii= f'''
select ap.*
FROM `{view_id}` v
left join `physionet-data.mimic_derived.apsiii` ap on v.stay_id = ap.stay_id 
'''
aps = run_query(sql_apsiii)

13630 rows
   subject_id   hadm_id   stay_id  ...  glucose_score  acidbase_score  gcs_score
0    12431768  29145210  30477007  ...            0.0            12.0       48.0
1    10428217  28879895  31374882  ...            3.0            12.0       29.0
2    17135164  22310263  31781209  ...            5.0            12.0       29.0
3    15644476  24889537  32352192  ...            0.0            12.0       33.0
4    19714589  28579211  32531492  ...            5.0             9.0       48.0

[5 rows x 21 columns]


In [19]:
sql_sofa= f'''
 select s.*
 FROM `{view_id}` v
 left join `physionet-data.mimic_derived.first_day_sofa` s on v.stay_id = s.stay_id 
'''
sofa = run_query(sql_sofa)

13630 rows
   subject_id   hadm_id   stay_id  SOFA  ...  liver  cardiovascular  cns  renal
0    17962978  20502339  31032577    20  ...    4.0             4.0    4    3.0
1    16473254  20338916  34826033     0  ...    NaN             0.0    0    0.0
2    18060511  25765419  39172999     0  ...    0.0             0.0    0    0.0
3    15237577  22872053  39311300    20  ...    3.0             4.0    3    4.0
4    12187003  21158956  36278823     0  ...    0.0             0.0    0    0.0

[5 rows x 10 columns]


In [20]:
sql_comorbidities='''
WITH diag AS
(
    SELECT 
        hadm_id
        , CASE WHEN icd_version = 9 THEN icd_code ELSE NULL END AS icd9_code
        , CASE WHEN icd_version = 10 THEN icd_code ELSE NULL END AS icd10_code
    FROM `physionet-data.mimic_hosp.diagnoses_icd` diag
)
, com AS
(
    SELECT
        ad.hadm_id

        -- Myocardial infarction
        , MAX(CASE WHEN
            SUBSTR(icd9_code, 1, 3) IN ('410','412')
            OR
            SUBSTR(icd10_code, 1, 3) IN ('I21','I22')
            OR
            SUBSTR(icd10_code, 1, 4) = 'I252'
            THEN 1 
            ELSE 0 END) AS myocardial_infarct

        -- Congestive heart failure
        , MAX(CASE WHEN 
            SUBSTR(icd9_code, 1, 3) = '428'
            OR
            SUBSTR(icd9_code, 1, 5) IN ('39891','40201','40211','40291','40401','40403',
                          '40411','40413','40491','40493')
            OR 
            SUBSTR(icd9_code, 1, 4) BETWEEN '4254' AND '4259'
            OR
            SUBSTR(icd10_code, 1, 3) IN ('I43','I50')
            OR
            SUBSTR(icd10_code, 1, 4) IN ('I099','I110','I130','I132','I255','I420',
                                                   'I425','I426','I427','I428','I429','P290')
            THEN 1 
            ELSE 0 END) AS congestive_heart_failure

        -- Peripheral vascular disease
        , MAX(CASE WHEN 
            SUBSTR(icd9_code, 1, 3) IN ('440','441')
            OR
            SUBSTR(icd9_code, 1, 4) IN ('0930','4373','4471','5571','5579','V434')
            OR
            SUBSTR(icd9_code, 1, 4) BETWEEN '4431' AND '4439'
            OR
            SUBSTR(icd10_code, 1, 3) IN ('I70','I71')
            OR
            SUBSTR(icd10_code, 1, 4) IN ('I731','I738','I739','I771','I790',
                                                   'I792','K551','K558','K559','Z958','Z959')
            THEN 1 
            ELSE 0 END) AS peripheral_vascular_disease

        -- Cerebrovascular disease
        , MAX(CASE WHEN 
            SUBSTR(icd9_code, 1, 3) BETWEEN '430' AND '438'
            OR
            SUBSTR(icd9_code, 1, 5) = '36234'
            OR
            SUBSTR(icd10_code, 1, 3) IN ('G45','G46')
            OR 
            SUBSTR(icd10_code, 1, 3) BETWEEN 'I60' AND 'I69'
            OR
            SUBSTR(icd10_code, 1, 4) = 'H340'
            THEN 1 
            ELSE 0 END) AS cerebrovascular_disease

        -- Dementia
        , MAX(CASE WHEN 
            SUBSTR(icd9_code, 1, 3) = '290'
            OR
            SUBSTR(icd9_code, 1, 4) IN ('2941','3312')
            OR
            SUBSTR(icd10_code, 1, 3) IN ('F00','F01','F02','F03','G30')
            OR
            SUBSTR(icd10_code, 1, 4) IN ('F051','G311')
            THEN 1 
            ELSE 0 END) AS dementia

        -- Chronic pulmonary disease
        , MAX(CASE WHEN 
            SUBSTR(icd9_code, 1, 3) BETWEEN '490' AND '505'
            OR
            SUBSTR(icd9_code, 1, 4) IN ('4168','4169','5064','5081','5088')
            OR 
            SUBSTR(icd10_code, 1, 3) BETWEEN 'J40' AND 'J47'
            OR 
            SUBSTR(icd10_code, 1, 3) BETWEEN 'J60' AND 'J67'
            OR
            SUBSTR(icd10_code, 1, 4) IN ('I278','I279','J684','J701','J703')
            THEN 1 
            ELSE 0 END) AS chronic_pulmonary_disease

        -- Rheumatic disease
        , MAX(CASE WHEN 
            SUBSTR(icd9_code, 1, 3) = '725'
            OR
            SUBSTR(icd9_code, 1, 4) IN ('4465','7100','7101','7102','7103',
                                                  '7104','7140','7141','7142','7148')
            OR
            SUBSTR(icd10_code, 1, 3) IN ('M05','M06','M32','M33','M34')
            OR
            SUBSTR(icd10_code, 1, 4) IN ('M315','M351','M353','M360')
            THEN 1 
            ELSE 0 END) AS rheumatic_disease

        -- Peptic ulcer disease
        , MAX(CASE WHEN 
            SUBSTR(icd9_code, 1, 3) IN ('531','532','533','534')
            OR
            SUBSTR(icd10_code, 1, 3) IN ('K25','K26','K27','K28')
            THEN 1 
            ELSE 0 END) AS peptic_ulcer_disease

        -- Mild liver disease
        , MAX(CASE WHEN 
            SUBSTR(icd9_code, 1, 3) IN ('570','571')
            OR
            SUBSTR(icd9_code, 1, 4) IN ('0706','0709','5733','5734','5738','5739','V427')
            OR
            SUBSTR(icd9_code, 1, 5) IN ('07022','07023','07032','07033','07044','07054')
            OR
            SUBSTR(icd10_code, 1, 3) IN ('B18','K73','K74')
            OR
            SUBSTR(icd10_code, 1, 4) IN ('K700','K701','K702','K703','K709','K713',
                                                   'K714','K715','K717','K760','K762',
                                                   'K763','K764','K768','K769','Z944')
            THEN 1 
            ELSE 0 END) AS mild_liver_disease

        -- Diabetes without chronic complication
        , MAX(CASE WHEN 
            SUBSTR(icd9_code, 1, 4) IN ('2500','2501','2502','2503','2508','2509') 
            OR
            SUBSTR(icd10_code, 1, 4) IN ('E100','E10l','E106','E108','E109','E110','E111',
                                                   'E116','E118','E119','E120','E121','E126','E128',
                                                   'E129','E130','E131','E136','E138','E139','E140',
                                                   'E141','E146','E148','E149')
            THEN 1 
            ELSE 0 END) AS diabetes_without_cc

        -- Diabetes with chronic complication
        , MAX(CASE WHEN 
            SUBSTR(icd9_code, 1, 4) IN ('2504','2505','2506','2507')
            OR
            SUBSTR(icd10_code, 1, 4) IN ('E102','E103','E104','E105','E107','E112','E113',
                                                   'E114','E115','E117','E122','E123','E124','E125',
                                                   'E127','E132','E133','E134','E135','E137','E142',
                                                   'E143','E144','E145','E147')
            THEN 1 
            ELSE 0 END) AS diabetes_with_cc

        -- Hemiplegia or paraplegia
        , MAX(CASE WHEN 
            SUBSTR(icd9_code, 1, 3) IN ('342','343')
            OR
            SUBSTR(icd9_code, 1, 4) IN ('3341','3440','3441','3442',
                                                  '3443','3444','3445','3446','3449')
            OR 
            SUBSTR(icd10_code, 1, 3) IN ('G81','G82')
            OR 
            SUBSTR(icd10_code, 1, 4) IN ('G041','G114','G801','G802','G830',
                                                   'G831','G832','G833','G834','G839')
            THEN 1 
            ELSE 0 END) AS paraplegia

        -- Renal disease
        , MAX(CASE WHEN 
            SUBSTR(icd9_code, 1, 3) IN ('582','585','586','V56')
            OR
            SUBSTR(icd9_code, 1, 4) IN ('5880','V420','V451')
            OR
            SUBSTR(icd9_code, 1, 4) BETWEEN '5830' AND '5837'
            OR
            SUBSTR(icd9_code, 1, 5) IN ('40301','40311','40391','40402','40403','40412','40413','40492','40493')          
            OR
            SUBSTR(icd10_code, 1, 3) IN ('N18','N19')
            OR
            SUBSTR(icd10_code, 1, 4) IN ('I120','I131','N032','N033','N034',
                                                   'N035','N036','N037','N052','N053',
                                                   'N054','N055','N056','N057','N250',
                                                   'Z490','Z491','Z492','Z940','Z992')
            THEN 1 
            ELSE 0 END) AS renal_disease

        -- Any malignancy, including lymphoma and leukemia, except malignant neoplasm of skin
        , MAX(CASE WHEN 
            SUBSTR(icd9_code, 1, 3) BETWEEN '140' AND '172'
            OR
            SUBSTR(icd9_code, 1, 4) BETWEEN '1740' AND '1958'
            OR
            SUBSTR(icd9_code, 1, 3) BETWEEN '200' AND '208'
            OR
            SUBSTR(icd9_code, 1, 4) = '2386'
            OR
            SUBSTR(icd10_code, 1, 3) IN ('C43','C88')
            OR
            SUBSTR(icd10_code, 1, 3) BETWEEN 'C00' AND 'C26'
            OR
            SUBSTR(icd10_code, 1, 3) BETWEEN 'C30' AND 'C34'
            OR
            SUBSTR(icd10_code, 1, 3) BETWEEN 'C37' AND 'C41'
            OR
            SUBSTR(icd10_code, 1, 3) BETWEEN 'C45' AND 'C58'
            OR
            SUBSTR(icd10_code, 1, 3) BETWEEN 'C60' AND 'C76'
            OR
            SUBSTR(icd10_code, 1, 3) BETWEEN 'C81' AND 'C85'
            OR
            SUBSTR(icd10_code, 1, 3) BETWEEN 'C90' AND 'C97'
            THEN 1 
            ELSE 0 END) AS malignant_cancer

        -- Moderate or severe liver disease
        , MAX(CASE WHEN 
            SUBSTR(icd9_code, 1, 4) IN ('4560','4561','4562')
            OR
            SUBSTR(icd9_code, 1, 4) BETWEEN '5722' AND '5728'
            OR
            SUBSTR(icd10_code, 1, 4) IN ('I850','I859','I864','I982','K704','K711',
                                                   'K721','K729','K765','K766','K767')
            THEN 1 
            ELSE 0 END) AS severe_liver_disease

        -- Metastatic solid tumor
        , MAX(CASE WHEN 
            SUBSTR(icd9_code, 1, 3) IN ('196','197','198','199')
            OR 
            SUBSTR(icd10_code, 1, 3) IN ('C77','C78','C79','C80')
            THEN 1 
            ELSE 0 END) AS metastatic_solid_tumor

        -- AIDS/HIV
        , MAX(CASE WHEN 
            SUBSTR(icd9_code, 1, 3) IN ('042','043','044')
            OR 
            SUBSTR(icd10_code, 1, 3) IN ('B20','B21','B22','B24')
            THEN 1 
            ELSE 0 END) AS aids
    FROM `physionet-data.mimic_core.admissions` ad
    LEFT JOIN diag
    ON ad.hadm_id = diag.hadm_id
    GROUP BY ad.hadm_id
)

SELECT 

cohort.stay_id 
    , congestive_heart_failure
    , cerebrovascular_disease
    , dementia
    , chronic_pulmonary_disease
    , rheumatic_disease
    , mild_liver_disease
    , diabetes_without_cc
    , diabetes_with_cc
    , paraplegia
    , renal_disease
    , malignant_cancer
    , severe_liver_disease 
    , metastatic_solid_tumor 
    , aids


FROM `{}` cohort 
left join `physionet-data.mimic_icu.icustays` icu on icu.stay_id = cohort.stay_id
left join com on com.hadm_id = icu.hadm_id 
'''.format(view_id)
comorb = run_query(sql_comorbidities)

13630 rows
    stay_id  congestive_heart_failure  ...  metastatic_solid_tumor  aids
0  30081073                         0  ...                       0     0
1  30092891                         0  ...                       0     0
2  30117630                         0  ...                       0     0
3  30131613                         0  ...                       0     0
4  30172516                         0  ...                       0     0

[5 rows x 15 columns]


# Merge data

In [21]:
# first check out subscores in apsiii and sofa; may remove them to have a simpler feature set
sofa.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'SOFA', 'respiration',
       'coagulation', 'liver', 'cardiovascular', 'cns', 'renal'],
      dtype='object')

In [22]:
aps.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'apsiii', 'apsiii_prob', 'hr_score',
       'mbp_score', 'temp_score', 'resp_rate_score', 'pao2_aado2_score',
       'hematocrit_score', 'wbc_score', 'creatinine_score', 'uo_score',
       'bun_score', 'sodium_score', 'albumin_score', 'bilirubin_score',
       'glucose_score', 'acidbase_score', 'gcs_score'],
      dtype='object')

In [23]:
aps_col_full = ['stay_id', 'apsiii', 'apsiii_prob', 'hr_score',
       'mbp_score', 'temp_score', 'resp_rate_score', 'pao2_aado2_score',
       'hematocrit_score', 'wbc_score', 'creatinine_score', 'uo_score',
       'bun_score', 'sodium_score', 'albumin_score', 'bilirubin_score',
       'glucose_score', 'acidbase_score', 'gcs_score']
sofa_col_full = ['stay_id', 'SOFA', 'respiration',
       'coagulation', 'liver', 'cardiovascular', 'cns', 'renal']

aps_col_short = ['stay_id', 'apsiii']
sofa_col_short= ['stay_id', 'SOFA']

In [24]:
# merge all dataset to the cohort_df
ft_df = cohort_df.copy()

med['count_of_vaso'] = med.epinephrine + med.vasopressin + med.dobutamine + med.norepinephrine + med.phenylephrine + med.dopamine

In [25]:
for df in [demo, weight, height, bg, pf, vitals, med, vent, rrt, rhythm, neuroblock, comorb]:
    ft_df = ft_df.merge(df, how='left')
ft_df['duration'] /= 60.
ft_df = ft_df.drop_duplicates()

In [26]:
ft63 = ft_df.merge(sofa[sofa_col_short]).merge(aps[aps_col_short])
print('basic info', ft63.columns[:5])
print('features', ft63.columns[5:])
print('ft count', len(ft63.columns[5:]))

basic info Index(['stay_id', 'starttime', 'endtime', 'duration', 'over72h'], dtype='object')
features Index(['admission_location', 'insurance', 'language', 'ethnicity',
       'marital_status', 'gender', 'age', 'hours_in_hosp_before_intubation',
       'weight', 'height', 'co2_total_max', 'co2_total_min', 'ph_max',
       'ph_min', 'lactate_max', 'lactate_min', 'pao2fio2ratio',
       'heart_rate_max', 'heart_rate_min', 'mbp_ni_max', 'mbp_ni_min',
       'mbp_arterial_max', 'mbp_arterial_min', 'resp_rate_max',
       'resp_rate_min', 'spo2_max', 'spo2_min', 'temp_max', 'temp_min',
       'glucose_max', 'glucose_min', 'vasopressin', 'epinephrine',
       'dobutamine', 'norepinephrine', 'phenylephrine', 'dopamine',
       'count_of_vaso', 'fio2_max', 'fio2_min', 'peep_max', 'peep_min',
       'plateau_pressure_max', 'plateau_pressure_min', 'rrt', 'sinus_rhythm',
       'neuroblocker', 'congestive_heart_failure', 'cerebrovascular_disease',
       'dementia', 'chronic_pulmonary_disease', '

In [27]:
ft_full = ft_df.merge(sofa[sofa_col_full]).merge(aps[aps_col_full])
print('basic info', ft_full.columns[:5])
print('features', ft_full.columns[5:])
print('ft count', len(ft_full.columns[5:]))

basic info Index(['stay_id', 'starttime', 'endtime', 'duration', 'over72h'], dtype='object')
features Index(['admission_location', 'insurance', 'language', 'ethnicity',
       'marital_status', 'gender', 'age', 'hours_in_hosp_before_intubation',
       'weight', 'height', 'co2_total_max', 'co2_total_min', 'ph_max',
       'ph_min', 'lactate_max', 'lactate_min', 'pao2fio2ratio',
       'heart_rate_max', 'heart_rate_min', 'mbp_ni_max', 'mbp_ni_min',
       'mbp_arterial_max', 'mbp_arterial_min', 'resp_rate_max',
       'resp_rate_min', 'spo2_max', 'spo2_min', 'temp_max', 'temp_min',
       'glucose_max', 'glucose_min', 'vasopressin', 'epinephrine',
       'dobutamine', 'norepinephrine', 'phenylephrine', 'dopamine',
       'count_of_vaso', 'fio2_max', 'fio2_min', 'peep_max', 'peep_min',
       'plateau_pressure_max', 'plateau_pressure_min', 'rrt', 'sinus_rhythm',
       'neuroblocker', 'congestive_heart_failure', 'cerebrovascular_disease',
       'dementia', 'chronic_pulmonary_disease', '

In [28]:
ft63.stay_id.nunique(), len(ft63)

(13630, 13630)

In [29]:
# save it to csv
ft63.to_csv('ft63_mimiciv_new_dataset.csv', index=False)
# you can see it here
os.listdir('.')

from google.colab import files
files.download('ft63_mimiciv_new_dataset.csv') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# len(ft63.columns), len(ft)

In [None]:
# save it to csv
# ft63.to_csv('ft63_invasive_procedureevents_based_cohort.csv')
# # you can see it here
# os.listdir('.')

In [None]:
# download the file to your laptop
# from google.colab import files
# files.download('ft63_invasive_procedureevents_based_cohort.csv') 

# Pt Char

In [30]:
# mortality extraction
sql_mortality = '''
SELECT c.* , a.subject_id ,a.hadm_id , a.admittime hosp_intime, a.dischtime hosp_outtime, i.intime icu_intime, i.outtime icu_outtime, a.deathtime 

FROM `{}` c
left join `physionet-data.mimic_icu.icustays` i on i.stay_id =c.stay_id 
left join `physionet-data.mimic_core.admissions` a on a.hadm_id =i.hadm_id 
order by stay_id
'''.format(view_id)
mort = run_query(sql_mortality)

13630 rows
    stay_id           starttime  ...         icu_outtime           deathtime
0  30000213 2162-06-21 06:00:00  ... 2162-06-22 20:52:48                 NaT
1  30002548 2111-08-17 17:40:00  ... 2111-08-18 18:50:31                 NaT
2  30003749 2120-11-05 16:30:00  ... 2120-11-07 08:19:44 2120-11-07 03:41:00
3  30004018 2158-12-27 22:22:00  ... 2159-01-12 23:47:50 2159-01-12 17:45:00
4  30004391 2153-09-05 13:26:00  ... 2153-09-13 18:21:18                 NaT

[5 rows x 12 columns]


In [31]:
sql_mimic_vent_setting=f'''
with tmp as (SELECT 
vc.stay_id, vc.starttime ,vc.endtime , vs.charttime ,vs.fio2 , vs.peep ,vs.plateau_pressure 
FROM `{view_id}` vc
left join `physionet-data.mimic_icu.icustays` i on vc.stay_id = i.stay_id 
left join `physionet-data.mimic_derived.ventilator_setting` vs on i.subject_id = vs.subject_id 
where (vs.charttime >= vc.starttime ) and (DATETIME_DIFF(vs.charttime, vc.starttime, MINUTE) <= {WINDOW_GAP} ) 
)
select stay_id,
max(peep) as peep_max,
avg(peep) as peep_mean,
min(peep) as peep_min,

max(fio2) as fio2_max,
avg(fio2) as fio2_mean,
min(fio2) as fio2_min,

max(plateau_pressure) as plateau_pr_max,
avg(plateau_pressure) as plateau_pr_mean,
min(plateau_pressure) as plateau_pr_min,

from tmp
group by stay_id
'''
mimic_ft = run_query(sql_mimic_vent_setting)

13622 rows
    stay_id  peep_max  ...  plateau_pr_mean  plateau_pr_min
0  33321724      20.0  ...            30.00            26.0
1  32218208      20.0  ...            28.75            19.0
2  33795117      22.0  ...            44.50            43.0
3  33853516       8.3  ...            19.90            18.2
4  32950566       5.2  ...            16.60            16.0

[5 rows x 10 columns]


In [32]:
sql_elective = f'''
--https://github.com/MIT-LCP/mimic-iv/blob/3c5803969b6360881ba8a225094004f97da8020b/concepts/score/oasis.sql
with surgflag as
(
  select ie.stay_id
    , max(case
        when lower(curr_service) like '%surg%' then 1
        when curr_service = 'ORTHO' then 1
    else 0 end) as surgical
  FROM `physionet-data.mimic_icu.icustays` ie
  left join `physionet-data.mimic_hosp.services` se
    on ie.hadm_id = se.hadm_id
    and se.transfertime < DATETIME_ADD(ie.intime, INTERVAL '1' DAY)
  group by ie.stay_id
)

select
ie.stay_id,
  case
        when adm.ADMISSION_TYPE = 'ELECTIVE' and sf.surgical = 1
          then 1
        when adm.ADMISSION_TYPE is null or sf.surgical is null
          then null
        else 0
      end as electivesurgery

FROM `physionet-data.mimic_icu.icustays` ie
inner join `physionet-data.mimic_core.admissions` adm
  on ie.hadm_id = adm.hadm_id
left join surgflag sf
  on ie.stay_id = sf.stay_id
'''
elective = run_query(sql_elective)

76540 rows
    stay_id  electivesurgery
0  36647759                0
1  34792777                0
2  34948580                0
3  37499438                0
4  37704589                0


In [33]:
pt_char_col1 = ['stay_id', 'subject_id', 'hadm_id', 'hosp_intime', 'hosp_outtime', 'icu_intime', 'icu_outtime', 'deathtime']
pt_char_col2 = ['stay_id', 'gender', 'age', 'height', 'weight']
pt_char_col3 = ['stay_id', 'count_of_vaso','rrt', 'SOFA', 'apsiii']

In [34]:
df1 = mort[pt_char_col1]
df2 = ft63[pt_char_col2]
df3 = ft63[pt_char_col3]

In [35]:
# calc bmi
def bmi_func(w, h):
    if h==0 or w==0:
        return None
    if h>500:
        return None
    
    bmi = w/ (h/100)**2
    if bmi <100 and bmi>1:
        return bmi
    else:
        return None

df2['bmi'] = df2.apply(lambda r: bmi_func(r['weight'], r['height']) if r[['weight', 'height']].notnull().all() else None , axis=1)
df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,stay_id,gender,age,height,weight,bmi
0,30000213,M,66,160.0,84.7,33.085937
1,30002548,M,70,178.0,88.4,27.900518
2,30003749,M,50,180.0,90.5,27.932099
3,30004018,F,56,152.0,71.3,30.860457
4,30004391,M,89,178.0,58.8,18.558263
...,...,...,...,...,...,...
13625,39995735,M,62,173.0,86.4,28.868322
13626,39996044,M,58,,66.4,
13627,39996867,F,41,163.0,58.0,21.829952
13628,39998012,F,40,168.0,60.0,21.258503


In [36]:
# vaso flag
df3['vasopressor'] = df3.count_of_vaso.apply(lambda x: 1 if x>0 else 0)
df3 = df3[['stay_id', 'vasopressor', 'rrt', 'SOFA', 'apsiii']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [37]:
pt_char_df = df1.merge(df2).merge(elective).merge(df3).merge(mimic_ft, how='left')
pt_char_df.columns, len(pt_char_df)

(Index(['stay_id', 'subject_id', 'hadm_id', 'hosp_intime', 'hosp_outtime',
        'icu_intime', 'icu_outtime', 'deathtime', 'gender', 'age', 'height',
        'weight', 'bmi', 'electivesurgery', 'vasopressor', 'rrt', 'SOFA',
        'apsiii', 'peep_max', 'peep_mean', 'peep_min', 'fio2_max', 'fio2_mean',
        'fio2_min', 'plateau_pr_max', 'plateau_pr_mean', 'plateau_pr_min'],
       dtype='object'), 13630)

In [38]:
pt_char_df.to_csv('pt_char_mimiciv_new_dataset.csv', index=False)
# you can see it here
os.listdir('.')

from google.colab import files
files.download('pt_char_mimiciv_new_dataset.csv') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>