学习github代码，关于第一天患者的血气分析信息提取2

1、建立数据库连接

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2

from IPython.display import display,HTML
%matplotlib inline
# Apply the ggplot style to figures drawn from here on.
plt.style.use('ggplot')

# Connection settings: PostgreSQL user, database name and the schema that
# the queries in this notebook are run against.
sqluser, dbname, schema_name = 'postgres', 'mimic', 'mimiciii'

# Open the psycopg2 connection that the query cells pass to pandas.
con = psycopg2.connect(user=sqluser, dbname=dbname)

# Statement prepended to every query string so that the search_path is set
# to schema_name before the actual SELECT runs.
query_schema = ''.join(['set search_path to ', schema_name, ';'])

2、查询spo2的临时表

In [4]:
# SpO2 preview query.
# Reads chartevents rows whose itemid is 646 or 220277 and groups them by
# (subject_id, hadm_id, icustay_id, charttime). Within each group it keeps the
# maximum valuenum, after turning values <= 0 or > 100 into NULL, and exposes
# it as the column "spo2". Only 10 rows are requested (no ORDER BY is given,
# so which 10 rows come back is up to the database).
# The statement is prefixed with query_schema, which sets the search_path.
query=query_schema+"""
SELECT chartevents.subject_id,
            chartevents.hadm_id,
            chartevents.icustay_id,
            chartevents.charttime,
           --这里限定了spo2的范围 0-100，他是选择的最大值
            max(
                CASE
                    WHEN ((chartevents.valuenum <= (0)::double precision) OR (chartevents.valuenum > (100)::double precision)) THEN NULL::double precision
                    ELSE chartevents.valuenum
                END) AS spo2
           FROM mimiciii.chartevents
          WHERE (chartevents.itemid = ANY (ARRAY[646, 220277]))
          GROUP BY chartevents.subject_id, chartevents.hadm_id, chartevents.icustay_id, chartevents.charttime
          limit 10
"""
# Run the statement on the open connection and load the result into a DataFrame.
df=pd.read_sql_query(query,con)
# Bare expression at the end of the cell so the notebook displays the DataFrame.
df

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,spo2
0,3,145834,211552,2101-10-20 18:45:00,98.0
1,3,145834,211552,2101-10-20 19:00:00,82.0
2,3,145834,211552,2101-10-20 19:15:00,74.0
3,3,145834,211552,2101-10-20 21:15:00,99.0
4,3,145834,211552,2101-10-20 21:30:00,96.0
5,3,145834,211552,2101-10-20 21:45:00,98.0
6,3,145834,211552,2101-10-20 22:00:00,100.0
7,3,145834,211552,2101-10-20 22:15:00,100.0
8,3,145834,211552,2101-10-20 22:30:00,100.0
9,3,145834,211552,2101-10-20 22:45:00,100.0


1)从上面的代码我们可以看到，血氧饱和度的itemid=646 或220277（与我们选择的一致）<br>
2)对于每个icustay_id的每一个charttime时间点，作者在这里求了一个最大值：同一个时间点系统里有可能会有多条记录，选择最大值可以把它们合并成一条，减少重复的数据。这个方法可以为我们后续提取数据给出很好的参考

3、查询FiO2的临时表

In [4]:
# FiO2 preview query.
# Reads chartevents rows whose itemid is one of 3420, 190, 223835, 3422 and
# whose error column is not 1 (IS DISTINCT FROM keeps rows where error is NULL).
# Rows are grouped by (subject_id, hadm_id, icustay_id, charttime) and the
# maximum cleaned value per group is returned as "fio2_chartevents":
#   - itemid 223835: values in (0, 1] are multiplied by 100, values in (1, 21)
#     become NULL, values in [21, 100] are kept, anything else becomes NULL;
#   - itemid 3420 / 3422: valuenum is used unchanged;
#   - itemid 190: only values in (0.20, 1) are kept, multiplied by 100;
#   - everything else becomes NULL.
# Only 10 rows are requested (no ORDER BY is given).
query=query_schema+"""
SELECT chartevents.subject_id,
            chartevents.hadm_id,
            chartevents.icustay_id,
            chartevents.charttime,
            max(
                CASE
                    WHEN (chartevents.itemid = 223835) THEN
                    CASE
                        WHEN ((chartevents.valuenum > (0)::double precision) AND (chartevents.valuenum <= (1)::double precision)) THEN (chartevents.valuenum * (100)::double precision)
                        WHEN ((chartevents.valuenum > (1)::double precision) AND (chartevents.valuenum < (21)::double precision)) THEN NULL::double precision
                        WHEN ((chartevents.valuenum >= (21)::double precision) AND (chartevents.valuenum <= (100)::double precision)) THEN chartevents.valuenum
                        ELSE NULL::double precision
                    END
                    WHEN (chartevents.itemid = ANY (ARRAY[3420, 3422])) THEN chartevents.valuenum
                    WHEN ((chartevents.itemid = 190) AND (chartevents.valuenum > (0.20)::double precision) AND (chartevents.valuenum < (1)::double precision)) THEN (chartevents.valuenum * (100)::double precision)
                    ELSE NULL::double precision
                END) AS fio2_chartevents
           FROM mimiciii.chartevents
          WHERE ((chartevents.itemid = ANY (ARRAY[3420, 190, 223835, 3422])) AND (chartevents.error IS DISTINCT FROM 1))
          GROUP BY chartevents.subject_id, chartevents.hadm_id, chartevents.icustay_id, chartevents.charttime
          limit 10
"""
# Run the statement on the open connection and load the result into a DataFrame.
df=pd.read_sql_query(query,con)
# Bare expression at the end of the cell so the notebook displays the DataFrame.
df

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,fio2_chartevents
0,3,145834,211552,2101-10-20 20:00:00,
1,3,145834,211552,2101-10-20 23:15:00,
2,3,145834,211552,2101-10-20 23:45:00,60.000002
3,3,145834,211552,2101-10-21 00:28:00,50.0
4,3,145834,211552,2101-10-21 04:15:00,50.0
5,3,145834,211552,2101-10-21 07:34:00,50.0
6,3,145834,211552,2101-10-21 09:48:00,40.000001
7,3,145834,211552,2101-10-21 11:00:00,40.000001
8,3,145834,211552,2101-10-21 15:00:00,40.000001
9,3,145834,211552,2101-10-21 20:50:00,40.000001


这里有4点需要注意<br>
1、所有数据从chartevents提取<br>
2、FiO2的itemid=223835、3420、3422、190，比我们多选了一个3422<br>
3、关于FiO2单位不统一的问题，当时我也提出了，有的是0-1 有的是0-100，需要转换，但是没注意223835是这两种单位的混合，这里要格外注意，<br>
当在处理223835时，0-1范围的×100，1-21范围的置为null（不使用），21-100的直接保留使用<br>
3420、3422单位不需要转化，范围为0-100<br>
190 只保留0.2-1的 并×100<br>
4、chartevents.error IS DISTINCT FROM 1：如果error字段为1，表示这个值是不可用的；这个字段为null时是可用的。如果直接写chartevents.error!=1，当error为null时比较结果也是null（而不是true），这些本来可用的记录就会被where条件过滤掉，所以要用IS DISTINCT FROM这样表达


In [7]:
# Blood gas + SpO2 join.
# The CTE stg_spo2 is the per-charttime SpO2 aggregation (itemid 646 / 220277,
# values <= 0 or > 100 turned into NULL, max per group).
# The main SELECT takes rows from mimiciii.bloodgasfirstday where po2 is not
# NULL and LEFT JOINs every SpO2 row of the same icustay_id whose charttime
# lies between two hours before the blood gas charttime and the blood gas
# charttime itself.
# row_number() numbers the matched SpO2 rows per (icustay_id, blood gas
# charttime) ordered by SpO2 charttime descending, so lastrowspo2 = 1 is the
# SpO2 closest in time before the blood gas.
# Only 5 rows are requested (no ORDER BY is given).
query=query_schema+"""
WITH stg_spo2 AS (
         SELECT chartevents.subject_id,
            chartevents.hadm_id,
            chartevents.icustay_id,
            chartevents.charttime,
           --这里限定了spo2的范围 0-100，他是选择的最大值
            max(
                CASE
                    WHEN ((chartevents.valuenum <= (0)::double precision) OR (chartevents.valuenum > (100)::double precision)) THEN NULL::double precision
                    ELSE chartevents.valuenum
                END) AS spo2
           FROM mimiciii.chartevents
          WHERE (chartevents.itemid = ANY (ARRAY[646, 220277]))
          GROUP BY chartevents.subject_id, chartevents.hadm_id, chartevents.icustay_id, chartevents.charttime
        )
SELECT bg.subject_id,
            bg.hadm_id,
            bg.icustay_id,
            bg.charttime,
            bg.specimen,
            bg.aado2,
            bg.baseexcess,
            bg.bicarbonate,
            bg.totalco2,
            bg.carboxyhemoglobin,
            bg.chloride,
            bg.calcium,
            bg.glucose,
            bg.hematocrit,
            bg.hemoglobin,
            bg.intubated,
            bg.lactate,
            bg.methemoglobin,
            bg.o2flow,
            bg.fio2,
            bg.so2,
            bg.pco2,
            bg.peep,
            bg.ph,
            bg.po2,
            bg.potassium,
            bg.requiredo2,
            bg.sodium,
            bg.temperature,
            bg.tidalvolume,
            bg.ventilationrate,
            bg.ventilator,
            row_number() OVER (PARTITION BY bg.icustay_id, bg.charttime ORDER BY s1.charttime DESC) AS lastrowspo2,
            s1.spo2
           FROM (mimiciii.bloodgasfirstday bg
             LEFT JOIN stg_spo2 s1 ON (((bg.icustay_id = s1.icustay_id) AND ((s1.charttime >= (bg.charttime - '02:00:00'::interval hour)) AND (s1.charttime <= bg.charttime)))))
          WHERE (bg.po2 IS NOT NULL)
          limit 5
"""
# Run the statement on the open connection and load the result into a DataFrame.
df=pd.read_sql_query(query,con)
# Bare expression at the end of the cell so the notebook displays the DataFrame.
df

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,specimen,aado2,baseexcess,bicarbonate,totalco2,carboxyhemoglobin,...,po2,potassium,requiredo2,sodium,temperature,tidalvolume,ventilationrate,ventilator,lastrowspo2,spo2
0,55973,152234,200001,2181-11-25 19:27:00,,,1.0,,28.0,,...,79.0,4.6,,,,,,,1,95.0
1,55973,152234,200001,2181-11-25 19:27:00,,,1.0,,28.0,,...,79.0,4.6,,,,,,,2,94.0
2,55973,152234,200001,2181-11-26 11:07:00,,,1.0,,28.0,,...,105.0,,,,,,,,1,100.0
3,55973,152234,200001,2181-11-26 11:07:00,,,1.0,,28.0,,...,105.0,,,,,,,,2,100.0
4,55973,152234,200001,2181-11-26 17:44:00,,,,,27.0,,...,66.0,,,,37.3,,,,1,95.0


在bloodgasfirstday的基础上关联之前查询的SpO2数据，使用到over窗口函数，估计是对于bloodgasfirstday会对应多条SpO2，所以要对多条spO2进行处理<br>
两张表使用icustay_id关联，并限制，SpO2的时间在 bloodgasfirstday的charttime之前2小时以内，且po2不为null<br>
关于多条记录的处理方式是：按照bg.icustay_id,bg.charttime分区（partition by），在每个分区内按照spo2表的charttime降序排列并编号，编号为1的就是离这次血气时间最近的一条SpO2


In [9]:
# Full pipeline: blood gas rows enriched with SpO2, FiO2 and a specimen
# probability.
#   stg_spo2 - per-charttime SpO2 from CHARTEVENTS (itemid 646 / 220277),
#              values <= 0 or > 100 turned into NULL, max per group.
#   stg_fio2 - per-charttime FiO2 from CHARTEVENTS (itemid 3420, 190, 223835,
#              3422), rows with error = 1 excluded, values normalised to a
#              21-100 scale or NULL as spelled out in the CASE expression.
#   stg2     - bloodgasfirstday rows with po2 not NULL, LEFT JOINed to every
#              SpO2 of the same icustay_id charted within the 2 hours up to
#              the blood gas; lastRowSpO2 numbers them newest first.
# The final SELECT keeps only lastRowSpO2 = 1, LEFT JOINs every FiO2 of the
# same icustay_id charted within the 4 hours up to the blood gas, numbers
# them newest first as lastRowFiO2, and computes SPECIMEN_PROB as
# 1 / (1 + exp(-z)), where z is a fixed linear combination of the listed
# measurements. Each coalesce() substitutes a hard-coded constant term when
# the corresponding measurement is NULL.
# NOTE(review): lastRowFiO2 is computed but not filtered on in this
# statement, so one blood gas can appear in several rows (one per matching
# FiO2) - confirm whether a "lastRowFiO2 = 1" filter is intended downstream.
# Only 10 rows are requested (no ORDER BY is given).
query=query_schema+"""
--上面讲过的，提取SpO2
with stg_spo2 as
(
  select SUBJECT_ID, HADM_ID, ICUSTAY_ID, CHARTTIME
    -- max here is just used to group SpO2 by charttime
    , max(case when valuenum <= 0 or valuenum > 100 then null else valuenum end) as SpO2
  from CHARTEVENTS
  -- o2 sat
  where ITEMID in
  (
    646 -- SpO2
  , 220277 -- O2 saturation pulseoxymetry
  )
  group by SUBJECT_ID, HADM_ID, ICUSTAY_ID, CHARTTIME
)
--提取FiO2
, stg_fio2 as
(
  select SUBJECT_ID, HADM_ID, ICUSTAY_ID, CHARTTIME
    -- pre-process the FiO2s to ensure they are between 21-100%
    , max(
        case
          when itemid = 223835
            then case
              when valuenum > 0 and valuenum <= 1
                then valuenum * 100
              -- improperly input data - looks like O2 flow in litres
              when valuenum > 1 and valuenum < 21
                then null
              when valuenum >= 21 and valuenum <= 100
                then valuenum
              else null end -- unphysiological
        when itemid in (3420, 3422)
        -- all these values are well formatted
            then valuenum
        when itemid = 190 and valuenum > 0.20 and valuenum < 1
        -- well formatted but not in %
            then valuenum * 100
      else null end
    ) as fio2_chartevents
  from CHARTEVENTS
  where ITEMID in
  (
    3420 -- FiO2
  , 190 -- FiO2 set
  , 223835 -- Inspired O2 Fraction (FiO2)
  , 3422 -- FiO2 [measured]
  )
  -- exclude rows marked as error
  and error IS DISTINCT FROM 1
  group by SUBJECT_ID, HADM_ID, ICUSTAY_ID, CHARTTIME
)
--将SpO2关联到bg
, stg2 as
(
select bg.*
  , ROW_NUMBER() OVER (partition by bg.icustay_id, bg.charttime order by s1.charttime DESC) as lastRowSpO2
  , s1.spo2
from bloodgasfirstday bg
left join stg_spo2 s1
  -- same patient
  on  bg.icustay_id = s1.icustay_id
  -- spo2 occurred at most 2 hours before this blood gas
  and s1.charttime between bg.charttime - interval '2' hour and bg.charttime
where bg.po2 is not null
)
--计算了一个预测值不是很明白，同时关联了FiO2
select bg.*
  , ROW_NUMBER() OVER (partition by bg.icustay_id, bg.charttime order by s2.charttime DESC) as lastRowFiO2
  , s2.fio2_chartevents

  -- create our specimen prediction
  ,  1/(1+exp(-(-0.02544
  +    0.04598 * po2
  + coalesce(-0.15356 * spo2             , -0.15356 *   97.49420 +    0.13429)
  + coalesce( 0.00621 * fio2_chartevents ,  0.00621 *   51.49550 +   -0.24958)
  + coalesce( 0.10559 * hemoglobin       ,  0.10559 *   10.32307 +    0.05954)
  + coalesce( 0.13251 * so2              ,  0.13251 *   93.66539 +   -0.23172)
  + coalesce(-0.01511 * pco2             , -0.01511 *   42.08866 +   -0.01630)
  + coalesce( 0.01480 * fio2             ,  0.01480 *   63.97836 +   -0.31142)
  + coalesce(-0.00200 * aado2            , -0.00200 *  442.21186 +   -0.01328)
  + coalesce(-0.03220 * bicarbonate      , -0.03220 *   22.96894 +   -0.06535)
  + coalesce( 0.05384 * totalco2         ,  0.05384 *   24.72632 +   -0.01405)
  + coalesce( 0.08202 * lactate          ,  0.08202 *    3.06436 +    0.06038)
  + coalesce( 0.10956 * ph               ,  0.10956 *    7.36233 +   -0.00617)
  + coalesce( 0.00848 * o2flow           ,  0.00848 *    7.59362 +   -0.35803)
  ))) as SPECIMEN_PROB
from stg2 bg
left join stg_fio2 s2
  -- same patient
  on  bg.icustay_id = s2.icustay_id
  -- fio2 occurred at most 4 hours before this blood gas
  and s2.charttime between bg.charttime - interval '4' hour and bg.charttime
where bg.lastRowSpO2 = 1 -- only the row with the most recent SpO2 (if no SpO2 found lastRowSpO2 = 1)
limit 10
"""
# Run the statement on the open connection and load the result into a DataFrame.
df=pd.read_sql_query(query,con)
# Bare expression at the end of the cell so the notebook displays the DataFrame.
df

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,specimen,aado2,baseexcess,bicarbonate,totalco2,carboxyhemoglobin,...,sodium,temperature,tidalvolume,ventilationrate,ventilator,lastrowspo2,spo2,lastrowfio2,fio2_chartevents,specimen_prob
0,55973,152234,200001,2181-11-25 19:27:00,,,1.0,,28.0,,...,,,,,,1,95.0,1,,0.944906
1,55973,152234,200001,2181-11-26 11:07:00,,,1.0,,28.0,,...,,,,,,1,100.0,1,,0.969243
2,55973,152234,200001,2181-11-26 17:44:00,,,,,27.0,,...,,37.3,,,,1,95.0,1,,0.854325
3,55973,152234,200001,2181-11-26 18:56:00,,,,,26.0,,...,,,,,,1,94.0,1,,0.994931
4,27513,163557,200003,2199-08-03 01:55:00,ART,,,,20.0,,...,,,,,,1,94.0,1,,0.981179
5,27513,163557,200003,2199-08-03 01:55:00,ART,,,,20.0,,...,,,,,,1,94.0,2,,0.981179
6,27513,163557,200003,2199-08-03 03:42:00,ART,,,,23.0,,...,,,,,,1,98.0,1,,0.957332
7,27513,163557,200003,2199-08-03 03:42:00,ART,,,,23.0,,...,,,,,,1,98.0,2,,0.957332
8,27513,163557,200003,2199-08-03 03:42:00,ART,,,,23.0,,...,,,,,,1,98.0,3,,0.957332
9,27513,163557,200003,2199-08-03 10:49:00,ART,,,,24.0,,...,,,,,,1,97.0,1,50.0,0.985891


关联FiO2是限制在bg.charttime前四个小时以内，同时通过lastRowSpO2 = 1限制了只使用最接近charttime的SpO2；但这条查询还没有按lastRowFiO2 = 1过滤，所以同一条血气记录可能对应多行FiO2（见上面结果中lastrowfio2为2、3的行），<br>
注意一个函数coalesce 返回第一个非空的值