# 筛选病人流程

## 1、连接数据库<br>
由于使用notebook暂时只支持R、python，所以这里使用python运行SQL脚本语言
连接数据库方式比较简单，具体可参照此代码段

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2

from IPython.display import display,HTML
%matplotlib inline
plt.style.use('ggplot')

#创建数据库连接
sqluser='postgres'
dbname='mimic'
password='19871115'
schema_name='mimiciii'
con=psycopg2.connect(host='192.168.8.103',dbname=dbname,password=password,user=sqluser)
query_schema='set search_path to '+schema_name+';'

## 2、查询患者的年龄和ICU停留时间<br>
可以想到，要使用的是patients和icustays这两张表 

In [9]:
query=query_schema+"""
SELECT icu.subject_id,icu.hadm_id,icu.icustay_id,icu.intime,icu.outtime
,EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60/60/24 as icu_length_of_stay --在ICU停留时间
,EXTRACT(EPOCH FROM  icu.intime-pat.dob)/60/60/24/365.242 AS age --显示年龄
FROM icustays icu
  INNER JOIN patients pat
  ON icu.subject_id = pat.subject_id
  limit 10
"""
df=pd.read_sql_query(query,con)
df

Unnamed: 0,subject_id,hadm_id,icustay_id,intime,outtime,icu_length_of_stay,age
0,2,163353,243653,2138-07-17 21:20:07,2138-07-17 23:32:21,0.091829,0.002434
1,3,145834,211552,2101-10-20 19:10:11,2101-10-26 20:43:09,6.06456,76.526792
2,4,185777,294638,2191-03-16 00:29:31,2191-03-17 16:46:31,1.678472,47.845047
3,5,178980,214757,2103-02-02 06:04:24,2103-02-02 08:06:00,0.084444,0.000693
4,6,107064,228232,2175-05-30 21:30:54,2175-06-03 13:39:54,3.672917,65.942297
5,7,118037,278444,2121-05-23 15:35:29,2121-05-23 22:01:00,0.26772,0.001779
6,7,118037,236754,2121-05-25 03:26:01,2121-05-25 21:10:19,0.739097,0.005868
7,8,159514,262299,2117-11-20 12:36:10,2117-11-21 14:24:55,1.075521,0.001438
8,9,150750,220597,2149-11-09 13:07:02,2149-11-14 20:52:14,5.323056,41.790228
9,10,184167,288409,2103-06-28 11:39:05,2103-07-06 13:51:43,8.092106,0.001329


从结果中可以看到，对于患者7，看了一次病，但是有两次ICU记录，像这样的情况要进行处理，具体处理方式还需要讨论

## 3、为每一次进入ICU记录排序，并关联每一次ICU内是否使用呼吸机

In [13]:
query=query_schema+"""
SELECT icu.subject_id,icu.hadm_id,icu.icustay_id,icu.intime,icu.outtime
,EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60/60/24 as icu_length_of_stay --在ICU停留时间
,EXTRACT(EPOCH FROM  icu.intime-pat.dob)/60/60/24/365.242 AS age --显示年龄
,rank() OVER (PARTITION BY icu.subject_id ORDER BY icu.intime) AS icustay_id_order
,vent.vent
FROM icustays icu
  INNER JOIN patients pat
  ON icu.subject_id = pat.subject_id
  LEFT JOIN ventfirstday vent
  ON icu.icustay_id=vent.icustay_id
  limit 10
"""
df=pd.read_sql_query(query,con)
df

Unnamed: 0,subject_id,hadm_id,icustay_id,intime,outtime,icu_length_of_stay,age,icustay_id_order,vent
0,2,163353,243653,2138-07-17 21:20:07,2138-07-17 23:32:21,0.091829,0.002434,1,0
1,3,145834,211552,2101-10-20 19:10:11,2101-10-26 20:43:09,6.06456,76.526792,1,1
2,4,185777,294638,2191-03-16 00:29:31,2191-03-17 16:46:31,1.678472,47.845047,1,0
3,5,178980,214757,2103-02-02 06:04:24,2103-02-02 08:06:00,0.084444,0.000693,1,0
4,6,107064,228232,2175-05-30 21:30:54,2175-06-03 13:39:54,3.672917,65.942297,1,0
5,7,118037,278444,2121-05-23 15:35:29,2121-05-23 22:01:00,0.26772,0.001779,1,0
6,7,118037,236754,2121-05-25 03:26:01,2121-05-25 21:10:19,0.739097,0.005868,2,0
7,8,159514,262299,2117-11-20 12:36:10,2117-11-21 14:24:55,1.075521,0.001438,1,1
8,9,150750,220597,2149-11-09 13:07:02,2149-11-14 20:52:14,5.323056,41.790228,1,1
9,10,184167,288409,2103-06-28 11:39:05,2103-07-06 13:51:43,8.092106,0.001329,1,0


通过icustay_id_order和vent 可以判断，是否符合要求

## 4、分别对年龄、ICU停留时间、做出判断、是否第一次进入ICU、是否机械通气 做出判断

在这里可以看出，对于某一个条件的筛选过程不是直接把这部分病人删除，而是，给每一个病人加一个字段，用于标记是否符合条件，这一点非常方便

In [6]:
query=query_schema+"""
--先创建一个icu_info的临时表
WITH icu_info as(
SELECT icu.subject_id,icu.hadm_id,icu.icustay_id,icu.intime,icu.outtime,vent.vent
,EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60/60/24 as icu_length_of_stay --在ICU停留时间
,EXTRACT(EPOCH FROM  icu.intime-pat.dob)/60/60/24/365.242 AS age --显示年龄
,rank() OVER (PARTITION BY icu.subject_id ORDER BY icu.intime) AS icustay_id_order

FROM icustays icu
  INNER JOIN patients pat
  ON icu.subject_id = pat.subject_id
  LEFT JOIN ventfirstday vent
  ON icu.icustay_id=vent.icustay_id

--LIMIT 1000
)
--在上面的临时表中查询
SELECT
  icu_info.subject_id,icu_info.hadm_id,icu_info.icustay_id,age,icu_info.intime,icu_info.outtime
--第一个限制条件，在ICU停留超过1天
,case
  when icu_info.icu_length_of_stay<1 then 1
  ELSE  0 END
AS exclusion_los
--限制年龄要大于16岁 为成年
,CASE
  WHEN icu_info.age<16 THEN 1
  ELSE 0 END
AS exclusion_age
--第一次进入ICU的患者
,CASE
  WHEN icu_info.icustay_id_order!=1 then 1
  ELSE 0 END
AS exclusion_first_stay
--在ICU中是否有机械通气
,CASE
  WHEN icu_info.vent=0 then 1
  ELSE 0 END
AS exclusion_vent

FROM icu_info
limit 10
"""
df=pd.read_sql_query(query,con)
df

Unnamed: 0,subject_id,hadm_id,icustay_id,age,intime,outtime,exclusion_los,exclusion_age,exclusion_first_stay,exclusion_vent
0,2,163353,243653,0.002434,2138-07-17 21:20:07,2138-07-17 23:32:21,1,1,0,1
1,3,145834,211552,76.526792,2101-10-20 19:10:11,2101-10-26 20:43:09,0,0,0,0
2,4,185777,294638,47.845047,2191-03-16 00:29:31,2191-03-17 16:46:31,0,0,0,1
3,5,178980,214757,0.000693,2103-02-02 06:04:24,2103-02-02 08:06:00,1,1,0,1
4,6,107064,228232,65.942297,2175-05-30 21:30:54,2175-06-03 13:39:54,0,0,0,1
5,7,118037,278444,0.001779,2121-05-23 15:35:29,2121-05-23 22:01:00,1,1,0,1
6,7,118037,236754,0.005868,2121-05-25 03:26:01,2121-05-25 21:10:19,1,1,1,1
7,8,159514,262299,0.001438,2117-11-20 12:36:10,2117-11-21 14:24:55,0,1,0,0
8,9,150750,220597,41.790228,2149-11-09 13:07:02,2149-11-14 20:52:14,0,0,0,0
9,10,184167,288409,0.001329,2103-06-28 11:39:05,2103-07-06 13:51:43,0,1,0,1


根据exclusion_*字段可以看出每一次入ICU记录是否符合条件，方便后期统计使用

In [13]:
print('{:20s} {:5d}'.format('Observations', df.shape[0]))
idxExcl = np.zeros(df.shape[0],dtype=bool)
for col in df.columns:
    if "exclusion_" in col:
        print('{:20s} {:5d} ({:2.2f}%)'.format(col, df[col].sum(), df[col].sum()*100.0/df.shape[0]))
        idxExcl = (idxExcl) | (df[col]==1)
# print a summary of how many were excluded in total
print('')
print('{:20s} {:5d} ({:2.2f}%)'.format('Total excluded', np.sum(idxExcl), np.sum(idxExcl)*100.0/df.shape[0]))

Observations         61532
exclusion_los        12308 (20.00%)
exclusion_age         8109 (13.18%)
exclusion_first_stay 15056 (24.47%)
exclusion_vent       36867 (59.92%)

Total excluded       45221 (73.49%)


使用以上代码，就可以统计每一个限制条件排除了多少病人，最终符合条件的就是exclusion_*字段全为0的患者。

## 5、想进一步加入影像学检查

这一部分内容只是一个探索，由于mimic中只有影像学的报告，在noteevents表中，category字段记录报告的类型，description字段记录检查报告的内容，但是通过内容的观察，也很难看出对于疾病有用的诊断信息，但部分是一些做检查的信息，很难通过这个来排查患者。

In [14]:
query=query_schema+"""
SELECT description
FROM noteevents
  WHERE category='Radiology' AND description LIKE '%CHEST%'
GROUP BY description
limit 10
"""
df=pd.read_sql_query(query,con)
df

Unnamed: 0,description
0,CHEST SGL VIEW/LINE PLACEMENT
1,P CHEST PORT. LINE PLACEMENT PORT
2,CT CHEST W/CONTRAST
3,P BABYGRAM (CHEST ONLY) PORT
4,CHEST (APICAL LORD ONLY)
5,CT CHEST W/O CONTRAST
6,P CHEST (PA & LAT) PORT
7,"LP RIB UNILAT, W/ AP CHEST LEFT PORT"
8,MRA (CHEST & ABD) W&W/O CONTRAST
9,"B RIB UNILAT, W/ AP CHEST BILAT"


最终决定使用noteevents做一个尝试，做一个假设，在ICU期间，是否做过胸部影像学检查，如果没做过，则排除

In [16]:
query=query_schema+"""
--先创建一个icu_info的临时表
WITH icu_info as(
SELECT icu.subject_id,icu.hadm_id,icu.icustay_id,icu.intime,icu.outtime,vent.vent
,EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60/60/24 as icu_length_of_stay --在ICU停留时间
,EXTRACT(EPOCH FROM  icu.intime-pat.dob)/60/60/24/365.242 AS age --显示年龄
,rank() OVER (PARTITION BY icu.subject_id ORDER BY icu.intime) AS icustay_id_order
FROM icustays icu
  INNER JOIN patients pat
  ON icu.subject_id = pat.subject_id
  LEFT JOIN ventfirstday vent
  ON icu.icustay_id=vent.icustay_id

)
,chest_num AS (
  SELECT icu.subject_id,icu.hadm_id,icu.icustay_id,count(*) AS chest_num
 -- ,count(*) OVER (PARTITION BY note.hadm_id ORDER BY note.charttime)
FROM icustays icu
  LEFT JOIN noteevents note
  ON  icu.hadm_id=note.hadm_id
  WHERE note.charttime>icu.intime
  AND note.charttime<icu.outtime
  AND note.category= 'Radiology'
  AND note.description LIKE '%CHEST%'
GROUP BY icu.subject_id,icu.hadm_id,icu.icustay_id
)
--在上面的临时表中查询
SELECT
  icu_info.subject_id,icu_info.hadm_id,icu_info.icustay_id,age,icu_info.intime,icu_info.outtime
--第一个限制条件，在ICU停留超过1天
,case
  when icu_info.icu_length_of_stay<1 then 1
  ELSE  0 END
AS exclusion_los
--限制年龄要大于16岁 为成年
,CASE
  WHEN icu_info.age<16 THEN 1
  ELSE 0 END
AS exclusion_age
--第一次进入ICU的患者
,CASE
  WHEN icu_info.icustay_id_order!=1 then 1
  ELSE 0 END
AS exclusion_first_stay
--在ICU中是否有机械通气
,CASE
  WHEN icu_info.vent=0 then 1
  ELSE 0 END
AS exclusion_vent
,CASE
  WHEN chest_num.chest_num<1 THEN 1
  ELSE 0 END
AS exclusion_chest
FROM icu_info
LEFT JOIN chest_num
  ON chest_num.icustay_id=icu_info.icustay_id

"""
df=pd.read_sql_query(query,con)
df





Unnamed: 0,subject_id,hadm_id,icustay_id,age,intime,outtime,exclusion_los,exclusion_age,exclusion_first_stay,exclusion_vent,exclusion_chest
0,55973,152234,200001,61.071279,2181-11-25 19:06:12,2181-11-28 20:59:25,0,0,1,1,0
1,27513,163557,200003,48.296271,2199-08-02 19:50:04,2199-08-08 17:09:18,0,0,0,0,0
2,10950,189514,200006,54.072308,2159-09-03 11:28:14,2159-09-04 19:08:10,0,0,1,0,0
3,20707,129310,200007,43.345013,2109-02-17 10:03:37,2109-02-18 17:03:12,0,0,0,1,0
4,29904,129607,200009,47.555978,2189-11-30 10:34:32,2189-12-02 14:17:37,0,0,0,0,0
5,11861,192256,200010,27.253056,2132-08-04 23:03:19,2132-08-05 22:14:11,1,0,1,1,0
6,93535,121562,200011,84.234751,2188-08-06 01:39:24,2188-08-07 16:50:53,0,0,1,0,0
7,28448,177527,200012,32.989682,2153-12-23 05:12:55,2153-12-23 15:55:54,1,0,0,1,0
8,9514,127229,200014,84.730042,2105-02-16 23:16:48,2105-02-18 16:53:29,0,0,0,0,0
9,74032,117458,200016,67.236698,2150-12-02 15:59:20,2150-12-03 14:54:29,1,0,1,1,0


但是结果显示所有患者都进行了胸部影像学检查，感觉有问题。

In [17]:
print('{:20s} {:5d}'.format('Observations', df.shape[0]))
idxExcl = np.zeros(df.shape[0],dtype=bool)
for col in df.columns:
    if "exclusion_" in col:
        print('{:20s} {:5d} ({:2.2f}%)'.format(col, df[col].sum(), df[col].sum()*100.0/df.shape[0]))
        idxExcl = (idxExcl) | (df[col]==1)
# print a summary of how many were excluded in total
print('')
print('{:20s} {:5d} ({:2.2f}%)'.format('Total excluded', np.sum(idxExcl), np.sum(idxExcl)*100.0/df.shape[0]))

Observations         61532
exclusion_los        12308 (20.00%)
exclusion_age         8109 (13.18%)
exclusion_first_stay 15056 (24.47%)
exclusion_vent       36867 (59.92%)
exclusion_chest          0 (0.00%)

Total excluded       45221 (73.49%)


## 6、对PF做出限制

患者进入ICU，前7天，只要发生过PF<300 则入组，pf是关于患者七天内最小的pf值记录

In [2]:
query=query_schema+"""
SELECT select_patient.*,pf.exclusion_pf
FROM select_patient
LEFT JOIN pf
  ON select_patient.icustay_id=pf.icustay_id
"""
df=pd.read_sql_query(query,con)
df


Unnamed: 0,subject_id,hadm_id,icustay_id,age,gender,height_first,weight_first,ethnicity,intime,outtime,icu_length_of_stay,exclusion_los,exclusion_age,exclusion_first_stay,exclusion_vent,exclusion_chest,exclusion_pf
0,55973,152234,200001,61.071279,F,170.18,61.00,ASIAN - ASIAN INDIAN,2181-11-25 19:06:12,2181-11-28 20:59:25,3.078623,0,0,1,1,0,1
1,27513,163557,200003,48.296271,M,,77.50,WHITE,2199-08-02 19:50:04,2199-08-08 17:09:18,5.888356,0,0,0,0,0,0
2,10950,189514,200006,54.072308,M,,82.40,OTHER,2159-09-03 11:28:14,2159-09-04 19:08:10,1.319398,0,0,1,0,0,1
3,20707,129310,200007,43.345013,M,177.80,126.00,WHITE,2109-02-17 10:03:37,2109-02-18 17:03:12,1.291377,0,0,0,1,0,1
4,29904,129607,200009,47.555978,F,160.02,87.20,WHITE,2189-11-30 10:34:32,2189-12-02 14:17:37,2.154919,0,0,0,0,0,0
5,11861,192256,200010,27.253056,F,,49.30,BLACK/AFRICAN AMERICAN,2132-08-04 23:03:19,2132-08-05 22:14:11,0.965880,1,0,1,1,0,1
6,93535,121562,200011,84.234751,F,,101.40,WHITE,2188-08-06 01:39:24,2188-08-07 16:50:53,1.632975,0,0,1,0,0,0
7,28448,177527,200012,32.989682,F,,51.20,ASIAN,2153-12-23 05:12:55,2153-12-23 15:55:54,0.446516,1,0,0,1,0,1
8,9514,127229,200014,84.730042,M,167.64,62.00,UNKNOWN/NOT SPECIFIED,2105-02-16 23:16:48,2105-02-18 16:53:29,1.733808,0,0,0,0,0,0
9,74032,117458,200016,67.236698,F,,64.00,WHITE,2150-12-02 15:59:20,2150-12-03 14:54:29,0.954965,1,0,1,1,0,1


In [3]:
print('{:20s} {:5d}'.format('Observations', df.shape[0]))
idxExcl = np.zeros(df.shape[0],dtype=bool)
for col in df.columns:
    if "exclusion_" in col:
        print('{:20s} {:5d} ({:2.2f}%)'.format(col, df[col].sum(), df[col].sum()*100.0/df.shape[0]))
        idxExcl = (idxExcl) | (df[col]==1)
# print a summary of how many were excluded in total
print('')
print('{:20s} {:5d} ({:2.2f}%)'.format('Total excluded', np.sum(idxExcl), np.sum(idxExcl)*100.0/df.shape[0]))

Observations         61532
exclusion_los        12308 (20.00%)
exclusion_age         8109 (13.18%)
exclusion_first_stay 15056 (24.47%)
exclusion_vent       36867 (59.92%)
exclusion_chest          0 (0.00%)
exclusion_pf         37550 (61.03%)

Total excluded       48662 (79.08%)


In [4]:
61532-48662

12870