In [1]:
'''检查EXIT-SEP数据一致性并合并基线表'''
#! 手动修改: 
# 6-终点指标-含住ICU时间+住院时间+临床结局+费用  删除第一行标头 D1, D28
# 2.2-D1D4D7-APACHEII评分-1817例.xlsx 表头手动改为 APACHE_II_D1, D4, D7
# 2.4-D1-D7感染性休克-1817例.xlsx 表头7个 D1感染休克-0无/1有 分别改为 D1, D2, D3... 感染休克-0无/1有
# 4.1 - 4.4: 67例未用药基线表头“结果”改成对应的指标名称
# 4.5-D1-D7血气分析-1817例.xlsx 删除表头D1-D7，只保留 D1 基线血气, 另存为 4.5-D1血气分析-1817例.xlsx; 此外有一例编号070007患者未在随机化表中出现，缺少070008号患者，确认070007是否为记录错误

import os, sys
import pandas as pd

# 检测运行环境
def in_notebook():
    return 'IPKernelApp' in getattr(globals().get('get_ipython', lambda: None)(), 'config', {})

if in_notebook():
    notebook_dir = os.getcwd()
    src_path = os.path.abspath(os.path.join(notebook_dir, '..'))
else:
    src_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
sys.path.append(src_path) if src_path not in sys.path else None

from src.utils import *
from src.setup import *
EXIT_SEP_PATH = r'E:\BaiduNetdiskWorkspace\My workspace\23.4.4-PhD-thesis\EXIT-SEP\EXIT-SEP病例数据-手工处理' # 试验原始数据


# 合并构建特征

In [2]:
filelist = getfiles(EXIT_SEP_PATH)

lab_sheet_dict = {'4.1': (['红细胞(RBC)', '血红蛋白(Hb)', '白细胞(WBC)', '中性粒细胞（N）', '淋巴细胞(L)', '血小板（PLT）', '红细胞压积（HCT）'], '67例未用药病例血常规基线'),
                  '4.2': (['谷丙转氨酶（ALT)', '谷草转氨酶(AST)', '总胆红素(STB)', '尿素氮（BUN）', '血肌酐（Cr）', '血糖（Glu）', '血钾（K）', '血钠（Na）'], '67例未用药病例血生化基线'),
                  '4.3': (['纤维蛋白原（Fg）', '凝血酶原时间（PT）', '部分活化凝血酶原时间（APTT）', 'D-二聚体（D-Dimer）'], '67例未用药病例凝血基线'),
                  '4.4': (['C反应蛋白（CRP）', '降钙素原（PCT）'], '67例未用药病例炎症指标基线')
                  }

df = None
for file in filelist:
    fname = os.path.split(file)[1]
    print(f'处理数据: {fname}')

    # 跳过字典
    if fname=='1.4-基线合并疾病归类.xlsx':
        continue
    
    # 跳过合并用药
    elif fname.startswith(('5','~')):
        continue

    # 病原学检查
    elif fname.startswith('1.2'):
        df_temp1 = pd.read_excel(file)

        df_temp = pd.read_excel(file, sheet_name='病原学检查结果')
        df_temp['Gram-_infect'] = df_temp['致病菌名称'].isin(range(1,13)).astype(int)
        df_temp['Gram+_infect'] = df_temp['致病菌名称'].isin(range(13,20)).astype(int)
        df_temp['Fungi_infect'] = df_temp['致病菌名称'].isin(range(20,29)).astype(int)
        df_temp['atypical_infect'] = (df_temp['致病菌名称']==29).astype(int)
        df_temp['virus_infect'] = df_temp['致病菌名称'].isin([30, 31]).astype(int)
        df_temp['other_infect'] = df_temp['致病菌名称'].isin([29, 30, 31]).astype(int)


        df_temp['Gram_neg_resist'] = (df_temp['耐药情况']=='阴性菌耐药').astype(int)
        df_temp['Gram_pos_resist'] = (df_temp['耐药情况']=='阳性菌耐药').astype(int)
        df_temp['Fungi_resist'] = (df_temp['耐药情况']=='真菌耐药').astype(int)
        df_temp['multidrug_resist'] = (df_temp['多重耐药']=='多重耐药').astype(int)

        df_temp = df_temp.groupby('受试者代码')[["Gram-_infect","Gram+_infect","Fungi_infect","atypical_infect","virus_infect","other_infect",
                                            "Gram_neg_resist","Gram_pos_resist","Fungi_resist","multidrug_resist"]].agg(max)
        df_temp = df_temp1.merge(df_temp, on='受试者代码', how='left', validate='1:1').rename(columns={'受试者代码':'ID'})
        df = df.merge(df_temp, on=['ID'], how='left', validate='1:1') if not (df is None) else df_temp

    # 生命体征
    elif fname.startswith('1.3'):
        df_temp = pd.read_excel(file)
        index_col = df_temp.columns[df_temp.columns.str.contains(r'受试者[编代]码')][0]
        df_temp = df_temp.rename(columns={index_col:'ID'})
        if '研究中心名称' in df_temp.columns:
            df_temp.drop(columns='研究中心名称', inplace=True)
        
        # 选择第一次测量
        mask_v1 = df_temp['入组天数'] == 1
        df_temp = df_temp[mask_v1]
        
        df = df.merge(df_temp, on=['ID'], how='left', validate='1:1') if not (df is None) else df_temp

    # 实验室检查数据处理
    elif fname[:3] in lab_sheet_dict.keys():
        
        (lab_sheet_list, non_treat_sheet) = lab_sheet_dict[fname[:3]]

        df_lab = None
        # 1750 例受试者数据
        for sheet in lab_sheet_list:
            df_temp = pd.read_excel(file, sheet_name=sheet)
            # 选择基线数据
            mask_v1 = df_temp['访视编号']=='sv1'
            df_temp = df_temp[mask_v1]

            # 选择检查结果
            index_col = df_temp.columns[df_temp.columns.str.contains(r'受试者[编代]码')][0]
            df_temp = df_temp[[index_col, '结果']].rename(columns={'结果':sheet, index_col:'ID'})

            if df_lab is None:
                df_lab = df_temp
            else:
                df_lab = df_lab.merge(df_temp, on=['ID'], how='left', validate='1:1')
        
        # 67例未用药基线
        df_non_treat = pd.read_excel(file, sheet_name=non_treat_sheet)
        # 选择检查结果, lab_sheet_list (检查指标子表)已手动填写到67例未用药基线表检查结果的对应列名
        index_col = df_non_treat.columns[df_non_treat.columns.str.contains(r'受试者[编代]码')][0]
        df_non_treat = df_non_treat[[index_col, *lab_sheet_list]].rename(columns={index_col:'ID'})

        # 合并到 1750例数据
        df_lab = pd.concat([df_lab, df_non_treat])
        
        # 与其他数据合并
        df = df.merge(df_lab, on=['ID'], how='left', validate='1:1') if not (df is None) else df_lab

    else:
        df_temp = pd.read_excel(file)
        index_col = df_temp.columns[df_temp.columns.str.contains(r'受试者[编代]码')][0]
        df_temp = df_temp.rename(columns={index_col:'ID'})
        if '研究中心名称' in df_temp.columns:
            df_temp.drop(columns='研究中心名称', inplace=True)

        df = df.merge(df_temp, on=['ID'], how='left', validate='1:1') if not (df is None) else df_temp

df.to_excel(f'{DATA}/EXIT_SEP_merged.xlsx', index=False)

处理数据: 0-受试者随机化信息.xlsx
处理数据: 1.1-人口学及一般临床资料-1817例.xlsx
处理数据: 1.2-基线病原学检查-1817例.xlsx
处理数据: 1.3-D1-D7生命体征-1817例.xlsx
处理数据: 1.4-基线合并疾病归类.xlsx
处理数据: 2.1-D1D4D7-SOFA评分-1817例.xlsx
处理数据: 2.2-D1D4D7-APACHEII评分-1817例.xlsx
处理数据: 2.3-D1D7-DIC评分-1817例.xlsx
处理数据: 2.4-D1-D7感染性休克-1817例.xlsx
处理数据: 3.1-28天内器官支持记录-呼吸支持-1817例.xlsx
处理数据: 3.2-28天内器官支持记录-CRRT-1817例.xlsx
处理数据: 3.3-28天内器官支持记录-营养支持-1817例.xlsx
处理数据: 4.1实验室检查-血常规-1817例.xlsx
处理数据: 4.2-实验室检查-血生化-1817例.xlsx
处理数据: 4.3-实验室检查-凝血四项-1817例.xlsx
处理数据: 4.4-实验室检查-炎症指标-1817例.xlsx
处理数据: 4.5-D1血气分析-1817例.xlsx
处理数据: 5.1-合并用药-28天内抗感染药物.xlsx
处理数据: 5.2-合并用药-28天内升压药物.xlsx
处理数据: 5.3-合并用药-28天内激素.xlsx
处理数据: 5.4-合并用药-28天内抗凝药物.xlsx
处理数据: 5.5-合并用药-28天其他合并药物.xlsx
处理数据: 6-终点指标-含住ICU时间+住院时间+临床结局+费用.xlsx


# 选择特征数据

In [3]:
SELECTED_VAR = ['ID',
                '试验组别',
                '年龄_x',
                '性别_x',
                '民族',
                '身高',
                '体重',
                'BMI',
                '有无吸烟史',
                '吸烟指数', # 吸烟指数=每日吸烟支数×吸烟年数
                '有无酗酒史',
                '感染来源',
                '感染部位',
                '病原学检查（已做=2，未做=1）',
                '整体有无耐药情况',
                "Gram-_infect","Gram+_infect","Fungi_infect",
                "Gram_neg_resist","Gram_pos_resist","Fungi_resist","multidrug_resist",
                '体温(℃)',
                '心率(次/分)',
                '呼吸(次/分)',
                '收缩压(mmHg)',
                '舒张压(mmHg)',
                '呼吸功能[PaO2/FIO2(mmHg) SaO2/FIO2]', # SOFA
                '凝血指标[血小板109/L]', # SOFA
                '肝脏[胆红素(μmol/L)]', # SOFA
                '心血管[低血压]', # SOFA
                '中枢神经系统[Glasgow昏迷分数]', # SOFA
                '肾脏[肌酐(μmol/L)或尿量(mL/d)]', # SOFA
                'SOFA总分_y',
                'APACHE_II_D1',
                'D1-DIC总分',
                'D1感染休克-0无/1有',
                'D1平均动脉压（MAP）mmHg',
                'D1血乳酸（Lac）mmol/l',
                'D1是否使用呼吸支持-0未用/1使用',
                'D1呼吸支持类型', # 机械通气:有创/无创; 其他为非机械通气:面罩吸氧/湿化雾疗仪/高流量湿化氧疗仪
                'D1是否使用CCRT-0未用/1使用',
                'D1是否使用营养支持-0未用/1使用',
                'D1营养支持途径',

                '红细胞(RBC)', '血红蛋白(Hb)', '白细胞(WBC)', '中性粒细胞（N）', '淋巴细胞(L)', '血小板（PLT）', '红细胞压积（HCT）', # 血常规
                '谷丙转氨酶（ALT)', '谷草转氨酶(AST)', '总胆红素(STB)', '尿素氮（BUN）', '血肌酐（Cr）', '血糖（Glu）', '血钾（K）', '血钠（Na）', # 血生化
                '纤维蛋白原（Fg）', '凝血酶原时间（PT）', '部分活化凝血酶原时间（APTT）', 'D-二聚体（D-Dimer）', # 凝血
                'C反应蛋白（CRP）', '降钙素原（PCT）', # 炎症
                'PH', '氧合指数（PaO2/FiO2）mmHG', '碳酸氢（HCO3-）mmol/l', '氧分压（PaO2）mmHg', '血乳酸（Lac）mmol/l','二氧化碳分压（PaCO2）mmHg', # 血气

                '28天预后', # 主要结局- 28天死亡
                '死亡时间', # 主要结局- 28天死亡时间 (生存分析用,surv_time)
                '放弃治疗在院外死亡的，按院内死亡记录',# 次要结局- 院内死亡
                '计算住ICU天数时，死亡的按住满28天计算', # 次要结局-住ICU天数
                '计算住院天数时，死亡的按住满28天计算', # 次要结局-住院天数
                'D1-D7有无感染休克-0无/1有', # 次要结局-是否进展性发生septic shock
                ]

df_selected = df[SELECTED_VAR].copy()
df_selected.to_excel(f'{DATA}/EXIT_SEP_selected.xlsx', index=False)

# 数据清洗

In [4]:
pd.set_option('display.max_columns', 100)
df_selected

Unnamed: 0,ID,试验组别,年龄_x,性别_x,民族,身高,体重,BMI,有无吸烟史,吸烟指数,有无酗酒史,感染来源,感染部位,病原学检查（已做=2，未做=1）,整体有无耐药情况,Gram-_infect,Gram+_infect,Fungi_infect,Gram_neg_resist,Gram_pos_resist,Fungi_resist,multidrug_resist,体温(℃),心率(次/分),呼吸(次/分),收缩压(mmHg),舒张压(mmHg),呼吸功能[PaO2/FIO2(mmHg) SaO2/FIO2],凝血指标[血小板109/L],肝脏[胆红素(μmol/L)],心血管[低血压],中枢神经系统[Glasgow昏迷分数],肾脏[肌酐(μmol/L)或尿量(mL/d)],SOFA总分_y,APACHE_II_D1,D1-DIC总分,D1感染休克-0无/1有,D1平均动脉压（MAP）mmHg,D1血乳酸（Lac）mmol/l,D1是否使用呼吸支持-0未用/1使用,D1呼吸支持类型,D1是否使用CCRT-0未用/1使用,D1是否使用营养支持-0未用/1使用,D1营养支持途径,红细胞(RBC),血红蛋白(Hb),白细胞(WBC),中性粒细胞（N）,淋巴细胞(L),血小板（PLT）,红细胞压积（HCT）,谷丙转氨酶（ALT),谷草转氨酶(AST),总胆红素(STB),尿素氮（BUN）,血肌酐（Cr）,血糖（Glu）,血钾（K）,血钠（Na）,纤维蛋白原（Fg）,凝血酶原时间（PT）,部分活化凝血酶原时间（APTT）,D-二聚体（D-Dimer）,C反应蛋白（CRP）,降钙素原（PCT）,PH,氧合指数（PaO2/FiO2）mmHG,碳酸氢（HCO3-）mmol/l,氧分压（PaO2）mmHg,血乳酸（Lac）mmol/l,二氧化碳分压（PaCO2）mmHg,28天预后,死亡时间,放弃治疗在院外死亡的，按院内死亡记录,计算住ICU天数时，死亡的按住满28天计算,计算住院天数时，死亡的按住满28天计算,D1-D7有无感染休克-0无/1有
0,10001,试验药物,40,男,汉族,180,80.0,24.7,1,400,0,其他部位感染,藏毛囊肿伴感染,2,无,0.0,1.0,0.0,0.0,0.0,0.0,0.0,38.00,102,29,120,61,4,2,0,3,0,0,9,13,3.0,1,81.0,1.2,1,湿化氧疗仪,0,1,肠内,3.07,92.0,0.36,0.04,75.04,91.0,26.4,53,46,19.5,2.9,60,10.83,3.4,133.1,5.11,12,35.4,412,128,14.39,7.50,86.0,21.5,43.6,1.2,31.4,存活,28.0,,8.0,21.0,1.0
1,10002,对照药物,46,男,汉族,170,70.0,24.2,0,,0,肺部感染,呼吸系统,2,有,0.0,1.0,0.0,0.0,1.0,0.0,0.0,38.20,112,31,101,61,3,0,0,0,4,0,7,23,2.0,0,,,1,有创,0,1,肠内,2.94,86.0,12.37,79.94,5.42,771.0,27.0,31,45,7.2,5,50,8.05,4.34,138,4.17,13.2,32.4,938,66.9,0.21,7.40,136.0,26.3,54.4,1.7,42.1,存活,28.0,,23.0,28.0,0.0
2,10003,对照药物,73,男,汉族,170,80.0,27.7,0,,0,腹腔感染,胆系,2,无,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.60,117,20,139,63,3,2,2,4,0,0,11,14,3.0,1,88.0,3.1,1,有创,0,1,肠内,3.41,109.0,6.31,93.4,2.3,72.0,30.0,124,107,85.7,7.4,97,11.54,3.45,139.7,4.7,12.2,29.7,551,15.9,5.86,7.25,101.3,19.3,50.6,3.1,45.9,存活,28.0,,6.0,12.0,1.0
3,10004,试验药物,52,女,汉族,160,55.0,21.5,0,,0,腹腔感染,胆系,2,无,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.00,91,19,134,61,2,1,1,4,0,0,8,4,2.0,1,85.0,1.1,1,氧疗,0,0,,3.59,109.0,30.97,90.14,3.62,127.0,31.7,271,200,24.9,9.6,60,7.54,3.84,139,3.65,15.9,32.1,4200,18.7,30,7.41,284.9,22.4,113.9,1.1,35.3,存活,28.0,,7.0,7.0,1.0
4,10005,对照药物,29,男,汉族,170,70.0,24.2,0,,0,肺部感染,呼吸系统,2,有,0.0,1.0,0.0,0.0,1.0,0.0,0.0,38.50,99,36,136,71,2,0,2,0,0,0,4,5,2.0,0,,,1,湿化氧疗仪,0,1,肠内,3.49,105.0,15.63,79.34,12.14,280.0,30.2,65,49,33.4,3.1,54,5.66,3.7,132.6,4.03,15.3,35.9,2089,190,0.55,7.48,242.8,22.7,97.1,0.8,30.3,存活,28.0,,12.0,28.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1812,450001,试验药物,66,男,汉族,171,52.0,17.8,1,200,0,腹腔感染,消化道,2,有,1.0,0.0,0.0,1.0,0.0,0.0,1.0,36.70,142,30,72,41,1,1,0,4,3,1,10,30,2.0,1,59.0,15.0,1,有创,0,0,,3.09,96.0,0.42,71,21,138.0,29.7,91,30,8.4,8.1,131,8.9,2.9,153,1.85,16.5,92.1,2.92,24.33,83.96,7.16,253.0,23.0,152.0,15.0,64.1,死亡,2.0,ICU,28.0,28.0,1.0
1813,450002,对照药物,58,男,汉族,180,68.0,21.0,1,300,1,腹腔感染,消化道,2,有,1.0,1.0,0.0,1.0,1.0,0.0,1.0,37.70,122,26,119,62,1,2,0,3,0,1,7,14,5.0,1,80.0,6.1,1,有创,1,1,肠外,3.84,103.0,2.18,1.73,0.28,90.0,32.4,93,182,9.2,12.2,168.4,5.9,3.7,142,3.07,17.7,53.3,6.83,67.59,100,7.25,370.0,20.5,148.0,6.1,46.9,存活,28.0,,27.0,28.0,1.0
1814,450003,对照药物,71,男,汉族,162,70.0,26.7,0,,0,腹腔感染,胆系,2,有,1.0,0.0,0.0,1.0,0.0,0.0,0.0,36.50,82,17,118,39,0,2,0,0,0,1,3,14,3.0,0,,,1,氧疗,0,1,肠外,2.04,61.0,16.14,14.67,3.6,52.0,18.8,14,18,5.5,13.7,147,11.2,4.2,144,3.34,10.3,23.6,2.48,,8.6,7.38,448.0,13.7,148.0,9.7,17.7,存活,28.0,,28.0,28.0,0.0
1815,450004,试验药物,62,女,汉族,162,64.0,24.4,0,,0,其他部位感染,口底多间隙感染,2,有,0.0,1.0,0.0,0.0,1.0,0.0,0.0,37.50,88,22,149,83,2,0,0,0,0,0,2,7,2.0,0,,,1,有创,0,1,肠内,4.65,145.0,13.24,80.4,12.3,318.0,28.0,4,28,9.2,5.6,77.6,8.2,3.3,136,5.28,9.7,33.6,0.91,101.2,0.095,7.50,287.0,27.0,115.0,0.6,33.7,存活,28.0,,4.0,8.0,0.0


In [5]:
df_clean = df_selected[['ID']].copy()
df_clean['XBJ_intervention'] = df_selected['试验组别'].replace({'试验药物':1, '对照药物':0})
# 排除标准特征对齐
# df_clean['metastatic_cancer'] = 0
# df_clean['hematologic_cancer'] = 0
# df_clean['AIDS'] = 0
df_clean['age'] = df_selected['年龄_x']
df_clean['sex'] = df_selected['性别_x'].replace({'男':1, '女':0})
# 民族，身高，体重
df_clean['BMI'] = df_selected['BMI']
# df_clean['smoke_status'] = df_selected['有无吸烟史'] # 1 有; 0 无
# df_clean['smoke_pack_year'] = (df_selected['吸烟指数'] / 20).fillna(0)  #! 有杂入文字
# df_clean['drink_status'] = df_selected['有无吸烟史'] # 1 有; 0 无
df_clean['primary_infection_site_lung'] = (df_selected['感染来源']=='肺部感染').astype(int)
df_clean['primary_infection_site_abdo'] = (df_selected['感染来源']=='腹腔感染').astype(int)
df_clean['primary_infection_site_uri'] = (df_selected['感染部位'].str.contains('泌尿')).astype(int)
df_clean['primary_infection_site_skin'] = (df_selected['感染部位'].str.contains('皮肤')).astype(int)
df_clean['primary_infection_site_brain'] = (df_selected['感染部位'].str.contains('颅内')).astype(int)
df_clean['primary_infection_site_blood'] = (df_selected['感染部位'].str.contains('血液|败血症')).astype(int)

df_clean['pathogen_test'] = df_selected['病原学检查（已做=2，未做=1）'].replace({2:0, 1:1}) # 0-已做; 1-未做， 用作缺失指示器
pathogen_list = ['Gram-_infect','Gram+_infect', 'Fungi_infect', # 感染
                'Gram_neg_resist','Gram_pos_resist','Fungi_resist','multidrug_resist'] # 耐药
df_clean[pathogen_list] = df_selected[pathogen_list].fillna(0) # 已清洗病原学数据直接复制，0代表缺失（未做病原学检查）

df_clean['temperature'] = df_selected['体温(℃)'].astype(float)
df_clean['heart_rate'] = df_selected['心率(次/分)'].astype(float)
df_clean['respir_rate'] = df_selected['呼吸(次/分)'].astype(float)
df_clean['SBP'] = df_selected['收缩压(mmHg)'].astype(float)
df_clean['DBP'] = df_selected['舒张压(mmHg)'].astype(float)
df_clean['SOFA_respiration'] = df_selected['呼吸功能[PaO2/FIO2(mmHg) SaO2/FIO2]'].astype(float)
df_clean['SOFA_coagulation'] = df_selected['凝血指标[血小板109/L]'].astype(float)
df_clean['SOFA_liver'] = df_selected['肝脏[胆红素(μmol/L)]'].astype(float)
df_clean['SOFA_cardio'] = df_selected['心血管[低血压]'].astype(float)
df_clean['SOFA_cns'] = df_selected['中枢神经系统[Glasgow昏迷分数]'].astype(float)
df_clean['SOFA_renal'] = df_selected['肾脏[肌酐(μmol/L)或尿量(mL/d)]'].astype(float)
df_clean['SOFA'] = df_selected['SOFA总分_y'].astype(float)
df_clean['APACHE_II'] = df_selected['APACHE_II_D1'].astype(float)
df_clean['DIC-score'] = df_selected['D1-DIC总分'].astype(float)
df_clean['septic_shock'] = df_selected['D1感染休克-0无/1有'].astype(int)
df_clean['MAP'] = df_selected['D1平均动脉压（MAP）mmHg'].astype(float)
df_clean['Lac'] = df_selected['D1血乳酸（Lac）mmol/l'].astype(float)

df_clean['Respiratory_Support'] = df_selected['D1是否使用呼吸支持-0未用/1使用'].astype(int)
mask_oxy = df_selected['D1呼吸支持类型'].str.contains('氧疗|吸氧', na=False)
mask_NIPPV = (df_selected['D1呼吸支持类型'] == '无创')
mask_MV = df_selected['D1呼吸支持类型'].str.contains('有创', na=False)
df_clean.loc[mask_oxy, 'Respiratory_Support'] = 1 # 非侵入、非机械通气
df_clean.loc[mask_NIPPV, 'Respiratory_Support'] = 2 # 无创正压通气(Non-invasive Positive Pressure Ventilation, NIPPV)
df_clean.loc[mask_MV, 'Respiratory_Support'] = 3 # 有创机械通气 (Mechanical Ventilation, MV)
df_clean['MV'] = df_selected['D1呼吸支持类型'].str.contains('有创', na=False).astype(int)
df_clean['NIPPV'] = df_selected['D1呼吸支持类型'].str.contains('无创', na=False).astype(int)
df_clean['MV/NIPPV'] = df_clean[['MV','NIPPV']].max(axis=1)

df_clean['CCRT'] = df_selected['D1是否使用CCRT-0未用/1使用'].astype(int) # Continuous Renal Replacement Therapy 持续性肾脏替代治疗

df_clean['nutri_support'] = df_selected['D1是否使用营养支持-0未用/1使用'].astype(int)
df_clean['nutri_support_enteral'] = df_selected['D1营养支持途径'].str.contains('肠内', na=False)
df_clean['nutri_support_parenteral'] = df_selected['D1营养支持途径'].str.contains('肠外', na=False)
# (df_clean['nutri_support_enteral'] + df_clean['nutri_support_parenteral'] == df_clean['nutri_support']).all() # 肠内/肠外均不使用时即为无营养支持

df_clean[['RBC','Hb','WBC', 'NE%', 'LYM%', 'PLT', 'HCT',
          'ALT', 'AST', 'STB', 'BUN', 'Scr', 'Glu', 'K+', 'Na+',
          'Fg', 'PT', 'APTT', 'D-Dimer',
          'CRP', 'PCT', 
          'PH', 'PaO2', 'PaO2/FiO2', 'PaCO2', 'HCO3-', 'Lac', 
          ]] = df_selected[['红细胞(RBC)', '血红蛋白(Hb)', '白细胞(WBC)', '中性粒细胞（N）', '淋巴细胞(L)', '血小板（PLT）', '红细胞压积（HCT）', # 血常规
                            '谷丙转氨酶（ALT)', '谷草转氨酶(AST)', '总胆红素(STB)', '尿素氮（BUN）', '血肌酐（Cr）', '血糖（Glu）', '血钾（K）', '血钠（Na）', # 血生化
                            '纤维蛋白原（Fg）', '凝血酶原时间（PT）', '部分活化凝血酶原时间（APTT）', 'D-二聚体（D-Dimer）', # 凝血
                            'C反应蛋白（CRP）', '降钙素原（PCT）', # 炎症
                            'PH', '氧合指数（PaO2/FiO2）mmHG', '碳酸氢（HCO3-）mmol/l', '氧分压（PaO2）mmHg', '血乳酸（Lac）mmol/l','二氧化碳分压（PaCO2）mmHg', # 血气
                            ]].replace(['ND','NK', 'NA', 'na'], np.nan)\
                                .applymap(lambda x: re.sub("＞|>|＜|<", '', x) if isinstance(x, str) else x)\
                                .astype(float)


df_clean['in_hospital_mortality'] = (df_selected['放弃治疗在院外死亡的，按院内死亡记录'].isin(['ICU', '其他科室', '院内'])).astype(int) # 0-未发生院内死亡 1-院内死亡
df_clean['28d_mortality'] = df_selected['28天预后'].replace({'存活':0, '死亡':1, ' ':np.nan}) # 0-存活 1-死亡
df_clean['7d_septic_shock'] = df_selected['D1-D7有无感染休克-0无/1有'].astype(float)

In [6]:
# 校验结局
mask_xbj = df_clean['XBJ_intervention'] == 1
mask_placebo = df_clean['XBJ_intervention'] == 0
mask_in_hos_death = (df_clean['in_hospital_mortality']==1)
mask_28d_death = (df_clean['28d_mortality']==1)

print(f'28天死亡: {mask_28d_death.sum()}({mask_28d_death.sum()/len(df_clean)*100:.2f}%); placebo:{(mask_placebo & mask_28d_death).sum()}; XBJ:{(mask_xbj & mask_28d_death).sum()}')
print(f'住院死亡: {mask_in_hos_death.sum()}({mask_in_hos_death.sum()/len(df_clean)*100:.2f}%); placebo:{(mask_placebo & mask_in_hos_death).sum()}; XBJ:{(mask_xbj & mask_in_hos_death).sum()}')

df_clean.to_csv(f'{DATA}/EXIT_SEP_clean.tsv.gz', sep='\t', compression='gzip', index=False)

28天死亡: 395(21.74%); placebo:230; XBJ:165
住院死亡: 350(19.26%); placebo:201; XBJ:149


In [7]:
df_clean_worse_case_both_die = df_clean.copy()
df_clean_worse_case_xbj_die = df_clean.copy()

# 假设缺失结局时两组都在28天死亡（只对主要结局指标分析）
df_clean_worse_case_both_die['28d_mortality'] = df_clean_worse_case_both_die['28d_mortality'].fillna(1)

mask_28d_death = (df_clean_worse_case_both_die['28d_mortality']==1)
print('假设缺失结局时两组都死亡')
print(f'28天死亡: {mask_28d_death.sum()}; placebo:{(mask_placebo & mask_28d_death).sum()}; XBJ:{(mask_xbj & mask_28d_death).sum()}')


# 假设缺失结局时XBJ组在28天死亡，对照组存活
mask_missing_28d = df_clean_worse_case_xbj_die['28d_mortality'].isna()
df_clean_worse_case_xbj_die.loc[(mask_xbj & mask_missing_28d), '28d_mortality'] = 1
df_clean_worse_case_xbj_die.loc[(mask_placebo & mask_missing_28d), '28d_mortality'] = 0

mask_28d_death = (df_clean_worse_case_xbj_die['28d_mortality']==1)
print('假设缺失结局时XBJ死亡,对照组存活')
print(f'28天死亡: {mask_28d_death.sum()}; placebo:{(mask_placebo & mask_28d_death).sum()}; XBJ:{(mask_xbj & mask_28d_death).sum()}')

df_clean_worse_case_both_die.to_csv(f'{DATA}/EXIT_SEP_worse_case_both_die.tsv.gz', sep='\t', compression='gzip', index=False)
df_clean_worse_case_xbj_die.to_csv(f'{DATA}/EXIT_SEP_worse_case_xbj_die.tsv.gz', sep='\t', compression='gzip', index=False)

假设缺失结局时两组都死亡
28天死亡: 452; placebo:254; XBJ:198
假设缺失结局时XBJ死亡,对照组存活
28天死亡: 428; placebo:230; XBJ:198


In [8]:
df_clean

Unnamed: 0,ID,XBJ_intervention,age,sex,BMI,primary_infection_site_lung,primary_infection_site_abdo,primary_infection_site_uri,primary_infection_site_skin,primary_infection_site_brain,primary_infection_site_blood,pathogen_test,Gram-_infect,Gram+_infect,Fungi_infect,Gram_neg_resist,Gram_pos_resist,Fungi_resist,multidrug_resist,temperature,heart_rate,respir_rate,SBP,DBP,SOFA_respiration,SOFA_coagulation,SOFA_liver,SOFA_cardio,SOFA_cns,SOFA_renal,SOFA,APACHE_II,DIC-score,septic_shock,MAP,Lac,Respiratory_Support,MV,NIPPV,MV/NIPPV,CCRT,nutri_support,nutri_support_enteral,nutri_support_parenteral,RBC,Hb,WBC,NE%,LYM%,PLT,HCT,ALT,AST,STB,BUN,Scr,Glu,K+,Na+,Fg,PT,APTT,D-Dimer,CRP,PCT,PH,PaO2,PaO2/FiO2,PaCO2,HCO3-,in_hospital_mortality,28d_mortality,7d_septic_shock
0,10001,1,40,1,24.7,0,0,0,0,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,38.0,102.0,29.0,120.0,61.0,4.0,2.0,0.0,3.0,0.0,0.0,9.0,13.0,3.0,1,81.0,31.4,1,0,0,0,0,1,True,False,3.07,92.0,0.36,0.04,75.04,91.0,26.4,53.0,46.0,19.5,2.9,60.0,10.83,3.40,133.1,5.11,12.0,35.4,412.00,128.00,14.390,7.50,86.0,21.5,43.6,1.2,0,0.0,1.0
1,10002,0,46,1,24.2,1,0,0,0,0,0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,38.2,112.0,31.0,101.0,61.0,3.0,0.0,0.0,0.0,4.0,0.0,7.0,23.0,2.0,0,,42.1,3,1,0,1,0,1,True,False,2.94,86.0,12.37,79.94,5.42,771.0,27.0,31.0,45.0,7.2,5.0,50.0,8.05,4.34,138.0,4.17,13.2,32.4,938.00,66.90,0.210,7.40,136.0,26.3,54.4,1.7,0,0.0,0.0
2,10003,0,73,1,27.7,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.6,117.0,20.0,139.0,63.0,3.0,2.0,2.0,4.0,0.0,0.0,11.0,14.0,3.0,1,88.0,45.9,3,1,0,1,0,1,True,False,3.41,109.0,6.31,93.40,2.30,72.0,30.0,124.0,107.0,85.7,7.4,97.0,11.54,3.45,139.7,4.70,12.2,29.7,551.00,15.90,5.860,7.25,101.3,19.3,50.6,3.1,0,0.0,1.0
3,10004,1,52,0,21.5,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.0,91.0,19.0,134.0,61.0,2.0,1.0,1.0,4.0,0.0,0.0,8.0,4.0,2.0,1,85.0,35.3,1,0,0,0,0,0,False,False,3.59,109.0,30.97,90.14,3.62,127.0,31.7,271.0,200.0,24.9,9.6,60.0,7.54,3.84,139.0,3.65,15.9,32.1,4200.00,18.70,30.000,7.41,284.9,22.4,113.9,1.1,0,0.0,1.0
4,10005,0,29,1,24.2,1,0,0,0,0,0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,38.5,99.0,36.0,136.0,71.0,2.0,0.0,2.0,0.0,0.0,0.0,4.0,5.0,2.0,0,,30.3,1,0,0,0,0,1,True,False,3.49,105.0,15.63,79.34,12.14,280.0,30.2,65.0,49.0,33.4,3.1,54.0,5.66,3.70,132.6,4.03,15.3,35.9,2089.00,190.00,0.550,7.48,242.8,22.7,97.1,0.8,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1812,450001,1,66,1,17.8,0,1,0,0,0,0,0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,36.7,142.0,30.0,72.0,41.0,1.0,1.0,0.0,4.0,3.0,1.0,10.0,30.0,2.0,1,59.0,64.1,3,1,0,1,0,0,False,False,3.09,96.0,0.42,71.00,21.00,138.0,29.7,91.0,30.0,8.4,8.1,131.0,8.90,2.90,153.0,1.85,16.5,92.1,2.92,24.33,83.960,7.16,253.0,23.0,152.0,15.0,1,1.0,1.0
1813,450002,0,58,1,21.0,0,1,0,0,0,0,0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,37.7,122.0,26.0,119.0,62.0,1.0,2.0,0.0,3.0,0.0,1.0,7.0,14.0,5.0,1,80.0,46.9,3,1,0,1,1,1,False,True,3.84,103.0,2.18,1.73,0.28,90.0,32.4,93.0,182.0,9.2,12.2,168.4,5.90,3.70,142.0,3.07,17.7,53.3,6.83,67.59,100.000,7.25,370.0,20.5,148.0,6.1,0,0.0,1.0
1814,450003,0,71,1,26.7,0,1,0,0,0,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,36.5,82.0,17.0,118.0,39.0,0.0,2.0,0.0,0.0,0.0,1.0,3.0,14.0,3.0,0,,17.7,1,0,0,0,0,1,False,True,2.04,61.0,16.14,14.67,3.60,52.0,18.8,14.0,18.0,5.5,13.7,147.0,11.20,4.20,144.0,3.34,10.3,23.6,2.48,,8.600,7.38,448.0,13.7,148.0,9.7,0,0.0,0.0
1815,450004,1,62,0,24.4,0,0,0,0,0,0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,37.5,88.0,22.0,149.0,83.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,7.0,2.0,0,,33.7,3,1,0,1,0,1,True,False,4.65,145.0,13.24,80.40,12.30,318.0,28.0,4.0,28.0,9.2,5.6,77.6,8.20,3.30,136.0,5.28,9.7,33.6,0.91,101.20,0.095,7.50,287.0,27.0,115.0,0.6,0,0.0,0.0
