In [1]:
'''post SQL data clean for MIMIC-IV'''

# 手动修改: 
# 处理数据: 无创通气数据.csv, ventilation 列名改为 non_invasive_ventilation
# 处理数据: 有创通气数据.csv, ventilation 列名改为 invasive_MV


import os, sys
import pandas as pd
pd.set_option('display.max_columns', None)

# 检测运行环境
def in_notebook():
    return 'IPKernelApp' in getattr(globals().get('get_ipython', lambda: None)(), 'config', {})

if in_notebook():
    notebook_dir = os.getcwd()
    src_path = os.path.abspath(os.path.join(notebook_dir, '..'))
else:
    src_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
sys.path.append(src_path) if src_path not in sys.path else None

from src.utils import *
from src.setup import *
from src.data_utils import replace_abnormal_values
MIMIC_IV_PATH = f'{ROOT}/data/MIMIC-IV-ICU-sepsis/' # 原始数据

# 合并数据

In [2]:

# 获取所有文件路径
filelist = getfiles(MIMIC_IV_PATH)

# 初始化 DataFrame 列表
df = None

# 合并所有表格
for file in filelist:
    fname = os.path.split(file)[1]
    df_temp = pd.read_csv(file, usecols=lambda x: not x in ['hadm_id'])

    required_columns = list({'subject_id', 'stay_id'} & set(df_temp.columns)) # 理论上同一个 stay_id 对应的 hadm_id(住院号) 应该一样，通过stay_id即可确定患者记录
    mask_dup = df_temp.duplicated(subset=required_columns, keep='first')
    print(f'处理数据: {fname}, 发现 {mask_dup.sum():,} 条重复数据。')

    df_temp = df_temp[~mask_dup].copy()
    df = df.merge(df_temp, on=['subject_id', 'stay_id'], how='left') if not df is None else df_temp

df_24h_sepsis3 = pd.read_csv(f'{DATA}/sepsis_within_24_hours_view.csv').drop_duplicates(['subject_id', 'stay_id'])
df = df_24h_sepsis3.merge(df, on=['subject_id', 'stay_id'], validate='1:1')
df['ID'] = df['subject_id'].astype(str) +'_'+ df['stay_id'].astype(str)
df.to_excel(f'{DATA}/MIMIC_IV_merged.xlsx', index=False)


处理数据: CRP数据.csv, 发现 0 条重复数据。
处理数据: CRRT数据.csv, 发现 0 条重复数据。
处理数据: pao2fio2ratio.csv, 发现 10 条重复数据。
处理数据: 人口统计学信息.csv, 发现 0 条重复数据。
处理数据: 住ICU的脓毒症患者.csv, 发现 0 条重复数据。
处理数据: 凝血数据.csv, 发现 0 条重复数据。
处理数据: 合并基础疾病.csv, 发现 0 条重复数据。
处理数据: 多种评分数据.csv, 发现 0 条重复数据。
处理数据: 无创机械通气.csv, 发现 0 条重复数据。
处理数据: 有创+无创机械通气.csv, 发现 0 条重复数据。
处理数据: 有创机械通气.csv, 发现 0 条重复数据。
处理数据: 生化肝肾功数据.csv, 发现 0 条重复数据。
处理数据: 脓毒症休克数据.csv, 发现 0 条重复数据。
处理数据: 脓毒症评分.csv, 发现 0 条重复数据。
处理数据: 血常规数据.csv, 发现 0 条重复数据。
处理数据: 血气分析（包括血乳酸）数据.csv, 发现 0 条重复数据。
处理数据: 血气补充数据.csv, 发现 0 条重复数据。
处理数据: 预后数据.csv, 发现 0 条重复数据。


  df['ID'] = df['subject_id'].astype(str) +'_'+ df['stay_id'].astype(str)


# 选择变量

In [3]:
SELECTED_VAR = ['ID',
                'age',
                'gender',
                'height',
                'weight',
                # 'BMI', # to derive
                'vitaltemperature', # 体温
                'vitalhr', # 心率
                'vitalrr', # 呼吸频率
                'vitalnbps', # SBP
                'vitalnbpd', # DBP
                'vitalnbpm', # MAP
                'vitalspo2', # 血氧饱和度

                'cancer',
                'ganyinghua',
                'manxingxinshuai',
                'secondarymalignantneoplasm',
                'bloodcancer_baixuebing',
                'bloodcancer_linbaliu',
                'bloodcancer_gusuiliu',
                'hivaids',

                'sofa_respiration',
                'sofa_coagulation',
                'sofa_liver',
                'sofa_cardiovascular',
                'sofa_renal',
                'sofa_cns',
                'sofa_score',
                
                'apsiii',
                'sirs',
                'sapsii',
                'oasis',
                'gcs',
                'charlson',

                'crrt',
                'ventilation',
                'invasive_MV',
                'non_invasive_ventilation',

                'labrbc',  # RBC
                'labwbc',  # WBC
                'labhemoglobin',  # Hb
                'labneutrophilcount',  # NE#
                'lablymphocytes',  # LYN#
                'labplateletcount',  # PLT
                'labhematocrit',  # HCT

                'labalt', 
                'labast', 
                'labbilirubintotal', # labbilirubintotal #! mg/dL
                'labureanitrogen', # BUN, #! mg/dL
                'labcreatinine', # Scr, #! mg/dL
                'laba1c', #? Hba1c
                'labglucose', # Glu, #! mg/dL
                'labsodium', # Na
                'labpotassium', # K
                'labcalciumtotal', #? Ca
                'labchloride', #? Cl

                'first_crp', # CRP

                'labfibrinogen', # Fg
                'labptt', # APTT
                'labpt', # 凝血酶原时间
                'labddimer', # D-二聚体

                'labph', # PH值
                'labpo2', # PaO2
                'pao2fio2ratio', # PaO2/FiO2
                'first_bicarbonateblood', # HCO3-
                'labpco2', # PaCO2
                'lablactate', # Lac 血乳酸(mmol/L)

                'hosp_survival_day', 'icu_survival_day','death_within_hosp_28days', 'death_within_icu_28days',
                'is_hosp_dead', 'is_icu_dead'
                ]

df_selected = df[SELECTED_VAR].set_index('ID')
df_selected.to_excel(f'{DATA}/EXIT_SEP_selected.xlsx')

# 清洗数据

In [4]:
df_selected
# (df_selected['ventilation'].fillna(0) == df_selected['invasive_MV'].fillna(df_selected['non_invasive_ventilation']).fillna(0)).all()
# df_selected['vitalnbps'].describe()
# df_selected['ventilation'].sum()

Unnamed: 0_level_0,age,gender,height,weight,vitaltemperature,vitalhr,vitalrr,vitalnbps,vitalnbpd,vitalnbpm,vitalspo2,cancer,ganyinghua,manxingxinshuai,secondarymalignantneoplasm,bloodcancer_baixuebing,bloodcancer_linbaliu,bloodcancer_gusuiliu,hivaids,sofa_respiration,sofa_coagulation,sofa_liver,sofa_cardiovascular,sofa_renal,sofa_cns,sofa_score,apsiii,sirs,sapsii,oasis,gcs,charlson,crrt,ventilation,invasive_MV,non_invasive_ventilation,labrbc,labwbc,labhemoglobin,labneutrophilcount,lablymphocytes,labplateletcount,labhematocrit,labalt,labast,labbilirubintotal,labureanitrogen,labcreatinine,laba1c,labglucose,labsodium,labpotassium,labcalciumtotal,labchloride,first_crp,labfibrinogen,labptt,labpt,labddimer,labph,labpo2,pao2fio2ratio,first_bicarbonateblood,labpco2,lablactate,hosp_survival_day,icu_survival_day,death_within_hosp_28days,death_within_icu_28days,is_hosp_dead,is_icu_dead
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1
10000690_37081114,87,F,,55.30,36.50,79.0,23.0,107.0,63.0,71.0,100.0,,,,,,,,,2,0,0,1,0,0,3,52,3,40,41,12.0,5,,1.0,,1.0,3.07,7.5,9.5,,,199.0,28.5,9.0,14.0,0.4,21.0,0.9,,77.0,137.0,4.4,9.0,104.0,,,29.7,12.1,,7.45,68.0,123.000000,26.0,52.0,,453.25,453.18,0,0,0,0
10001843_39698942,77,M,,76.80,36.61,118.0,17.0,112.0,71.0,79.0,88.0,1.0,,,1.0,,,,,0,0,2,0,1,0,3,90,3,69,38,13.0,15,,,,,3.24,10.5,9.1,,,609.0,31.6,90.0,196.0,4.5,33.0,1.7,,108.0,137.0,5.4,8.2,101.0,,,42.0,23.5,,7.18,50.0,103.666667,20.0,84.0,1.7,0.99,0.22,1,1,1,1
10002013_39060235,57,F,157.0,100.05,36.22,80.0,14.0,104.0,70.0,77.0,100.0,,,1.0,,,,,,1,0,0,1,0,0,2,29,3,32,32,15.0,7,,1.0,1.0,1.0,2.95,18.2,10.2,,,252.0,28.6,,,,16.0,1.1,,98.0,140.0,4.0,,109.0,,212.0,25.4,12.7,,7.35,421.0,256.000000,23.0,45.0,3.3,,,0,0,0,0
10002114_34672098,56,M,173.0,64.10,36.61,105.0,22.0,104.0,81.0,89.0,100.0,,1.0,,,,,,,1,1,0,1,2,0,5,59,3,43,32,15.0,2,,1.0,,1.0,3.44,8.4,10.8,5.45,0.73,113.0,30.1,28.0,62.0,1.1,44.0,2.7,4.9,132.0,125.0,2.8,8.1,67.0,,230.0,31.4,21.8,,7.65,148.0,463.333333,41.0,40.0,5.2,296.06,296.02,0,0,0,0
10002155_33685454,82,F,,54.00,35.50,68.0,18.0,126.0,61.0,78.0,97.0,,,,,,,,,0,0,0,1,1,0,2,37,1,31,29,15.0,10,,1.0,,1.0,4.19,5.5,12.5,,,185.0,37.9,,,,19.0,0.9,,95.0,139.0,4.5,8.8,106.0,,,40.7,11.9,,7.36,76.0,0.000000,25.0,48.0,,582.47,582.47,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19998591_36794489,54,F,,49.40,35.39,98.0,13.0,111.0,57.0,69.0,99.0,,,1.0,,,,,,0,0,0,4,2,1,7,86,4,44,33,9.0,3,,1.0,,1.0,2.92,12.0,10.7,,,352.0,33.4,20.0,41.0,0.7,69.0,2.4,,93.0,165.0,3.6,6.2,131.0,,,43.4,48.0,,7.33,43.0,0.000000,25.0,52.0,1.3,154.15,154.05,0,0,0,0
19998770_37676535,50,F,,67.90,36.94,100.0,14.0,,,,97.0,,,,,,,,,0,0,0,1,0,1,2,24,1,20,23,14.0,1,,1.0,,1.0,3.77,8.3,12.8,,,253.0,37.0,10.0,18.0,,10.0,0.7,,146.0,142.0,4.0,8.2,105.0,,,28.0,12.1,,,,0.000000,26.0,,,,,0,0,0,0
19998843_30988867,45,M,180.0,90.00,3.89,46.0,21.0,,,,100.0,,,,,,,,,1,1,0,1,0,0,3,83,4,60,40,3.0,0,,,,,3.40,15.1,10.4,,,135.0,30.6,15.0,39.0,0.2,18.0,1.1,5.5,108.0,130.0,4.4,6.6,104.0,,115.0,30.4,14.5,,7.36,353.0,700.250000,22.0,39.0,1.8,2.61,2.58,1,1,1,1
19999442_32336619,44,M,193.0,107.50,36.83,88.0,15.0,150.0,90.0,103.0,96.0,,,,,,,,,0,1,0,0,0,2,3,28,3,20,25,10.0,4,,1.0,1.0,1.0,4.63,9.5,14.3,,,126.0,40.9,63.0,20.0,0.4,12.0,0.9,,136.0,141.0,3.7,8.7,107.0,,,23.3,13.1,,7.46,131.0,460.000000,23.0,32.0,3.0,,,0,0,0,0


In [7]:
df_clean = df_selected[['age']].copy()
df_clean['sex'] = df_selected['gender'].replace({'M':1, 'F':0})

df_clean['weight'] = replace_abnormal_values(df_selected['weight'], lower_bound=20, upper_bound=500)  # 体重 异常值排除
df_clean['height'] = df_selected['height'] # 留作缺失填补用， 建模时只留BMI

df_clean['BMI'] = df_clean['weight'] / ((df_selected['height']/100)**2)
df_clean['BMI'] = replace_abnormal_values(df_clean['BMI'], lower_bound=10, upper_bound=70)  # BMI 异常值排除

df_clean['temperature'] = replace_abnormal_values(df_selected['vitaltemperature'], lower_bound=32, upper_bound=42)  # 体温 异常值排除
df_clean['heart_rate'] = replace_abnormal_values(df_selected['vitalhr'], lower_bound=20, upper_bound=400)  # 心率 异常值排除
df_clean['respir_rate'] = replace_abnormal_values(df_selected['vitalrr'], lower_bound=6, upper_bound=60) # 呼吸
df_clean['SBP'] = replace_abnormal_values(df_selected['vitalnbps'], lower_bound=30, upper_bound=300) # SBP 异常值排除
df_clean['DBP'] = replace_abnormal_values(df_selected['vitalnbpd'], lower_bound=20, upper_bound=200) # DBP
df_clean['MAP'] = replace_abnormal_values(df_selected['vitalnbpm'], lower_bound=20, upper_bound=200) # MAP
df_clean['SPO2'] = replace_abnormal_values(df_selected['vitalspo2'], lower_bound=50, upper_bound=100) # SPO2

df_clean['cancer'] = df_selected['cancer'].fillna(0) # model
df_clean['liver_cirrhosis'] = df_selected['ganyinghua'].fillna(0)
df_clean['chronic_heart_failure'] = df_selected['manxingxinshuai'].fillna(0)
df_clean['metastatic_cancer'] = df_selected['secondarymalignantneoplasm'].fillna(0) # model
df_clean['leukemia'] = df_selected['bloodcancer_baixuebing'].fillna(0)
df_clean['lymphoma'] = df_selected['bloodcancer_linbaliu'].fillna(0)
df_clean['myeloma'] = df_selected['bloodcancer_gusuiliu'].fillna(0)
df_clean['hematologic_cancer'] = df_clean[['leukemia', 'lymphoma', 'myeloma']].mean(axis=1) # model
df_clean['AIDS'] = df_selected['hivaids'].fillna(0) # model

df_clean['SOFA_respiration'] = replace_abnormal_values(df_selected['sofa_respiration'], lower_bound=0, upper_bound=4)
df_clean['SOFA_coagulation'] = replace_abnormal_values(df_selected['sofa_coagulation'], lower_bound=0, upper_bound=4)
df_clean['SOFA_liver'] = replace_abnormal_values(df_selected['sofa_liver'], lower_bound=0, upper_bound=4)
df_clean['SOFA_cardio'] = replace_abnormal_values(df_selected['sofa_cardiovascular'], lower_bound=0, upper_bound=4)
df_clean['SOFA_cns'] = replace_abnormal_values(df_selected['sofa_cns'], lower_bound=0, upper_bound=4)
df_clean['SOFA_renal'] = replace_abnormal_values(df_selected['sofa_renal'], lower_bound=0, upper_bound=4)
df_clean['SOFA'] = replace_abnormal_values(df_selected['sofa_score'], lower_bound=0, upper_bound=24)
df_clean['GCS'] = replace_abnormal_values(df_selected['gcs'], lower_bound=0, upper_bound=15) # Glasgow Coma Scale 昏迷评分

df_clean['APS_III'] = replace_abnormal_values(df_selected['apsiii'], lower_bound=0, upper_bound=163) 
df_clean['SIRS'] = replace_abnormal_values(df_selected['sirs'], lower_bound=0, upper_bound=4) 
df_clean['SAPS_II'] = replace_abnormal_values(df_selected['sapsii'], lower_bound=0, upper_bound=163) 
df_clean['OASIS'] = replace_abnormal_values(df_selected['oasis'], lower_bound=0, upper_bound=83)
df_clean['Charlson'] = replace_abnormal_values(df_selected['charlson'], lower_bound=0, upper_bound=37) 

df_clean['CCRT'] = df_selected['crrt'].fillna(0)
df_clean['MV'] = df_selected['invasive_MV'].fillna(0)
df_clean['NIPPV'] = df_selected['non_invasive_ventilation'].fillna(0)

df_clean['RBC'] = replace_abnormal_values(df_selected['labrbc'], lower_bound=1, upper_bound=10)
df_clean['WBC'] = replace_abnormal_values(df_selected['labwbc'], lower_bound=1, upper_bound=100)
df_clean['Hb'] = replace_abnormal_values(df_selected['labhemoglobin']*10, lower_bound=10, upper_bound=250)
df_clean['NE#'] = replace_abnormal_values(df_selected['labneutrophilcount'], lower_bound=0.5, upper_bound=50)
df_clean['NE%'] = replace_abnormal_values(df_clean['NE#'] / df_clean['WBC'], lower_bound=0.05, upper_bound=1)
df_clean['LYM#'] = replace_abnormal_values(df_selected['lablymphocytes'], lower_bound=0.2, upper_bound=10)
df_clean['LYM%'] = replace_abnormal_values(df_clean['LYM#'] / df_clean['WBC'], lower_bound=0.01, upper_bound=1)
df_clean['PLT'] = replace_abnormal_values(df_selected['labplateletcount'], lower_bound=10, upper_bound=1000)
df_clean['HCT'] = replace_abnormal_values(df_selected['labhematocrit'], lower_bound=10, upper_bound=80)

df_clean['ALT'] = replace_abnormal_values(df_selected['labalt'], lower_bound=0, upper_bound=5000)
df_clean['AST'] = replace_abnormal_values(df_selected['labast'], lower_bound=0, upper_bound=5000)
df_clean['STB'] = replace_abnormal_values(df_selected['labbilirubintotal'] * 17.104, lower_bound=0, upper_bound=400) # mg/dL -> umol/L
df_clean['BUN'] = replace_abnormal_values(df_selected['labureanitrogen'] * 0.357, lower_bound=0, upper_bound=50) # mg/dL -> mmol/L
df_clean['Scr'] = replace_abnormal_values(df_selected['labcreatinine'] * 88.4, lower_bound=10, upper_bound=1000) # mg/dL -> umol/L
df_clean['HbA1c'] = replace_abnormal_values(df_selected['laba1c'], lower_bound=0, upper_bound=20)
df_clean['Glu'] = replace_abnormal_values(df_selected['labglucose'] * 0.0555, lower_bound=1, upper_bound=50) # mg/dL -> mmol/L
df_clean['K+'] = replace_abnormal_values(df_selected['labpotassium'], lower_bound=1, upper_bound=8)
df_clean['Na+'] = replace_abnormal_values(df_selected['labsodium'], lower_bound=120, upper_bound=160)
df_clean['Ca2+'] = replace_abnormal_values(df_selected['labcalciumtotal'] * 0.25, lower_bound=1, upper_bound=4)
df_clean['Cl-'] = replace_abnormal_values(df_selected['labchloride'], lower_bound=80, upper_bound=120)

df_clean['Fg'] = replace_abnormal_values(df_selected['labfibrinogen'] * 0.01, lower_bound=0, upper_bound=10) # mg/dL -> g/L
df_clean['PT'] = replace_abnormal_values(df_selected['labpt'], lower_bound=0, upper_bound=50)
df_clean['APTT'] = replace_abnormal_values(df_selected['labptt'], lower_bound=0, upper_bound=100)
df_clean['D-Dimer'] = replace_abnormal_values(df_selected['labddimer'], lower_bound=0, upper_bound=np.inf)
df_clean['CRP'] = replace_abnormal_values(df_selected['first_crp'], lower_bound=0, upper_bound=500)
# MIMIC 无 PCT 
df_clean['PH'] = replace_abnormal_values(df_selected['labph'], lower_bound=6.5, upper_bound=8.0)
df_clean['PaO2/FiO2'] = replace_abnormal_values(df_selected['pao2fio2ratio'], lower_bound=0, upper_bound=1000)
df_clean['HCO3-'] = replace_abnormal_values(df_selected['first_bicarbonateblood'], lower_bound=5, upper_bound=50)
df_clean['PaO2'] = replace_abnormal_values(df_selected['labpo2'], lower_bound=30, upper_bound=1000)
df_clean['Lac'] = replace_abnormal_values(df_selected['lablactate'], lower_bound=0.1, upper_bound=20)
df_clean['PaCO2'] = replace_abnormal_values(df_selected['labpco2'], lower_bound=10, upper_bound=120)

df_clean['in_hospital_mortality'] = df_selected[['is_hosp_dead','is_icu_dead']].max(axis=1).fillna(0)
df_clean['28d_mortality'] = df_selected[['death_within_hosp_28days','death_within_icu_28days']].max(axis=1).fillna(0)

# 排除24h内死亡患者
mask_death_in_24h = (df_selected['hosp_survival_day'] < 1) | (df_selected['icu_survival_day'] < 1)
df_clean = df_clean[~mask_death_in_24h].copy()
print(f"排除: {mask_death_in_24h.sum()} 入院24小时内死亡")

# 排除缺失率过高的患者
mask_missing_patient = (df_clean.isna().sum(axis=1) / df_clean.shape[1]) >= 0.7
df_clean = df_clean[~mask_missing_patient].copy()
print(f"排除: {mask_missing_patient.sum()} 缺失特征>=70% ")

# 排除缺失率过高的特征
# mask_missing_feature = (df_clean.isna().sum() / df_clean.shape[0]) >= 0.7
# exclude_feature = df_clean.columns[mask_missing_feature]
# print(f"排除: {list(exclude_feature)} 数据缺失率>=70% ")
# df_clean = df_clean.drop(columns=exclude_feature)
for feature, missing_rate in (df_clean.isna().sum() / df_clean.shape[0]).to_dict().items():
    if missing_rate >= 0.9:
        mark = '***'
    elif missing_rate >= 0.7:
        mark = '**'
    elif missing_rate >= 0.5:
        mark = '*'
    else:
        mark = ''
    print(f'{feature:>20} | {missing_rate*100:.2f}%  {mark}')

mask_in_hos_death = (df_clean['in_hospital_mortality']==1)
mask_28d_death = (df_clean['28d_mortality']==1)
print(f'28天死亡: {mask_28d_death.sum()} ({mask_28d_death.sum()/len(df_clean)*100:.2f}%)')
print(f'住院死亡: {mask_in_hos_death.sum()} ({mask_in_hos_death.sum()/len(df_clean)*100:.2f}%)')
df_clean.to_csv(f'{DATA}/MIMIC_IV_clean.tsv.gz', sep='\t', compression='gzip')

              weight | 12 (0.06%)	 abnormal couunts
                 BMI | 33 (0.16%)	 abnormal couunts
    vitaltemperature | 83 (0.40%)	 abnormal couunts
             vitalhr | 7 (0.03%)	 abnormal couunts
             vitalrr | 115 (0.56%)	 abnormal couunts
           vitalnbps | 0 (0.00%)	 abnormal couunts
           vitalnbpd | 25 (0.12%)	 abnormal couunts
           vitalnbpm | 10 (0.05%)	 abnormal couunts
           vitalspo2 | 16 (0.08%)	 abnormal couunts
    sofa_respiration | 0 (0.00%)	 abnormal couunts
    sofa_coagulation | 0 (0.00%)	 abnormal couunts
          sofa_liver | 0 (0.00%)	 abnormal couunts
 sofa_cardiovascular | 0 (0.00%)	 abnormal couunts
            sofa_cns | 0 (0.00%)	 abnormal couunts
          sofa_renal | 0 (0.00%)	 abnormal couunts
          sofa_score | 0 (0.00%)	 abnormal couunts
                 gcs | 0 (0.00%)	 abnormal couunts
              apsiii | 10 (0.05%)	 abnormal couunts
                sirs | 0 (0.00%)	 abnormal couunts
              sapsii |

In [8]:
df_clean

Unnamed: 0_level_0,age,sex,weight,height,BMI,temperature,heart_rate,respir_rate,SBP,DBP,MAP,SPO2,cancer,liver_cirrhosis,chronic_heart_failure,metastatic_cancer,leukemia,lymphoma,myeloma,hematologic_cancer,AIDS,SOFA_respiration,SOFA_coagulation,SOFA_liver,SOFA_cardio,SOFA_cns,SOFA_renal,SOFA,GCS,APS_III,SIRS,SAPS_II,OASIS,Charlson,CCRT,MV,NIPPV,RBC,WBC,Hb,NE#,NE%,LYM#,LYM%,PLT,HCT,ALT,AST,STB,BUN,Scr,HbA1c,Glu,K+,Na+,Ca2+,Cl-,Fg,PT,APTT,D-Dimer,CRP,PH,PaO2/FiO2,HCO3-,PaO2,Lac,PaCO2,in_hospital_mortality,28d_mortality
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1
10000690_37081114,87,0,55.30,,,36.50,79.0,23.0,107.0,63.0,71.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,3.0,12.0,52.0,3.0,40.0,41.0,5.0,0.0,0.0,1.0,3.07,7.5,95.0,,,,,199.0,28.5,9.0,14.0,6.8416,7.497,79.56,,4.2735,4.4,137.0,2.250,104.0,,12.1,29.7,,,7.45,123.000000,26.0,68.0,,52.0,0,0
10002013_39060235,57,0,100.05,157.0,40.589882,36.22,80.0,14.0,104.0,70.0,77.0,100.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,15.0,29.0,3.0,32.0,32.0,7.0,0.0,1.0,1.0,2.95,18.2,102.0,,,,,252.0,28.6,,,,5.712,97.24,,5.4390,4.0,140.0,,109.0,2.12,12.7,25.4,,,7.35,256.000000,23.0,421.0,3.3,45.0,0,0
10002114_34672098,56,1,64.10,173.0,21.417354,36.61,105.0,22.0,104.0,81.0,89.0,100.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2.0,5.0,15.0,59.0,3.0,43.0,32.0,2.0,0.0,0.0,1.0,3.44,8.4,108.0,5.45,0.64881,0.73,0.086905,113.0,30.1,28.0,62.0,18.8144,15.708,238.68,4.9,7.3260,2.8,125.0,2.025,,2.30,21.8,31.4,,,7.65,463.333333,41.0,148.0,5.2,40.0,0,0
10002155_33685454,82,0,54.00,,,35.50,68.0,18.0,126.0,61.0,78.0,97.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,15.0,37.0,1.0,31.0,29.0,10.0,0.0,0.0,1.0,4.19,5.5,125.0,,,,,185.0,37.9,,,,6.783,79.56,,5.2725,4.5,139.0,2.200,106.0,,11.9,40.7,,,7.36,0.000000,25.0,76.0,,48.0,0,0
10002443_35044219,54,1,156.10,178.0,49.267769,35.94,81.0,23.0,136.0,81.0,99.0,94.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,15.0,41.0,4.0,28.0,24.0,3.0,0.0,0.0,1.0,4.69,20.7,144.0,,,,,307.0,43.4,80.0,68.0,8.5520,7.140,70.72,,12.5430,5.2,133.0,1.900,101.0,,13.8,24.6,,,7.27,74.000000,20.0,74.0,1.7,49.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19998591_36794489,54,0,49.40,,,35.39,98.0,13.0,111.0,57.0,69.0,99.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,7.0,9.0,86.0,4.0,44.0,33.0,3.0,0.0,0.0,1.0,2.92,12.0,107.0,,,,,352.0,33.4,20.0,41.0,11.9728,24.633,212.16,,5.1615,3.6,,1.550,,,48.0,43.4,,,7.33,0.000000,25.0,43.0,1.3,52.0,0,0
19998770_37676535,50,0,67.90,,,36.94,100.0,14.0,,,,97.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,14.0,24.0,1.0,20.0,23.0,1.0,0.0,0.0,1.0,3.77,8.3,128.0,,,,,253.0,37.0,10.0,18.0,,3.570,61.88,,8.1030,4.0,142.0,2.050,105.0,,12.1,28.0,,,,0.000000,26.0,,,,0,0
19998843_30988867,45,1,90.00,180.0,27.777778,,46.0,21.0,,,,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,3.0,3.0,83.0,4.0,60.0,40.0,0.0,0.0,0.0,0.0,3.40,15.1,104.0,,,,,135.0,30.6,15.0,39.0,3.4208,6.426,97.24,5.5,5.9940,4.4,130.0,1.650,104.0,1.15,14.5,30.4,,,7.36,700.250000,22.0,353.0,1.8,39.0,1,1
19999442_32336619,44,1,107.50,193.0,28.859835,36.83,88.0,15.0,150.0,90.0,103.0,96.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,3.0,10.0,28.0,3.0,20.0,25.0,4.0,0.0,1.0,1.0,4.63,9.5,143.0,,,,,126.0,40.9,63.0,20.0,6.8416,4.284,79.56,,7.5480,3.7,141.0,2.175,107.0,,13.1,23.3,,,7.46,460.000000,23.0,131.0,3.0,32.0,0,0
