In [None]:
# Import necessary packages
import pandas as pd
from omegaconf import OmegaConf

In [None]:
# Global configs: dataset locations and the random seed, declared as a YAML
# document and parsed by OmegaConf so the values can be read as attributes
# (e.g. `config.seed`, `config.train_data_path`).
# NOTE(review): "prerpocess" in train_data_path looks like a typo, but it may
# match the actual file name on disk -- confirm before changing it.
yaml_cfg = """
train_data_path: ./raw_data/time_series_375_prerpocess_en.xlsx
test_data_path: ./raw_data/time_series_test_110_preprocess_en.xlsx
seed: 42
"""
config = OmegaConf.create(yaml_cfg)
# Quick check that the YAML was parsed: prints the configured seed.
print(config.seed)


In [None]:
# Load the training spreadsheet named in the config and print summary
# statistics of its numeric columns.
train_path = config.train_data_path
df_train: pd.DataFrame = pd.read_excel(train_path)
print(df_train.describe())

Steps:

- forward-fill `PATIENT_ID` and keep only the date part (YYYY-MM-DD) of the `RE_DATE` column
- merge lab tests that share the same (`PATIENT_ID`, `RE_DATE`) pair by averaging them
- normalize data
- feature selection
- fill missing data (our filling strategy will be described below)
- combine the data above into time-series records (one record per patient)
- export to a Python pickle file

In [None]:
# Forward-fill `PATIENT_ID` so every row carries the id of the closest
# preceding row that has one.
# The result is assigned back to the column instead of calling
# `fillna(method='ffill', inplace=True)` on the column selection: the in-place
# call acts on a temporary object (it does not update `df_train` when pandas
# Copy-on-Write is active) and the `method=` argument is deprecated.
df_train['PATIENT_ID'] = df_train['PATIENT_ID'].ffill()

# Keep only the calendar date of `RE_DATE`, stored as a 'YYYY-MM-DD' string.
# Going through `pd.to_datetime` keeps this cell re-runnable: on a second run
# the column already holds strings, which have no `.dt` accessor.
df_train['RE_DATE'] = pd.to_datetime(df_train['RE_DATE']).dt.strftime('%Y-%m-%d')
print(df_train)

In [None]:
# Collapse all rows that share the same patient and date into a single row
# holding the mean of each column; the two keys become the index of the result.
visit_keys = ['PATIENT_ID', 'RE_DATE']
df_train = df_train.groupby(visit_keys).mean()
print(df_train)

In [None]:
# Names of the lab-test columns, one per entry. Some names carry a trailing
# space; it is part of the name and must be kept as is.
features_to_normalize = [
    'Hypersensitive cardiac troponinI',
    'hemoglobin',
    'Serum chloride',
    'Prothrombin time',
    'procalcitonin',
    'eosinophils(%)',
    'Interleukin 2 receptor',
    'Alkaline phosphatase',
    'albumin',
    'basophil(%)',
    'Interleukin 10',
    'Total bilirubin',
    'Platelet count',
    'monocytes(%)',
    'antithrombin',
    'Interleukin 8',
    'indirect bilirubin',
    'Red blood cell distribution width ',
    'neutrophils(%)',
    'total protein',
    'Quantification of Treponema pallidum antibodies',
    'Prothrombin activity',
    'HBsAg',
    'mean corpuscular volume',
    'hematocrit',
    'White blood cell count',
    'Tumor necrosis factorα',
    'mean corpuscular hemoglobin concentration',
    'fibrinogen',
    'Interleukin 1β',
    'Urea',
    'lymphocyte count',
    'PH value',
    'Red blood cell count',
    'Eosinophil count',
    'Corrected calcium',
    'Serum potassium',
    'glucose',
    'neutrophils count',
    'Direct bilirubin',
    'Mean platelet volume',
    'ferritin',
    'RBC distribution width SD',
    'Thrombin time',
    '(%)lymphocyte',
    'HCV antibody quantification',
    'D-D dimer',
    'Total cholesterol',
    'aspartate aminotransferase',
    'Uric acid',
    'HCO3-',
    'calcium',
    'Amino-terminal brain natriuretic peptide precursor(NT-proBNP)',
    'Lactate dehydrogenase',
    'platelet large cell ratio ',
    'Interleukin 6',
    'Fibrin degradation products',
    'monocytes count',
    'PLT distribution width',
    'globulin',
    'γ-glutamyl transpeptidase',
    'International standard ratio',
    'basophil count(#)',
    '2019-nCoV nucleic acid detection',
    'mean corpuscular hemoglobin ',
    'Activation of partial thromboplastin time',
    'Hypersensitive c-reactive protein',
    'HIV antibody quantification',
    'serum sodium',
    'thrombocytocrit',
    'ESR',
    'glutamic-pyruvic transaminase',
    'eGFR',
    'creatinine',
]
# Tab-separated form of the same names, kept under its original variable name.
features_str = '\t'.join(features_to_normalize)
print(features_to_normalize)


In [None]:
# normalize data
def normalize_data(df, num_meta_cols=7, eps=1e-12):
    """Z-score the feature columns of ``df`` and leave the leading columns as they are.

    The first ``num_meta_cols`` columns (by position) are copied through
    unchanged. Every remaining column is replaced by
    ``(x - mean(x)) / (std(x) + eps)``, computed per column.

    Args:
        df: Input frame; columns from position ``num_meta_cols`` onwards must
            support ``mean``/``std`` arithmetic.
        num_meta_cols: Number of leading columns to exclude from
            normalization. Defaults to 7, the offset this notebook has
            always used.
            NOTE(review): the offset is positional. If identifier columns
            were moved into the index by an earlier ``groupby``, the feature
            columns may start before position 7 -- confirm against
            ``df.columns`` and pass the right value.
        eps: Small constant added to the standard deviation so a constant
            column yields zeros instead of a division by zero.

    Returns:
        A new DataFrame with the same columns in the same order.
    """
    def _zscore(col):
        return (col - col.mean()) / (col.std() + eps)

    df_meta = df.iloc[:, :num_meta_cols]
    df_features = df.iloc[:, num_meta_cols:].apply(_zscore)
    return pd.concat([df_meta, df_features], axis=1)
# Replace the training frame with its normalized version and show the result.
df_train = df_train.pipe(normalize_data)
print(df_train)

In [None]:
def calculate_data_existing_length(data):
    """Count the entries of ``data`` that are not missing.

    An entry counts as missing when ``pd.isna`` reports it as such
    (for example ``None`` or ``NaN``).

    Args:
        data: Iterable of scalar values.

    Returns:
        The number of non-missing entries as an ``int``.
    """
    return sum(1 for value in data if not pd.isna(value))
# Assumes the elements of `data` are already sorted by time.
def our_fill(data, mean=0):
    """Fill the missing entries of an indexable sequence in place.

    Strategy:
      * nothing missing  -> ``data`` is returned untouched;
      * everything missing -> every entry is set to ``mean``;
      * otherwise -> leading missing entries take the first available value,
        and every later missing entry takes the value just before it.

    Args:
        data: Mutable sequence supporting ``len`` and integer indexing;
            it is modified in place.
        mean: Value used when no entry of ``data`` is available.

    Returns:
        The same ``data`` object that was passed in.
    """
    total = len(data)
    present = calculate_data_existing_length(data)

    if present == total:
        return data
    if present == 0:
        for pos in range(total):
            data[pos] = mean
        return data

    # At least one value exists here, so a first valid position is guaranteed.
    # Back-filling up to it ensures data[0] is set before the forward pass.
    first_valid = next(pos for pos in range(total) if not pd.isna(data[pos]))
    for pos in range(first_valid):
        data[pos] = data[first_valid]

    # Forward pass: each remaining gap takes the value of its predecessor.
    for pos in range(1, total):
        if pd.isna(data[pos]):
            data[pos] = data[pos - 1]
    return data

In [None]:
# Fill missing data using our strategy and convert to time series records.
features_to_fill = features_to_normalize
grouped = df_train.groupby('PATIENT_ID')

all_x = []    # per patient: list of visits, each a list of the filled feature values
all_y = []    # per patient: `outcome` taken from the patient's earliest visit
all_res = []  # full per-visit export of each patient (not populated in this cell)
for patient_id, group in grouped:
    sorted_group = group.sort_values(by=['RE_DATE'], ascending=True)
    for f in features_to_fill:
        # `our_fill` mutates the array it receives. Give it a writable copy
        # and store the result back explicitly instead of relying on
        # `.values` being a writable view of the frame.
        sorted_group[f] = our_fill(sorted_group[f].to_numpy(copy=True))
    all_y.append(sorted_group['outcome'].iloc[0])
    # One row per visit, in date order, restricted to the filled features.
    all_x.append(sorted_group[features_to_fill].to_numpy().tolist())

# all_x: 3-D nested list (patient -> visit -> feature value)
# all_y: outcome of each patient, in the same order as all_x
print(all_y)