In [None]:
# Import necessary packages
import pandas as pd
from omegaconf import OmegaConf

In [None]:
# Global configs
# The settings are written as an inline YAML document and parsed with OmegaConf,
# so later cells read them by attribute (e.g. `config.seed`, `config.train_data_path`).
# `predict_target` names the label column picked up when the time-series records are
# built; inside the YAML, `# LOS` is a YAML comment noting the alternative target.
# NOTE(review): `train_data_path` spells "prerpocess" while `test_data_path` spells
# "preprocess" — confirm which spelling the file on disk actually uses before renaming.
yaml_cfg = """
train_data_path: ./raw_data/time_series_375_prerpocess_en.xlsx
test_data_path: ./raw_data/time_series_test_110_preprocess_en.xlsx
seed: 42
predict_target: outcome # LOS
"""
config = OmegaConf.create(yaml_cfg)
# Quick sanity check that the YAML was parsed.
print(config.seed)


In [None]:
# Read raw data
# Loads the training spreadsheet referenced by `config.train_data_path`.
# NOTE(review): the next cells subtract `RE_DATE` / `Admission time` from
# `Discharge time` and call `.dt` on the results, so these columns are presumably
# parsed as datetimes by `read_excel` — confirm.
df_train: pd.DataFrame = pd.read_excel(config.train_data_path)

Steps:

- forward-fill the missing `PATIENT_ID` values
- add 2 new columns: total days in hospital (`TOT_DAY`) and remaining days in hospital (`LOS`)
- keep only the date part (YYYY-MM-DD) of the `RE_DATE` column
- merge the lab tests that share the same (`PATIENT_ID`, date)
- normalize the data
- select features
- fill missing data (our filling strategy is described below)
- combine the data above into time-series records (one record per patient)
- export to Python pickle files

In [None]:
# Fill `PATIENT_ID` rows by carrying the last seen id forward.
# The result is assigned back explicitly: calling `fillna(..., inplace=True)` on the
# `df_train['PATIENT_ID']` selection is a chained in-place update, which does not
# reach `df_train` when pandas Copy-on-Write is active, and `fillna(method=...)` is
# deprecated in favour of `ffill()`.
# NOTE(review): presumably the sheet only records the id on a patient's first row — confirm.
df_train['PATIENT_ID'] = df_train['PATIENT_ID'].ffill()

# Add 2 new columns, both in whole days:
#   `LOS`     - days from this record (`RE_DATE`) until discharge, i.e. remaining stay
#   `TOT_DAY` - days from admission until discharge, i.e. total stay
df_train['LOS'] = (df_train['Discharge time'] - df_train['RE_DATE']).dt.days
df_train['TOT_DAY'] = (df_train['Discharge time'] - df_train['Admission time']).dt.days

# Only keep y-m-d for the `RE_DATE` column. This must come after `LOS` is computed,
# because `RE_DATE` becomes a plain string here (re-running this cell on the
# already-converted frame would therefore fail on the date arithmetic above).
df_train['RE_DATE'] = df_train['RE_DATE'].dt.strftime('%Y-%m-%d')

In [None]:
# merge lab tests of the same (patient_id, date)
# Rows sharing a (`PATIENT_ID`, `RE_DATE`) pair are collapsed into a single row that
# holds the column-wise mean; the two grouping keys become the index levels of the
# resulting frame.
# NOTE(review): `mean()` is applied to every remaining column, not only the lab
# values — confirm that all of them can be averaged with the installed pandas version.
df_train = df_train.groupby(['PATIENT_ID', 'RE_DATE']).mean()


In [None]:
# Names of the model inputs (demographics first, then lab tests), one per line.
# They are used verbatim as DataFrame column labels further down, so spelling,
# case and whitespace matter.
# NOTE(review): 'mean corpuscular hemoglobin ' ends with a space — presumably this
# mirrors the spreadsheet header; confirm before "cleaning" it.
features_str = "\n".join((
    "age",
    "gender",
    "White blood cell count",
    "Red blood cell count",
    "Serum potassium",
    "calcium",
    "hemoglobin",
    "Serum chloride",
    "serum sodium",
    "Platelet count",
    "neutrophils(%)",
    "neutrophils count",
    "mean corpuscular hemoglobin ",
    "Urea",
    "eGFR",
    "aspartate aminotransferase",
    "Lactate dehydrogenase",
    "total protein",
    "Alkaline phosphatase",
    "Total bilirubin",
    "γ-glutamyl transpeptidase",
    "glucose",
    "Hypersensitive c-reactive protein",
    "Prothrombin time",
    "Prothrombin activity",
    "Hypersensitive cardiac troponinI",
    "Amino-terminal brain natriuretic peptide precursor(NT-proBNP)",
))

# Ordered list of feature names; this order defines the layout of every feature vector.
features = features_str.split('\n')
print(features)
# print(df_train[features])

# we have 2 types of prediction tasks: 1) predict mortality outcome, 2) length of stay

# df_train.to_csv('a.csv')

In [None]:
# normalize data
def normalize_data(df, feature_cols=None, label_cols=('outcome', 'LOS'), eps=1e-12):
    """Z-score the feature columns of ``df`` and return them next to the labels.

    Each feature column is centred on its own mean and divided by its own
    standard deviation (pandas' default sample std). ``eps`` is added to the
    denominator so that a constant column maps to 0 instead of dividing by zero.

    Args:
        df: Frame containing both the label columns and the feature columns.
        feature_cols: Columns to normalize. Defaults to the notebook-level
            ``features`` list, which keeps the original one-argument call working.
        label_cols: Columns copied through unchanged and placed first in the result.
        eps: Small constant added to the standard deviation.

    Returns:
        A new DataFrame with ``label_cols`` followed by the normalized
        ``feature_cols``; any other column of ``df`` is dropped. ``df`` itself
        is not modified.
    """
    if feature_cols is None:
        # Backward-compatible default: fall back to the notebook global.
        feature_cols = features
    feature_cols = list(feature_cols)

    # The statistics come from `df` itself, column by column.
    scaled = df[feature_cols].apply(lambda col: (col - col.mean()) / (col.std() + eps))
    return pd.concat([df[list(label_cols)], scaled], axis=1)
# Replace `df_train` with its normalized version (label columns + z-scored features).
df_train = normalize_data(df_train)
# print(df_train)
# NOTE(review): writes the normalized frame to `a.csv` in the working directory —
# looks like a debugging dump; confirm whether it is still needed.
df_train.to_csv('a.csv')

In [None]:
def calculate_data_existing_length(data):
    """Return how many entries of ``data`` are present (i.e. not NA).

    An entry counts as missing when ``pd.isna`` reports it as such.
    """
    return sum(1 for value in data if not pd.isna(value))
# The elements of `data` are assumed to be sorted by time.
def our_fill(data, mean=0):
    """Fill the missing entries of ``data`` in place and return ``data``.

    Strategy:
      * nothing missing      -> returned unchanged;
      * everything missing   -> every position is set to ``mean``;
      * otherwise            -> leading gaps take the first observed value, and
                                every later gap takes the value just before it.

    Args:
        data: Mutable, integer-indexable sequence of values in time order.
        mean: Fallback used only when no value at all is observed.

    Returns:
        The same ``data`` object, now without missing entries.
    """
    total = len(data)
    observed = calculate_data_existing_length(data)

    # Complete series: nothing to do.
    if observed == total:
        return data

    # Nothing was ever measured: fall back to the supplied mean everywhere.
    if observed == 0:
        for idx in range(total):
            data[idx] = mean
        return data

    # From here on at least one value is observed.
    if pd.isna(data[0]):
        # Back-fill the leading gap so that data[0] is guaranteed to be present.
        first_seen = next(idx for idx in range(total) if not pd.isna(data[idx]))
        for idx in range(first_seen):
            data[idx] = data[first_seen]

    # Forward-fill: each remaining gap copies its (already filled) predecessor.
    for idx in range(1, total):
        if pd.isna(data[idx]):
            data[idx] = data[idx - 1]
    return data

In [None]:
# fill missing data using our strategy and convert to time series records
features_to_fill = features
grouped = df_train.groupby('PATIENT_ID')

all_x = []  # one entry per patient: list of visits, each visit a list of feature values
all_y = []  # one entry per patient: `config.predict_target` value of the latest visit

for name, group in grouped:
    # Visits must be in chronological order because `our_fill` propagates values
    # from earlier positions to later ones.
    sorted_group = group.sort_values(by=['RE_DATE'], ascending=True)

    for f in features_to_fill:
        # Fill a private copy and assign the result back explicitly. Passing
        # `sorted_group[f].values` and discarding the return value only works
        # while that array happens to be a writable view of the frame; with a
        # read-only or copied array the fill either raises or is silently lost.
        sorted_group[f] = our_fill(sorted_group[f].to_numpy(copy=True))

    patient = []    # feature vectors of this patient, one per visit
    patient_y = []  # target value recorded at each visit
    for _, visit in sorted_group.iterrows():
        patient_y.append(visit[config.predict_target])
        patient.append([visit[f] for f in features])
    all_x.append(patient)
    # Only the label of the patient's last visit is kept.
    all_y.append(patient_y[-1])

# all_x: three-level nested list (patient -> visit -> feature value)
# all_y: one label per patient (the patient's outcome when `predict_target` is `outcome`)
# print(all_y)

In [None]:
# Save the dataset in pickle format: features and labels go to separate files
# whose names embed the configured prediction target.
# NOTE(review): presumably `./processed_data/` must already exist — confirm.
train_x_path = f'./processed_data/train_x_{config.predict_target}_prediction.pkl'
train_y_path = f'./processed_data/train_y_{config.predict_target}_prediction.pkl'

pd.to_pickle(all_x, train_x_path)
pd.to_pickle(all_y, train_y_path)