In [None]:
# Import necessary packages
import pandas as pd
from omegaconf import OmegaConf

In [None]:
# Global configs
# The settings are written as an inline YAML document and parsed by OmegaConf, so the
# cells below read them as attributes (`config.train_data_path`, `config.predict_target`, ...).
#   train_data_path / test_data_path : spreadsheet locations, relative to the working directory
#   seed                             : only printed in this cell
#   predict_target                   : label built by the time-series cell, which handles
#                                      the values 'outcome' and 'LOS'
# NOTE(review): `prerpocess` in train_data_path looks like a typo of `preprocess`
# (compare test_data_path) - confirm against the file name on disk before changing it.
yaml_cfg = """
train_data_path: ./raw_data/time_series_375_prerpocess_en.xlsx
test_data_path: ./raw_data/time_series_test_110_preprocess_en.xlsx
seed: 42
predict_target: LOS # LOS
"""
config = OmegaConf.create(yaml_cfg)
print(config.seed)


In [None]:
# Read raw data
# Loads the training spreadsheet named by `config.train_data_path`.
# The following cells read the columns `PATIENT_ID`, `RE_DATE`, `Admission time`,
# `Discharge time` and `outcome`, and use the `.dt` accessor on the three time columns,
# so those must be parsed as datetimes.
df_train: pd.DataFrame = pd.read_excel(config.train_data_path)

Steps:

- forward-fill the missing `PATIENT_ID` values
- add 2 new columns: total days in hospital (`TOT_DAY`), remaining days in hospital (`LOS`)
- keep only the date part (year-month-day) of the `RE_DATE` column
- merge lab tests that share the same (`PATIENT_ID`, `RE_DATE`) pair by averaging them
- calculate and save features' statistics information (demographic and lab test data are calculated separately)
- normalize data
- feature selection
- fill missing data (our filling strategy will be described below)
- combine the data above into time-series data (one record per patient)
- export to Python pickle files

In [None]:
# Forward-fill `PATIENT_ID`: a missing id takes the closest id above it.
# The filled column is assigned back rather than calling `fillna(..., inplace=True)` on
# `df_train['PATIENT_ID']`: that form mutates an intermediate Series, which pandas does
# not guarantee to write through to `df_train` (it does not under Copy-on-Write), and the
# `method=` argument of `fillna` is deprecated in favour of `ffill()`.
df_train['PATIENT_ID'] = df_train['PATIENT_ID'].ffill()

# add 2 new columns: total days in hospital (`TOT_DAY`), remaining days in hospital (`LOS`)
# Both are whole-day differences (`.dt.days`); `LOS` is counted from the record date.
df_train['LOS'] = (df_train['Discharge time'] - df_train['RE_DATE']).dt.days
df_train['TOT_DAY'] = (df_train['Discharge time'] - df_train['Admission time']).dt.days

# keep only y-m-d for the `RE_DATE` column; this turns it into strings, so it has to
# come after `LOS`, which needs the datetime values
df_train['RE_DATE'] = df_train['RE_DATE'].dt.strftime('%Y-%m-%d')



In [None]:
# merge lab tests of the same (patient_id, date)
# Rows that share a (PATIENT_ID, RE_DATE) pair collapse into one row holding the
# column-wise mean. `dropna=True` discards rows whose key is missing and
# `as_index=False` keeps both keys as ordinary columns.
df_train = df_train.groupby(['PATIENT_ID', 'RE_DATE'], dropna=True, as_index = False).mean()

# print(df_train)


In [None]:
# Lab-test columns, in the order in which they are later placed in each feature vector.
labtest_features = [
    'White blood cell count',
    'Red blood cell count',
    'Serum potassium',
    'calcium',
    'hemoglobin',
    'Serum chloride',
    'serum sodium',
    'Platelet count',
    'neutrophils(%)',
    'neutrophils count',
    'mean corpuscular hemoglobin ',  # the trailing space is part of the name
    'Urea',
    'eGFR',
    'aspartate aminotransferase',
    'Lactate dehydrogenase',
    'total protein',
    'Alkaline phosphatase',
    'Total bilirubin',
    'γ-glutamyl transpeptidase',
    'glucose',
    'Hypersensitive c-reactive protein',
    'Prothrombin time',
    'Prothrombin activity',
    'Hypersensitive cardiac troponinI',
    'Amino-terminal brain natriuretic peptide precursor(NT-proBNP)',
]

# Demographic columns.
demographic_features = ['age', 'gender']

# Columns holding the two prediction targets (mortality outcome, length of stay).
target_features = ['outcome', 'LOS']

# Newline-delimited text forms of the two feature lists: one name per line, wrapped in
# a leading and a trailing newline.
labtest_features_str = '\n' + '\n'.join(labtest_features) + '\n'
demographic_features_str = '\n' + '\n'.join(demographic_features) + '\n'

# print(demographic_features)

# print(df_train[features])

# we have 2 types of prediction tasks: 1) predict mortality outcome, 2) length of stay

# df_train.to_csv('a.csv')

In [None]:
# save features' statistics information

def calculate_statistic_info(df, features):
    """Summarise each requested column of ``df``.

    Args:
        df: DataFrame that contains every column named in ``features``.
        features: iterable of column names to summarise.

    Returns:
        dict mapping each column name to a dict with the keys ``count`` (number of
        non-null values), ``missing`` (share of null rows as a percentage string
        rounded to 3 decimals, e.g. ``"25.0%"``), ``mean``, ``max``, ``min``,
        ``median`` and ``std``; the numeric entries are plain Python numbers.
    """
    total_rows = len(df)

    def _summarise(column):
        # `count()` ignores nulls, so the remainder of the rows is the missing share.
        present = column.count()
        missing_pct = round(float(100 - present * 100 / total_rows), 3)
        return {
            'count': int(present),
            'missing': f"{missing_pct}%",
            'mean': float(column.mean()),
            'max': float(column.max()),
            'min': float(column.min()),
            'median': float(column.median()),
            'std': float(column.std()),
        }

    return {feature: _summarise(df[feature]) for feature in features}

# Visit-level statistics: every (patient, date) row of df_train counts once.
labtest_statistic_info = calculate_statistic_info(df_train, labtest_features)

# Patient-level view: one row per patient holding the mean of that patient's rows.
# numeric_only=True is needed because df_train still carries the string column
# `RE_DATE`; averaging it raises TypeError on pandas >= 2.0, whereas older versions
# only dropped it with a warning. Every column summarised below is numeric, so the
# statistics are unchanged.
groupby_patientid_df = df_train.groupby(['PATIENT_ID'], dropna=True, as_index = False).mean(numeric_only=True)

labtest_patientwise_statistic_info = calculate_statistic_info(groupby_patientid_df, labtest_features)
demographic_statistic_info = calculate_statistic_info(groupby_patientid_df, demographic_features)

# Merged lookup: visit-level statistics for the lab tests plus patient-level
# statistics for the demographic columns.
statistic_info = labtest_statistic_info | demographic_statistic_info


In [None]:
# observe features
# Turn the patient-wise lab-test statistics into one table row per feature and save it.
to_export_dict = {'name': [], 'missing_rate': [], 'count': [], 'mean': [], 'max': [], 'min': [], 'median': [], 'std': []}
for key, detail in labtest_patientwise_statistic_info.items():
    print(key)
    to_export_dict['name'].append(key)
    # the statistics dict calls this entry 'missing'; the exported column is 'missing_rate'
    to_export_dict['missing_rate'].append(detail['missing'])
    for field in ('count', 'mean', 'max', 'min', 'median', 'std'):
        to_export_dict[field].append(detail[field])

to_export_df = pd.DataFrame.from_dict(to_export_dict)
to_export_df.to_csv('statistic_info.csv')


# labtest_features = selected_labtest_features

In [None]:
# normalize data
def normalize_data(df, features, statistic_info):
    """Standardise the ``features`` columns of ``df``.

    Args:
        df: DataFrame holding ``PATIENT_ID``, ``RE_DATE``, ``outcome``, ``LOS`` and
            every column named in ``features``.
        features: list of column names to standardise.
        statistic_info: mapping ``column name -> {'mean': ..., 'std': ...}``.

    Returns:
        A new DataFrame made of the four id/target columns followed by the
        standardised feature columns; every other column of ``df`` is left out.
    """
    def _standardize(column):
        stats = statistic_info[column.name]
        # the tiny constant keeps the division defined when the std is 0
        return (column - stats['mean']) / (stats['std'] + 1e-12)

    id_and_targets = df[['PATIENT_ID', 'RE_DATE', 'outcome', 'LOS']]
    standardized = df[features].apply(_standardize)
    return pd.concat([id_and_targets, standardized], axis=1)
# Standardise the demographic and lab-test columns using `statistic_info`.
# NOTE: df_train is rebound from itself, so re-running this cell standardises data that
# has already been standardised.
df_train = normalize_data(df_train, demographic_features + labtest_features, statistic_info)
# print(df_train)
# df_train.to_csv('a.csv')

In [None]:
def calculate_data_existing_length(data):
    """Return how many entries of ``data`` are not missing according to ``pd.isna``."""
    return sum(1 for value in data if not pd.isna(value))


# The elements of `data` are assumed to be sorted by time.
def our_fill(data, mean=0):
    """Fill the missing entries of ``data`` in place and return ``data``.

    Strategy:
      * nothing missing (including empty input): returned untouched;
      * everything missing: every entry becomes ``mean``;
      * otherwise: leading gaps take the first observed value, and every later gap
        takes the value just before it.

    Args:
        data: mutable, integer-indexable sequence (list or NumPy array).
        mean: replacement used only when no entry is observed.
    """
    total = len(data)
    observed = calculate_data_existing_length(data)

    if observed == total:
        return data

    if observed == 0:
        for idx in range(total):
            data[idx] = mean
        return data

    # From here on at least one entry is observed.
    if pd.isna(data[0]):
        # back-fill the leading gap so that data[0] is defined for the forward pass
        first_seen = next(idx for idx in range(total) if not pd.isna(data[idx]))
        for idx in range(first_seen):
            data[idx] = data[first_seen]

    # forward pass: carry the previous value into each remaining gap
    for idx in range(1, total):
        if pd.isna(data[idx]):
            data[idx] = data[idx - 1]
    return data
# print(df_train)

In [None]:
# fill missing data using our strategy and convert to time series records
grouped = df_train.groupby('PATIENT_ID')

all_x_demographic = []
all_x_labtest = []
all_y = []

for name, group in grouped:
    # visits in time order; RE_DATE holds 'YYYY-MM-DD' strings, which sort chronologically
    sorted_group = group.sort_values(by=['RE_DATE'], ascending=True)

    # Fill each lab-test column along time and store the result explicitly.
    # Calling our_fill on `.values` and discarding the return value only works when
    # `.values` happens to be a writable view of the frame: the fill is silently lost
    # if it is a copy, and it raises if the array is read-only (Copy-on-Write).
    for f in labtest_features:
        sorted_group[f] = our_fill(sorted_group[f].to_numpy(copy=True))

    patient_demographic = []
    patient_labtest = []
    patient_y = []
    for _, v in sorted_group.iterrows():
        if config.predict_target == 'outcome':
            patient_y.append(v[config.predict_target])
        elif config.predict_target == 'LOS':
            if v['outcome'] == 1:
                # NOTE(review): 70 is a hard-coded constant applied when outcome == 1;
                # presumably an upper bound on the stay length - confirm.
                patient_y.append(70-v['LOS'])
            else:
                patient_y.append(v['LOS'])
        patient_demographic.append([v[f] for f in demographic_features])
        patient_labtest.append([v[f] for f in labtest_features])

    # demographic values are taken from the patient's last visit
    all_x_demographic.append(patient_demographic[-1])
    all_x_labtest.append(patient_labtest)
    if config.predict_target == 'outcome':
        # one label per patient: the value at the last visit
        all_y.append(patient_y[-1])
    elif config.predict_target == 'LOS':
        # one label per visit
        all_y.append(patient_y)


# all_x_demographic: 2-D list, one row of demographic values per patient
# all_x_labtest: 3-D list, patient -> visit -> lab-test values
# all_y: one value per patient for 'outcome', one list of per-visit values per patient for 'LOS'

In [None]:
# print(len(all_x_demographic))
# print(len(all_x_labtest))
# print(len(all_y))

# print('---')
# print(len(all_x_demographic[0]))
# print(len(all_x_labtest[0]))
# # print((all_y[0]))

# print('---')
# print((all_x_demographic[0]))
# print((all_x_labtest[0][0]))
# print((all_y[0]))

In [None]:
# save pickle format dataset
pd.to_pickle(all_x_demographic,f'./processed_data/train_x_demographic.pkl' )
pd.to_pickle(all_x_labtest,f'./processed_data/train_x_labtest.pkl' )
pd.to_pickle(all_y,f'./processed_data/train_y_{config.predict_target}.pkl' )