In [None]:
# Import necessary packages
import math
import os

import numpy as np
import pandas as pd
from omegaconf import OmegaConf

In [None]:
# Global configs
# The settings are written as an inline YAML document and parsed by OmegaConf,
# so later cells read them with attribute access (e.g. `config.predict_target`).
yaml_cfg = """
train_data_path: ./raw_data/hm_demo.csv
seed: 42
predict_target: outcome # outcome/LOS
"""
config = OmegaConf.create(yaml_cfg)
# Echo the parsed seed as a quick check that the YAML was read correctly.
print(config.seed)


In [None]:
# Read raw data
# The demographics path is taken from `config.train_data_path` so the file
# location is defined once, in the config cell, instead of being repeated here.
# 'F_INGRESO_ING' is forced to `str` so pandas keeps the raw text; the column is
# parsed with an explicit date format in a later cell.
df_demo: pd.DataFrame = pd.read_csv(
    config.train_data_path,
    encoding='unicode_escape',
    sep=",",
    converters={'F_INGRESO_ING': str},
)
df_labtest: pd.DataFrame = pd.read_csv("./raw_data/hm_labtest.csv", encoding='unicode_escape', sep=",")


In [None]:
# Use English column names for the identifiers/targets the rest of the notebook relies on.
df_demo = df_demo.rename(columns={'IDINGRESO': 'PATIENT_ID', 'EDAD': 'AGE', 'DIAGNOSTICO_ING': 'outcome'})

# Encode the categorical text values as integers.
# The result is assigned back to the column instead of calling
# `df_demo[col].replace(..., inplace=True)`: an in-place call on a selected
# column is not guaranteed to update `df_demo` itself (it does nothing under
# pandas Copy-on-Write).
df_demo['SEX'] = df_demo['SEX'].replace({'MALE': 1, 'FEMALE': 0})
df_demo['outcome'] = df_demo['outcome'].replace({'COVID19 - POSITIVO': 1, 'COVID19 - PENDIENTE': 0})

# Columns in which a zero (numeric or the text '0') is replaced by NaN, i.e. treated as missing.
cols_str = """AGE	TA_MAX_PRIMERA_URG	TA_MIN_PRIMERA_URG	TEMP_PRIMERA_URG	FC_PRIMERA_URG	SAT_02_PRIMERA_URG	GLU_PRIMERA_URG	DIURESIS_PRIMERA_URG	HORA_CONSTANTES_ULTIMA_URG	TA_MAX_ULTIMA_URG	TA_MIN_ULTIMA_URG	TEMP_ULTIMA_URG	FC_ULTIMA_URG	SAT_02_ULTIMA_URG	GLU_ULTIMA_URG"""
cols = cols_str.split()
df_demo[cols] = df_demo[cols].replace([0, 0.0, '0'], np.nan)

In [None]:
# Earlier experiments with rewriting the admission-date text, kept for reference only:
# df_demo.to_csv('demo.csv', index=False)
# df_demo = df_demo.astype({"F_INGRESO_ING": str}, errors='raise')
# df_demo['F_INGRESO_ING'] = df_demo['F_INGRESO_ING'].str.replace(r'(\d+)/(\d+)/(\d+)(.*)', r'\2/\1/\3\4', regex=True)
# df_demo['F_INGRESO_ING'] = df_demo['F_INGRESO_ING'].str.replace('/','-')
# df_demo['F_INGRESO_ING'] = df_demo['F_INGRESO_ING'].str.replace(' AM','')
# df_demo['F_INGRESO_ING'].to_csv('test.csv', index=False)

# Peek at the two date columns of the demographics table.
# Only the last expression of a cell is rendered, so just 'F_ALTA_ING' is shown.
df_demo['F_INGRESO_ING']
df_demo['F_ALTA_ING']

In [None]:
# Distinct lab-test item names, in order of first appearance in the long-format table.
# They are collected here, while 'DETERMINACION/ITEM_LAB' is still a column.
lab_item_names = df_labtest['DETERMINACION/ITEM_LAB']
labtest_features = pd.unique(lab_item_names).tolist()
labtest_features[:10]
len(labtest_features)

In [None]:
# Align the patient identifier column name with the demographics table.
df_labtest.rename(columns={'PATIENT ID': 'PATIENT_ID'}, inplace=True)

# Reshape from long format (one row per test result) to wide format:
# one row per (patient, lab date, lab request number), one column per test item.
lab_index_cols = [
    'PATIENT_ID',
    'FECHA_PETICION/LAB_DATE',
    'DETERMINACION/ITEM_LAB',
    'PETICION_LABORATORIO/LAB_NUMBER',
]
df_labtest = (
    df_labtest[lab_index_cols + ['RESULTADO/VAL_RESULT']]
    .set_index(lab_index_cols, drop=True)
    .unstack('DETERMINACION/ITEM_LAB')['RESULTADO/VAL_RESULT']
    .reset_index()
)

In [None]:
# Write the lab-test table to 'labtest.csv' (without the row index) so it can be inspected outside the notebook.
df_labtest.to_csv('labtest.csv', index=False)
# Render the table as the cell output.
df_labtest

In [None]:
# Attach each patient's demographic record to every lab-test row, matching on PATIENT_ID.
demo_by_patient = df_demo.set_index('PATIENT_ID')
df_train = df_labtest.set_index('PATIENT_ID').join(demo_by_patient).reset_index()

# How pd.to_datetime treats values that do not match the format: 'raise' / 'ignore' / 'coerce'
datetime_error_setting = 'raise'

# Expected text layout of each date column (other layouts, such as
# '%d/%m/%Y' for the lab date or '%m/%d/%Y %H:%M' for admission/discharge,
# were tried previously and abandoned).
date_formats = {
    'FECHA_PETICION/LAB_DATE': '%d-%m-%Y',
    'F_INGRESO_ING': '%d/%m/%Y',
    'F_ALTA_ING': '%d/%m/%Y',
}
for date_column, date_format in date_formats.items():
    df_train[date_column] = pd.to_datetime(
        df_train[date_column], format=date_format, errors=datetime_error_setting
    )

# Rows lacking any of the three dates cannot be used to compute day counts later on.
df_train.dropna(subset=list(date_formats), how='any', inplace=True)
df_train

In [None]:
# Summary of the lab-request dates. This is not the last expression of the
# cell, so the notebook does not render its result.
# NOTE(review): the `datetime_is_numeric` keyword is only accepted by some pandas versions — confirm against the installed one.
df_train['FECHA_PETICION/LAB_DATE'].describe(datetime_is_numeric=True)
# df_train['F_ALTA_ING'].describe(datetime_is_numeric=True)
# df_train['F_INGRESO_ING'].describe(datetime_is_numeric=True)

# LOS: whole days between the lab request date and 'F_ALTA_ING' (presumably the discharge date — confirm).
df_train['LOS'] = (df_train['F_ALTA_ING'] - df_train['FECHA_PETICION/LAB_DATE']).dt.days
# TOT_DAY: whole days between 'F_INGRESO_ING' and 'F_ALTA_ING' (presumably admission to discharge — confirm).
df_train['TOT_DAY'] = (df_train['F_ALTA_ING'] - df_train['F_INGRESO_ING']).dt.days

In [None]:
# Keep only rows whose day counts fall in [0, MAX_DAYS): negative values mean the
# dates are inconsistent, and values at or above the bound are excluded.
MAX_DAYS = 35
los_in_range = (df_train['LOS'] >= 0) & (df_train['LOS'] < MAX_DAYS)
tot_day_in_range = (df_train['TOT_DAY'] >= 0) & (df_train['TOT_DAY'] < MAX_DAYS)

# `.copy()` makes the filtered frame independent of the unfiltered one, so the
# column assignments done in the following cell act on a real frame rather than
# on a slice (which would trigger SettingWithCopyWarning).
df_train = df_train[los_in_range & tot_day_in_range].copy()
df_train

In [None]:
# Lab results arrive as text; turn them into numbers, with unparseable entries becoming NaN.
df_train[labtest_features] = df_train[labtest_features].apply(pd.to_numeric, errors='coerce')

# Average all rows that share the same patient and lab date into a single record.
visit_keys = ['PATIENT_ID', 'FECHA_PETICION/LAB_DATE']
df_train = df_train.groupby(visit_keys, dropna=True, as_index=False).mean()
df_train.to_csv('train.csv', index=False)
df_train
# df_train[labtest_features]

In [None]:
# `labtest_features` is built in an earlier cell; this cell only declares the
# demographic input columns and the prediction targets.

# Column names separated by arbitrary whitespace (tabs, spaces, newlines).
demographic_features_str = """
AGE	SEX TA_MAX_PRIMERA_URG	TA_MIN_PRIMERA_URG	TEMP_PRIMERA_URG	FC_PRIMERA_URG	SAT_02_PRIMERA_URG
TA_MAX_ULTIMA_URG	TA_MIN_ULTIMA_URG	TEMP_ULTIMA_URG	FC_ULTIMA_URG	SAT_02_ULTIMA_URG
UCI_DAYS
"""

# str.split() with no argument already ignores leading/trailing whitespace.
demographic_features = demographic_features_str.split()
target_features = ['outcome', 'LOS']
demographic_features

In [None]:
# Per-feature summary statistics
def calculate_statistic_info(df, features):
    """Summarise each requested column of ``df``.

    Args:
        df: DataFrame containing every column named in ``features``.
        features: iterable of column names to summarise.

    Returns:
        dict mapping each feature name to a dict with the keys ``count``
        (number of non-missing values, int), ``missing`` (percentage of rows
        without a value), ``mean``, ``max``, ``min``, ``median`` and ``std``
        (all floats).
    """
    n_rows = len(df)
    summary = {}
    for feature in features:
        column = df[feature]
        n_present = column.count()
        summary[feature] = {
            'count': int(n_present),
            # share of rows (in percent) for which this feature has no value
            'missing': float(100 - n_present * 100 / n_rows),
            'mean': float(column.mean()),
            'max': float(column.max()),
            'min': float(column.min()),
            'median': float(column.median()),
            'std': float(column.std()),
        }
    return summary

# Lab-test statistics computed over every visit record in df_train.
labtest_statistic_info = calculate_statistic_info(df_train, labtest_features)

# Collapse df_train to one averaged row per patient, then summarise both
# feature families on that patient-level table.
groupby_patientid_df = df_train.groupby(['PATIENT_ID'], dropna=True, as_index = False).mean()
demographic_statistic_info = calculate_statistic_info(groupby_patientid_df, demographic_features)
labtest_patientwise_statistic_info = calculate_statistic_info(groupby_patientid_df, labtest_features)

# Combined lookup: visit-level lab-test statistics plus patient-level
# demographic statistics (demographic entries win if a name appears in both).
statistic_info = {**labtest_statistic_info, **demographic_statistic_info}

In [None]:
# Export the patient-level statistics of every feature to a CSV for inspection.
to_export_statistic_info = demographic_statistic_info | labtest_patientwise_statistic_info

# Column-oriented accumulator: one list per output column.
to_export_dict = {'name': [], 'missing_rate': [], 'count': [], 'mean': [], 'max': [], 'min': [], 'median': [], 'std': []}

# Output column -> key under which the value is stored in each statistics entry.
column_source = {
    'count': 'count',
    'missing_rate': 'missing',
    'mean': 'mean',
    'max': 'max',
    'min': 'min',
    'median': 'median',
    'std': 'std',
}

for key, detail in to_export_statistic_info.items():
    print(key)
    to_export_dict['name'].append(key)
    for column, stat_key in column_source.items():
        to_export_dict[column].append(detail[stat_key])

to_export_df = pd.DataFrame.from_dict(to_export_dict)
to_export_df.to_csv('statistic_info.csv')


# labtest_features = selected_labtest_features

In [None]:
# Keep only the lab tests for which fewer than 50% of the visit records lack a value.
selected_labtest_features = [
    feature
    for feature, info in labtest_statistic_info.items()
    if info['missing'] < 50
]
len(selected_labtest_features)
# From here on, `labtest_features` refers to the reduced list.
labtest_features = selected_labtest_features
# demographic_statistic_info

In [None]:
# Z-score normalisation
def normalize_data(df, features, statistic_info):
    """Return the identifier/target columns of ``df`` next to z-scored features.

    Args:
        df: DataFrame holding 'PATIENT_ID', 'FECHA_PETICION/LAB_DATE',
            'outcome', 'LOS' and every column named in ``features``.
        features: list of column names to normalise.
        statistic_info: dict mapping each feature name to a dict that provides
            at least 'mean' and 'std'.

    Returns:
        A new DataFrame: the four identifier/target columns followed by the
        normalised feature columns. ``df`` itself is left unchanged.
    """
    def _zscore(column):
        stats = statistic_info[column.name]
        # the tiny constant keeps the division defined when std is 0
        return (column - stats['mean']) / (stats['std'] + 1e-12)

    id_and_targets = df[['PATIENT_ID', 'FECHA_PETICION/LAB_DATE', 'outcome', 'LOS']]
    scaled_features = df[features].apply(_zscore)
    return pd.concat([id_and_targets, scaled_features], axis=1)
# Normalise every demographic and lab-test feature column of df_train using `statistic_info`.
# NOTE(review): df_train is overwritten with its normalised version, so running
# this cell a second time would normalise already-normalised values.
df_train = normalize_data(df_train, demographic_features + labtest_features, statistic_info)

In [None]:
# def is_na(x):
#     if math.isnan(x):
#         return True
#     if pd.isna(x):
#         return True
#     return False

def calculate_data_existing_length(data):
    """Count the entries of ``data`` that are not missing according to ``pd.isna``."""
    return sum(1 for item in data if not pd.isna(item))


# The elements of `data` are assumed to be ordered by time.
def our_fill(data, mean=0):
    """Fill the missing entries of an indexable sequence in place.

    Strategy:
      * nothing missing     -> returned untouched;
      * everything missing  -> every position is set to ``mean``;
      * otherwise           -> leading gaps take the first observed value and
        every later gap takes the value immediately before it.

    Args:
        data: mutable, integer-indexable sequence (e.g. a numpy array or list).
        mean: value used when no entry of ``data`` is observed.

    Returns:
        The same ``data`` object, after modification.
    """
    total = len(data)
    observed = calculate_data_existing_length(data)

    if observed == total:
        return data
    if observed == 0:
        for idx in range(total):
            data[idx] = mean
        return data

    # At least one value is observed here: locate the first one and copy it
    # backwards so that data[0] is guaranteed to hold a value.
    first_observed = next(idx for idx in range(total) if not pd.isna(data[idx]))
    for idx in range(first_observed):
        data[idx] = data[first_observed]

    # Carry the previous value forward into each remaining gap.
    for idx in range(1, total):
        if pd.isna(data[idx]):
            data[idx] = data[idx - 1]
    return data
# print(df_train)

In [None]:
# fill missing data using our strategy and convert to time series records

# Fail early on an unsupported target instead of silently producing an empty `all_y`.
if config.predict_target not in ('outcome', 'LOS'):
    raise ValueError(
        f"Unsupported predict_target: {config.predict_target!r} (expected 'outcome' or 'LOS')"
    )

grouped = df_train.groupby('PATIENT_ID')

all_x_demographic = []
all_x_labtest = []
all_y = []

for _patient_id, group in grouped:
    # our_fill assumes chronological order, so sort the patient's visits first.
    sorted_group = group.sort_values(by=['FECHA_PETICION/LAB_DATE'], ascending=True)

    # Fill each feature on an explicit copy and assign the result back.
    # Filling `sorted_group[f].values` directly only works when that array
    # happens to be a writable view of the frame, which pandas does not guarantee.
    for f in labtest_features+demographic_features:
        sorted_group[f] = our_fill(sorted_group[f].to_numpy(copy=True))

    patient_demographic = []
    patient_labtest = []
    patient_y = []
    for _, v in sorted_group.iterrows():
        if config.predict_target == 'outcome':
            patient_y.append(v[config.predict_target])
        elif config.predict_target == 'LOS':
            # NOTE(review): rows with outcome == 1 store 70 - LOS instead of LOS;
            # the reason for the constant 70 is not visible here — confirm.
            if v['outcome'] == 1:
                patient_y.append(70-v['LOS'])
            else:
                patient_y.append(v['LOS'])
        patient_demographic.append([v[f] for f in demographic_features])
        patient_labtest.append([v[f] for f in labtest_features])

    # Demographics: only the values of the patient's last visit are kept.
    all_x_demographic.append(patient_demographic[-1])
    # Lab tests: the complete visit sequence is kept.
    all_x_labtest.append(patient_labtest)
    if config.predict_target == 'outcome':
        all_y.append(patient_y[-1])
    elif config.predict_target == 'LOS':
        all_y.append(patient_y)


# all_x_demographic: one list of demographic values per patient (patients x features)
# all_x_labtest: per patient, one list of lab-test values per visit (patients x visits x features)
# all_y: 'outcome' -> one value per patient; 'LOS' -> one list per patient with a value per visit

In [None]:
# save pickle format dataset
# pd.to_pickle does not create missing parent folders, so make sure the output
# directory exists before writing.
os.makedirs('./processed_data', exist_ok=True)
pd.to_pickle(all_x_demographic, './processed_data/train_x_demographic.pkl')
pd.to_pickle(all_x_labtest, './processed_data/train_x_labtest.pkl')
# The target file name records which prediction target the labels were built for.
pd.to_pickle(all_y, f'./processed_data/train_y_{config.predict_target}.pkl')

In [None]:
# np.array(all_x_demographic).shape # 13 demo features
# np.array(all_x_labtest).shape
# len(all_x_labtest[0][1]) # 36 labtest features

# Spot-check a few entries of the demographic matrix.
print(all_x_demographic[1])
print(all_x_demographic[0][3])
print(type(all_x_demographic[0][3]))
print(math.isnan(all_x_demographic[1][2]))

# Try item assignment on a copy of the first patient's record, so the
# experiment does not alter the dataset held in memory.
first_patient_copy = list(all_x_demographic[0])
first_patient_copy[3] = 3.0
print(first_patient_copy[3])


In [None]:
# Put the collected targets into a one-column frame and summarise them.
# NOTE(review): with predict_target == 'LOS' each entry of all_y is a list (one
# value per visit), so this summary is presumably only meaningful for 'outcome' — confirm.
df_y = pd.DataFrame({'y':all_y})
df_y.describe()