In [None]:
# Import necessary packages
import pandas as pd
import numpy as np
from omegaconf import OmegaConf

In [None]:
# Global configs
# The settings are written as an inline YAML document and parsed by
# OmegaConf.create, so individual values are read with attribute access
# (for example config.seed below).
yaml_cfg = """
train_data_path: ./raw_data/hm_demo.csv
seed: 42
predict_target: outcome # outcome/LOS
"""
config = OmegaConf.create(yaml_cfg)
# Echo the seed as a quick check that the YAML parsed as expected.
print(config.seed)


In [None]:
# Read raw data
# The demographics path is taken from the global config instead of being
# repeated here, so changing `train_data_path` in the config cell actually
# changes which file is loaded. The lab-test path has no config entry and
# stays literal.
# NOTE(review): 'unicode_escape' is an unusual codec for CSV input (it also
# interprets backslash sequences inside the text). It is kept unchanged here;
# confirm the real encoding of the raw files.
df_demo: pd.DataFrame = pd.read_csv(config.train_data_path, encoding='unicode_escape', sep=",")
df_labtest: pd.DataFrame = pd.read_csv("./raw_data/hm_labtest.csv", encoding='unicode_escape', sep=",")


In [None]:
# Encode the categorical demographics columns as integers and blank out
# zero-valued measurements.
#
# The replacements are assigned back to the column rather than calling
# `df_demo[col].replace(..., inplace=True)`. That form mutates the Series
# returned by `df_demo[col]` (chained assignment through an in-place method):
# pandas 2.2 emits a FutureWarning for it, and with Copy-on-Write enabled the
# frame is never updated at all.
df_demo['SEX'] = df_demo['SEX'].replace({'MALE': 1, 'FEMALE': 0})

# Rename the raw column names to the names used in the rest of the notebook.
df_demo = df_demo.rename(columns={'IDINGRESO': 'PATIENT_ID', 'EDAD': 'AGE', 'DIAGNOSTICO_ING': 'outcome'})

# Binary target: positive diagnosis -> 1, pending diagnosis -> 0.
# Any other diagnosis string is left unchanged by `replace`.
df_demo['outcome'] = df_demo['outcome'].replace({'COVID19 - POSITIVO': 1, 'COVID19 - PENDIENTE': 0})

# Whitespace-separated list of the numeric columns in which a literal zero
# (numeric or the string '0') is converted to NaN.
cols_str = """AGE	TA_MAX_PRIMERA_URG	TA_MIN_PRIMERA_URG	TEMP_PRIMERA_URG	FC_PRIMERA_URG	SAT_02_PRIMERA_URG	GLU_PRIMERA_URG	DIURESIS_PRIMERA_URG	HORA_CONSTANTES_ULTIMA_URG	TA_MAX_ULTIMA_URG	TA_MIN_ULTIMA_URG	TEMP_ULTIMA_URG	FC_ULTIMA_URG	SAT_02_ULTIMA_URG	GLU_ULTIMA_URG"""
cols = cols_str.strip().split()
df_demo[cols] = df_demo[cols].replace([0, 0.0, '0'], np.nan)

In [None]:
# Persist the cleaned demographics table (without the index) to the working
# directory, then show the frame as the cell output.
df_demo.to_csv('demo.csv', index=False)
df_demo

In [None]:
# Distinct lab-test item names found in the long-format lab table.
labtest_features = df_labtest['DETERMINACION/ITEM_LAB'].unique()
# Only the last expression of a cell is displayed, so the preview of the first
# ten names is printed explicitly; as a bare expression it was evaluated and
# silently discarded.
print(labtest_features[0:10])
# Number of distinct lab-test items (shown as the cell output).
len(labtest_features)

In [None]:
# Use the same patient key name as the demographics table.
df_labtest.rename(columns={'PATIENT ID': 'PATIENT_ID'}, inplace=True)

# Reshape the lab table from long to wide: every distinct value of
# 'DETERMINACION/ITEM_LAB' becomes its own column holding the matching
# 'RESULTADO/VAL_RESULT', with one row per (patient, lab date, lab number).
lab_key_cols = [
    'PATIENT_ID',
    'FECHA_PETICION/LAB_DATE',
    'DETERMINACION/ITEM_LAB',
    'PETICION_LABORATORIO/LAB_NUMBER',
]
lab_value_col = 'RESULTADO/VAL_RESULT'
df_labtest = (
    df_labtest[lab_key_cols + [lab_value_col]]
    .set_index(lab_key_cols, drop=True)
    .unstack('DETERMINACION/ITEM_LAB')[lab_value_col]
    .reset_index()
)

In [None]:
# Persist the wide-format lab table (without the index) to the working
# directory, then show the frame as the cell output.
df_labtest.to_csv('labtest.csv', index=False)
df_labtest

In [None]:
# Attach the demographics of each patient to every lab row. `join` keeps all
# rows of the left (lab) table and matches on the PATIENT_ID index.
df_train = df_labtest.set_index('PATIENT_ID').join(df_demo.set_index('PATIENT_ID')).reset_index()

# Date columns and the layout each one is parsed with, once '/' separators
# have been rewritten to '-'.
date_formats = {
    'FECHA_PETICION/LAB_DATE': '%d-%m-%Y',
    'F_INGRESO_ING': '%d-%m-%Y %H:%M:%S',
    'F_ALTA_ING': '%d-%m-%Y %H:%M:%S',
}
for date_col, date_fmt in date_formats.items():
    dashed = df_train[date_col].str.replace('/', '-')
    # Values that do not match the layout become NaT instead of raising.
    df_train[date_col] = pd.to_datetime(dashed, format=date_fmt, errors='coerce')

# Discard rows where any of the three dates is missing or failed to parse.
df_train.dropna(subset=list(date_formats), how='any', inplace=True)
df_train.to_csv('train.csv', index=False)
df_train

In [None]:
# Show the parsed 'F_ALTA_ING' column as the cell output.
df_train['F_ALTA_ING']
# The lines below are disabled: two `describe()` summaries of date columns,
# and two day-count columns (LOS, TOT_DAY) computed as differences between
# 'F_ALTA_ING' and the lab date / 'F_INGRESO_ING' respectively.
# df_train['FECHA_PETICION/LAB_DATE'].describe()
# df_train['F_INGRESO_ING'].describe()
# df_train['LOS'] = (df_train['F_ALTA_ING'] - df_train['FECHA_PETICION/LAB_DATE']).dt.days
# df_train['TOT_DAY'] = (df_train['F_ALTA_ING'] - df_train['F_INGRESO_ING']).dt.days

In [None]:
# Show the merged training frame as the cell output.
df_train