## Import packages

In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from utils.tools import forward_fill_pipeline, normalize_dataframe, normalize_df_with_statistics

# Seed passed as random_state to the dataset splits below
SEED = 42

# Root folder of the CDSL dataset; make sure the output sub-folders exist
data_dir = "./cdsl/"
for _subfolder in ('processed', 'statistics'):
    Path(os.path.join(data_dir, _subfolder)).mkdir(parents=True, exist_ok=True)

## Read data from files

### Record feature names

In [None]:
# Column groups of the formatted CDSL table.
# NOTE(review): the lab-test names below are reproduced verbatim, including the
# mis-encoded characters (e.g. "¢", "¡", "à") and the trailing padding on two
# entries. They are presumably matched against the CSV header as-is, so do not
# "clean them up" here without also changing the data file.
basic_records = ['PatientID', 'RecordTime', 'AdmissionTime', 'DischargeTime']
target_features = ['Outcome', 'LOS']
demographic_features = ['Sex', 'Age']
labtest_features = [
    'MAX_BLOOD_PRESSURE',
    'MIN_BLOOD_PRESSURE',
    'TEMPERATURE',
    'HEART_RATE',
    'OXYGEN_SATURATION',
    'ADW -- Coeficiente de anisocitosis',
    'ADW -- SISTEMATICO DE SANGRE',
    'ALB -- ALBUMINA',
    'AMI -- AMILASA',
    'AP -- ACTIVIDAD DE PROTROMBINA',
    'APTT -- TIEMPO DE CEFALINA (APTT)',
    'AU -- ACIDO URICO',
    'BAS -- Bas¢filos',
    'BAS -- SISTEMATICO DE SANGRE',
    'BAS% -- Bas¢filos %',
    'BAS% -- SISTEMATICO DE SANGRE',
    'BD -- BILIRRUBINA DIRECTA',
    'BE(b) -- BE(b)',
    'BE(b)V -- BE (b)',
    'BEecf -- BEecf',
    'BEecfV -- BEecf',
    'BT -- BILIRRUBINA TOTAL',
    'BT -- BILIRRUBINA TOTAL                                                               ',
    'CA -- CALCIO                                                                          ',
    'CA++ -- Ca++ Gasometria',
    'CHCM -- Conc. Hemoglobina Corpuscular Media',
    'CHCM -- SISTEMATICO DE SANGRE',
    'CK -- CK (CREATINQUINASA)',
    'CL -- CLORO',
    'CREA -- CREATININA',
    'DD -- DIMERO D',
    'EOS -- Eosin¢filos',
    'EOS -- SISTEMATICO DE SANGRE',
    'EOS% -- Eosin¢filos %',
    'EOS% -- SISTEMATICO DE SANGRE',
    'FA -- FOSFATASA ALCALINA',
    'FER -- FERRITINA',
    'FIB -- FIBRINàGENO',
    'FOS -- FOSFORO',
    'G-CORONAV (RT-PCR) -- Tipo de muestra: Exudado Far¡ngeo/Nasofar¡ngeo',
    'GGT -- GGT (GAMMA GLUTAMIL TRANSPEPTIDASA)',
    'GLU -- GLUCOSA',
    'GOT -- GOT (AST)',
    'GPT -- GPT (ALT)',
    'HCM -- Hemoglobina Corpuscular Media',
    'HCM -- SISTEMATICO DE SANGRE',
    'HCO3 -- HCO3-',
    'HCO3V -- HCO3-',
    'HCTO -- Hematocrito',
    'HCTO -- SISTEMATICO DE SANGRE',
    'HEM -- Hemat¡es',
    'HEM -- SISTEMATICO DE SANGRE',
    'HGB -- Hemoglobina',
    'HGB -- SISTEMATICO DE SANGRE',
    'INR -- INR',
    'K -- POTASIO',
    'LAC -- LACTATO',
    'LDH -- LDH',
    'LEUC -- Leucocitos',
    'LEUC -- SISTEMATICO DE SANGRE',
    'LIN -- Linfocitos',
    'LIN -- SISTEMATICO DE SANGRE',
    'LIN% -- Linfocitos %',
    'LIN% -- SISTEMATICO DE SANGRE',
    'MG -- MAGNESIO',
    'MONO -- Monocitos',
    'MONO -- SISTEMATICO DE SANGRE',
    'MONO% -- Monocitos %',
    'MONO% -- SISTEMATICO DE SANGRE',
    'NA -- SODIO',
    'NEU -- Neutr¢filos',
    'NEU -- SISTEMATICO DE SANGRE',
    # This name was previously split across two physical lines inside the
    # quotes, which is a syntax error; it is rejoined into a single literal.
    'NEU% -- Neutr¢filos %',
    'NEU% -- SISTEMATICO DE SANGRE',
    'PCO2 -- pCO2',
    'PCO2V -- pCO2',
    'PCR -- PROTEINA C REACTIVA',
    'PH -- pH',
    'PHV -- pH',
    'PLAQ -- Recuento de plaquetas',
    'PLAQ -- SISTEMATICO DE SANGRE',
    'PO2 -- pO2',
    'PO2V -- pO2',
    'PROCAL -- PROCALCITONINA',
    'PT -- PROTEINAS TOTALES',
    'SO2C -- sO2c (Saturaci¢n de ox¡geno)',
    'SO2CV -- sO2c (Saturaci¢n de ox¡geno)',
    'TCO2 -- tCO2(B)c',
    'TCO2V -- tCO2 (B)',
    'TP -- TIEMPO DE PROTROMBINA',
    'TROPO -- TROPONINA',
    'U -- UREA',
    'VCM -- SISTEMATICO DE SANGRE',
    'VCM -- Volumen Corpuscular Medio',
    'VPM -- SISTEMATICO DE SANGRE',
    'VPM -- Volumen plaquetar medio',
    'VSG -- VSG',
]
# Every lab-test column is subject to imputation (same list object, not a copy)
require_impute_features = labtest_features
# Columns that get z-score normalized: age, all lab tests, and length of stay
normalize_features = ['Age'] + labtest_features + ['LOS']

In [None]:
# Load the formatted CDSL table from the processed folder and display it
formatted_csv_path = os.path.join(data_dir, "processed", "cdsl_dataset_formatted.csv")
df = pd.read_csv(formatted_csv_path)
df

## Stratified split dataset into `Training`, `Validation` and `Test` sets

- Each fold also goes through the imputation, normalization, and outlier-filtering steps
- The train, validation and test sets of each fold are saved in the `./cdsl/processed/fold_{k}` folder
- Use a 10-fold strategy with an 8:1:1 train/validation/test ratio, split at the patient level

In [None]:
# Patient-level stratified 10-fold split with an 8:1:1 train/val/test ratio.
# For every fold this cell writes, under <data_dir>/processed/fold_<k>/:
#   *_raw.csv           rows of each split exactly as read from `df`
#   *_after_zscore.csv  rows after normalize_dataframe and the all-NaN row drop
#   *_x.pkl, *_y.pkl, *_pid.pkl, los_info.pkl  outputs of forward_fill_pipeline
num_folds = 10

# Group the dataframe by patient ID
grouped = df.groupby('PatientID')

# One outcome label per patient (first recorded value), indexed by PatientID.
# Computed once, outside the loop, so every stratified split looks labels up
# by patient ID instead of relying on two separately built sequences happening
# to share the same sort order.
patient_outcome = grouped['Outcome'].first()

# Patient IDs come from the label index, so IDs and labels align by construction
patients = patient_outcome.index.to_numpy()
kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=SEED)

for fold, (train_val_index, test_index) in enumerate(kf.split(patients, patient_outcome)):
    # Get the train/val/test patient IDs for the current fold
    train_val_patients, test_patients = patients[train_val_index], patients[test_index]

    # Split the remaining 9/10 of the patients into train/val sets.
    # test_size = 1/(num_folds-1) makes the validation set 1/10 of all patients.
    train_patients, val_patients = train_test_split(
        train_val_patients,
        test_size=1/(num_folds-1),
        random_state=SEED,
        stratify=patient_outcome.loc[train_val_patients],
    )

    # Create train, val, and test dataframes for the current fold
    train_df = df[df['PatientID'].isin(train_patients)]
    val_df = df[df['PatientID'].isin(val_patients)]
    test_df = df[df['PatientID'].isin(test_patients)]

    # Sanity check: the three splits together must cover every row exactly once
    assert len(train_df) + len(val_df) + len(test_df) == len(df)

    # Save the train, val, and test dataframes for the current fold to csv files
    fold_dir = os.path.join(data_dir, 'processed', f'fold_{fold}')
    Path(fold_dir).mkdir(parents=True, exist_ok=True)
    for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
        split_df.to_csv(os.path.join(fold_dir, f"{split_name}_raw.csv"), index=False)

    # Normalize data (age, lab test features, and LOS).
    # NOTE(review): normalize_dataframe lives in utils.tools and is not visible
    # here. Per the original notes it is expected to compute mean/std on the
    # train set using only values in the 5%-95% quantile range - confirm there.
    # train_mean / train_std are returned but not used in this cell.
    train_df, val_df, test_df, default_fill, los_info, train_mean, train_std = normalize_dataframe(train_df, val_df, test_df, normalize_features)

    # Drop rows if all features are recorded NaN
    # NOTE(review): the subset includes 'Age' and 'LOS', so a row is only
    # dropped when those are NaN as well - confirm this is the intent.
    train_df = train_df.dropna(axis=0, how='all', subset=normalize_features)
    val_df = val_df.dropna(axis=0, how='all', subset=normalize_features)
    test_df = test_df.dropna(axis=0, how='all', subset=normalize_features)

    # Save the normalized train, val, and test dataframes for the current fold to csv files
    for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
        split_df.to_csv(os.path.join(fold_dir, f"{split_name}_after_zscore.csv"), index=False)

    # Forward Imputation after grouped by PatientID
    # Notice: if a patient has never done certain lab test, the imputed value
    # is expected to be the median value calculated from the train set
    # (supplied via default_fill - TODO confirm in utils.tools).
    train_x, train_y, train_pid = forward_fill_pipeline(train_df, default_fill, demographic_features, labtest_features, target_features, require_impute_features)
    val_x, val_y, val_pid = forward_fill_pipeline(val_df, default_fill, demographic_features, labtest_features, target_features, require_impute_features)
    test_x, test_y, test_pid = forward_fill_pipeline(test_df, default_fill, demographic_features, labtest_features, target_features, require_impute_features)

    # Save the imputed dataset to pickle files (file stem -> object)
    pickled_outputs = {
        "train_x": train_x,
        "train_y": train_y,
        "train_pid": train_pid,
        "val_x": val_x,
        "val_y": val_y,
        "val_pid": val_pid,
        "test_x": test_x,
        "test_y": test_y,
        "test_pid": test_pid,
        "los_info": los_info,
    }
    for file_stem, payload in pickled_outputs.items():
        pd.to_pickle(payload, os.path.join(fold_dir, f"{file_stem}.pkl"))