## Import packages

In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from utils.tools import forward_fill_pipeline, normalize_dataframe, normalize_df_with_statistics, calculate_missing_rate, export_missing_mask_pipeline

# Root folder of the TJH dataset; every output below is written underneath it.
data_dir = "./tjh/"

# Make sure the output sub-folders exist (no error if they already do).
for subdir in ('processed', 'statistics'):
    Path(os.path.join(data_dir, subdir)).mkdir(parents=True, exist_ok=True)

# Single seed reused for the random masking and the dataset splits.
SEED = 42

In [None]:
def drop_features_randomly(df, X, drop_columns, seed=42):
    """Randomly blank out a percentage of the observed values in selected columns.

    For every column in ``drop_columns`` that still has at least one
    non-missing value, ``int(n_observed * X / 100)`` of its non-missing cells
    are chosen at random (without replacement) and set to NaN. The overall
    missing rate of the selected columns is printed before and after.

    Args:
        df: DataFrame to mask. It is modified IN PLACE and also returned.
        X: Percentage (0-100) of each column's observed values to remove.
        drop_columns: Names of the columns to mask.
        seed: Seed for the random selection; the same seed and input give the
            same mask.

    Returns:
        The same ``df`` object, with the additional NaNs applied.

    Raises:
        ValueError: If ``X`` is not within [0, 100].
    """
    if not 0 <= X <= 100:
        raise ValueError(f"X must be a percentage between 0 and 100, got {X!r}")

    # Private generator instead of np.random.seed(): the caller's global NumPy
    # random state is left untouched. The legacy RandomState produces the same
    # draws that np.random.seed(seed) + np.random.choice would.
    rng = np.random.RandomState(seed)

    # Overall missing rate (mean of the per-column rates) before masking
    overall_missing_rate = df[drop_columns].isnull().mean().mean()
    print(f"Overall Missing Rate for Specified Columns: {overall_missing_rate * 100:.2f}%")

    for column in drop_columns:
        # Work with row POSITIONS rather than index labels so that a
        # non-unique index cannot blank more rows than were selected.
        observed_positions = np.flatnonzero(df[column].notna().to_numpy())

        # Nothing to remove from a column that is already entirely missing
        if observed_positions.size == 0:
            continue

        # Number of observed values to remove from this column
        num_to_drop = int(len(observed_positions) * X / 100)

        # Randomly pick which observed cells to blank out
        positions_to_drop = rng.choice(observed_positions, num_to_drop, replace=False)
        df.iloc[positions_to_drop, df.columns.get_loc(column)] = np.nan

    cur_missing = df[drop_columns].isnull().mean().mean()
    print(f"Current Overall Missing Rate: {cur_missing * 100:.2f}%")

    return df


## Read data from files

### Record feature names

In [None]:
# Column groups of the dataset.
basic_records = ['PatientID', 'RecordTime', 'AdmissionTime', 'DischargeTime']
target_features = ['Outcome', 'LOS']
demographic_features = ['Sex', 'Age']

# NOTE: three names below end with a trailing space
# ('Red blood cell distribution width ', 'platelet large cell ratio ',
# 'mean corpuscular hemoglobin '). Presumably they mirror the CSV headers
# exactly, so do not strip them -- verify against the data file.
labtest_features = [
    'Hypersensitive cardiac troponinI',
    'hemoglobin',
    'Serum chloride',
    'Prothrombin time',
    'procalcitonin',
    'eosinophils(%)',
    'Interleukin 2 receptor',
    'Alkaline phosphatase',
    'albumin',
    'basophil(%)',
    'Interleukin 10',
    'Total bilirubin',
    'Platelet count',
    'monocytes(%)',
    'antithrombin',
    'Interleukin 8',
    'indirect bilirubin',
    'Red blood cell distribution width ',
    'neutrophils(%)',
    'total protein',
    'Quantification of Treponema pallidum antibodies',
    'Prothrombin activity',
    'HBsAg',
    'mean corpuscular volume',
    'hematocrit',
    'White blood cell count',
    'Tumor necrosis factorα',
    'mean corpuscular hemoglobin concentration',
    'fibrinogen',
    'Interleukin 1β',
    'Urea',
    'lymphocyte count',
    'PH value',
    'Red blood cell count',
    'Eosinophil count',
    'Corrected calcium',
    'Serum potassium',
    'glucose',
    'neutrophils count',
    'Direct bilirubin',
    'Mean platelet volume',
    'ferritin',
    'RBC distribution width SD',
    'Thrombin time',
    '(%)lymphocyte',
    'HCV antibody quantification',
    'D-D dimer',
    'Total cholesterol',
    'aspartate aminotransferase',
    'Uric acid',
    'HCO3-',
    'calcium',
    'Amino-terminal brain natriuretic peptide precursor(NT-proBNP)',
    'Lactate dehydrogenase',
    'platelet large cell ratio ',
    'Interleukin 6',
    'Fibrin degradation products',
    'monocytes count',
    'PLT distribution width',
    'globulin',
    'γ-glutamyl transpeptidase',
    'International standard ratio',
    'basophil count(#)',
    'mean corpuscular hemoglobin ',
    'Activation of partial thromboplastin time',
    'Hypersensitive c-reactive protein',
    'HIV antibody quantification',
    'serum sodium',
    'thrombocytocrit',
    'ESR',
    'glutamic-pyruvic transaminase',
    'eGFR',
    'creatinine',
]

# Columns to normalize: age, every lab test, and length of stay.
normalize_features = ['Age', *labtest_features, 'LOS']

# Both the imputation list and the random-masking list are the lab-test list
# itself (same list object, not copies).
require_impute_features = labtest_features
drop_columns = labtest_features

In [None]:
# Load tjh_dataset_formatted.csv from <data_dir>/processed into a DataFrame.
dataset_path = os.path.join(data_dir, "processed", "tjh_dataset_formatted.csv")
df = pd.read_csv(dataset_path)

In [None]:
# Sub-folder (under <data_dir>/processed) that receives this run's outputs.
fold_name = 'fold_9'

# Blank out 90% of the observed values in every lab-test column (seeded).
df = drop_features_randomly(df, 90, drop_columns=drop_columns, seed=SEED)

## Stratified split of the dataset into `Training`, `Validation` and `Test` sets

- The split is made per patient (all records of a patient stay in the same set) and is stratified on the `Outcome` column
- Baseline: 70% Training, 10% Validation, 20% Test (no calibration set)
    - Names: train, val, test


In [None]:
# Split at the patient level: one entry per PatientID, so that every record
# of a patient ends up in the same subset.
grouped = df.groupby('PatientID')
patients = np.array(list(grouped.groups))


def _first_outcomes(patient_ids):
    """Return the 'Outcome' of the first row of each given patient, in order."""
    return np.array([grouped.get_group(pid)['Outcome'].iloc[0] for pid in patient_ids])


# Step 1: hold out 20% of the patients as the test set, stratified on outcome.
patients_outcome = _first_outcomes(patients)
train_val_patients, test_patients = train_test_split(
    patients,
    test_size=20/100,
    random_state=SEED,
    stratify=patients_outcome,
)

# Step 2: take 10/80 of the remaining patients (10% of all patients) as the
# validation set, again stratified on outcome; the rest is the training set.
train_val_patients_outcome = _first_outcomes(train_val_patients)
train_patients, val_patients = train_test_split(
    train_val_patients,
    test_size=10/80,
    random_state=SEED,
    stratify=train_val_patients_outcome,
)


In [None]:
# Select the rows of each subset by patient membership.
train_df, val_df, test_df = (
    df[df['PatientID'].isin(subset_ids)]
    for subset_ids in (train_patients, val_patients, test_patients)
)


In [None]:
# Output folder for this run: <data_dir>/processed/<fold_name>
save_dir = os.path.join(data_dir, 'processed', fold_name)
os.makedirs(save_dir, exist_ok=True)

# NOTE: the raw (pre-normalization) train/val/test splits are not written to
# disk; only the processed artifacts produced further down are saved here.

In [None]:
# Normalize age, lab tests and LOS with normalize_dataframe (utils.tools).
# Per the original author's note, the mean/std are computed on the train set
# using the data within its 5%-95% quantile range -- TODO confirm in utils.tools.
(
    train_df,
    val_df,
    test_df,
    default_fill,
    los_info,
    train_mean,
    train_std,
) = normalize_dataframe(train_df, val_df, test_df, normalize_features)

# The z-scored dataframes themselves are not saved.

# Forward imputation after grouping by PatientID.
# Per the original author's note, a lab test a patient never took is filled
# with the median computed on the train set -- TODO confirm in utils.tools.
fill_args = (
    default_fill,
    demographic_features,
    labtest_features,
    target_features,
    require_impute_features,
)
train_x, train_y, train_pid = forward_fill_pipeline(train_df, *fill_args)
val_x, val_y, val_pid = forward_fill_pipeline(val_df, *fill_args)
test_x, test_y, test_pid = forward_fill_pipeline(test_df, *fill_args)

# Pickle every artifact into save_dir (dict order == write order).
artifacts = {
    "train_x.pkl": train_x,
    "train_y.pkl": train_y,
    "train_pid.pkl": train_pid,
    "val_x.pkl": val_x,
    "val_y.pkl": val_y,
    "val_pid.pkl": val_pid,
    "test_x.pkl": test_x,
    "test_y.pkl": test_y,
    "test_pid.pkl": test_pid,
    "los_info.pkl": los_info,  # LOS statistics returned by normalize_dataframe
}
for file_name, payload in artifacts.items():
    pd.to_pickle(payload, os.path.join(save_dir, file_name))