In [1]:
import os
import textwrap

import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from numpy import nanmean as np_nanmean
from numpy import nanmedian as np_nanmedian
from sklearn import model_selection


# Compute Missing Rates


In [2]:
# Load the cohort: the file holds one integer patient id per line.
with open(
    "./results/patients_18+_1d+_2+lactate_measurements_in_first2d_binned.txt", "r"
) as f:
    patientids = [int(x) for x in f.readlines()]

os.makedirs("./results/missing_rate_plots/data", exist_ok=True)
# Collect one record per patient and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and calling it inside a loop
# re-copied the whole frame on every iteration.
missing_rate_records = list()
for pid in patientids:
    df_binned = pd.read_csv(f"./results/{pid}/{pid}_dynamic_binned2h_48h.csv")
    pid_rates = dict()
    # "Total Records" must stay the first key: later cells skip it positionally
    pid_rates["Total Records"] = len(df_binned["Patient ID"])
    # skip Patient ID and Minutes of Stay
    for col in df_binned.columns[2:]:
        # number of missing values of this variable for this patient
        pid_rates[col] = df_binned[col].isnull().sum()
    missing_rate_records.append(pid_rates)
df_missing_rates = pd.DataFrame(missing_rate_records)
# set pids as index
df_missing_rates.set_index(
    pd.Index(patientids, name="Patient ID"), drop=True, inplace=True
)
df_missing_rates.to_csv("./results/missing_rate_plots/data/missing_rates.csv")


# Drop Columns Having >40% Missing Rate (Patient-Level)


In [3]:
# Per-patient missing-value counts; the first column is "Total Records".
df_missing_rates = pd.read_csv(
    "./results/missing_rate_plots/data/missing_rates.csv", index_col=0
)
total_pid = len(df_missing_rates)

# variable -> (patients with no record of it at all, patients with at least one)
missingness = dict()
# skip Total Records
for col in df_missing_rates.columns[1:]:
    # a patient lacks the variable entirely when every one of their records is missing
    never_recorded = df_missing_rates[col] == df_missing_rates["Total Records"]
    missing_pid = int(never_recorded.sum())
    missingness[col] = (missing_pid, total_pid - missing_pid)

df_availability = pd.DataFrame(
    data=[
        (variable, (total_pid - counts[0]) / total_pid)
        for variable, counts in missingness.items()
    ],
    columns=["Variables", "Available %"],
)
# keep only the variables available for more than 60% of the patients
is_available = df_availability["Available %"] > 0.60
columns = ["Minutes of Stay", "Patient ID"]
columns += df_availability.loc[is_available, "Variables"].tolist()

with open(
    "./results/patients_18+_1d+_2+lactate_measurements_in_first2d_binned.txt", "r"
) as f:
    patientids = [int(line) for line in f]

# rewrite every patient's table with the retained columns only
for pid in patientids:
    df_binned = pd.read_csv(f"./results/{pid}/{pid}_dynamic_binned2h_48h.csv")
    df_binned = df_binned[columns].set_index("Minutes of Stay", drop=True)
    df_binned.to_csv(f"./results/{pid}/{pid}_dynamic_binned2h_48h_dropped.csv")


# Drop Empty Rows


In [4]:
with open(
    "./results/patients_18+_1d+_2+lactate_measurements_in_first2d_binned.txt", "r"
) as f:
    patientids = [int(line) for line in f]

for pid in patientids:
    dropped_path = f"./results/{pid}/{pid}_dynamic_binned2h_48h_dropped.csv"
    df_binned_pre = pd.read_csv(dropped_path)
    # a row is empty when every column after the first two is missing
    df_binned_post = df_binned_pre.dropna(how="all", subset=df_binned_pre.columns[2:])
    if df_binned_pre.equals(df_binned_post):
        # nothing was removed: leave the file on disk as it is
        continue
    removed = len(df_binned_pre) - len(df_binned_post)
    print(f"{pid} had {removed} empty rows which have been removed.")
    df_binned_post = df_binned_post.set_index("Minutes of Stay", drop=True)
    df_binned_post.to_csv(dropped_path)


138 had 1 empty rows which have been removed.
660 had 1 empty rows which have been removed.
808 had 1 empty rows which have been removed.
809 had 1 empty rows which have been removed.
889 had 1 empty rows which have been removed.
1617 had 1 empty rows which have been removed.
2136 had 1 empty rows which have been removed.
2340 had 1 empty rows which have been removed.
2370 had 1 empty rows which have been removed.
2489 had 1 empty rows which have been removed.
2611 had 1 empty rows which have been removed.
2647 had 1 empty rows which have been removed.
2768 had 3 empty rows which have been removed.
3060 had 1 empty rows which have been removed.
3235 had 2 empty rows which have been removed.
3714 had 1 empty rows which have been removed.
3861 had 1 empty rows which have been removed.
3877 had 1 empty rows which have been removed.
4198 had 1 empty rows which have been removed.
4512 had 1 empty rows which have been removed.
4621 had 1 empty rows which have been removed.
5167 had 1 empty r

# Recompute Missing Rates


In [5]:
# Load the cohort: the file holds one integer patient id per line.
with open(
    "./results/patients_18+_1d+_2+lactate_measurements_in_first2d_binned.txt", "r"
) as f:
    patientids = [int(x) for x in f.readlines()]

os.makedirs("./results/missing_rate_plots/data", exist_ok=True)
# Collect one record per patient and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and calling it inside a loop
# re-copied the whole frame on every iteration.
missing_rate_records = list()
for pid in patientids:
    df_binned = pd.read_csv(f"./results/{pid}/{pid}_dynamic_binned2h_48h_dropped.csv")
    pid_rates = dict()
    # "Total Records" must stay the first key: later cells skip it positionally
    pid_rates["Total Records"] = len(df_binned["Patient ID"])
    # skip Patient ID and Minutes of Stay
    for col in df_binned.columns[2:]:
        # number of missing values of this variable for this patient
        pid_rates[col] = df_binned[col].isnull().sum()
    missing_rate_records.append(pid_rates)
df_missing_rates = pd.DataFrame(missing_rate_records)
# set pids as index
df_missing_rates.set_index(
    pd.Index(patientids, name="Patient ID"), drop=True, inplace=True
)
df_missing_rates.to_csv("./results/missing_rate_plots/data/missing_rates_dropped.csv")


# Plot Variables' Missing Rates


In [6]:
# retrieved from: https://stackoverflow.com/questions/6170246/how-do-i-use-matplotlib-autopct/6170354#6170354
def make_autopct(values):
    """Build an ``autopct`` callback that labels a pie wedge with percentage and count.

    Parameters
    ----------
    values : iterable of numbers
        The wedge sizes given to ``pie``; only their sum is used.

    Returns
    -------
    callable
        Maps a wedge percentage ``pct`` to ``"<pct with 2 decimals>%  (<count>)"``,
        where the count is ``pct`` percent of the total, rounded to an integer.
    """
    # Sum once up front: the callback runs once per wedge, and a one-shot
    # iterable (e.g. a generator) would otherwise sum to 0 from the second call on.
    total = sum(values)

    def my_autopct(pct):
        return f"{pct:.2f}%  ({int(round(pct * total / 100.0)):d})"

    return my_autopct


df = pd.read_csv("./results/missing_rate_plots/data/missing_rates.csv", index_col=0)
df_dropped = pd.read_csv(
    "./results/missing_rate_plots/data/missing_rates_dropped.csv", index_col=0
)
total_pid = len(df)
# variable -> {"ds": ..., "pid": ..., "dropped_ds": ...}; every value is a
# (missing, present) pair and "dropped_ds" exists only for retained variables
missingness = dict()
# characters that are replaced by "_" in the output file name
invalid_chars = str.maketrans({c: "_" for c in ("\\", "/", ":", "*", "?", '"', "<", ">", "|")})
# skip Total Records
for col in df.columns[1:]:
    # missing values of this variable vs. all records of the dataset
    missing_ds = df[col].sum()
    total_ds = df["Total Records"].sum()
    # patients not having any record of this variable
    missing_pid = int((df[col] == df["Total Records"]).sum())

    filename = col.translate(invalid_chars)

    missingness[col] = {
        "ds": (missing_ds, total_ds - missing_ds),
        "pid": (missing_pid, total_pid - missing_pid),
    }
    # (key in missingness[col], panel title) for every pie of the figure
    panels = [
        ("pid", f"{col} Missing Rate (Patients)"),
        ("ds", f"{col} Missing Rate (Dataset)"),
    ]
    if col in df_dropped.columns:
        # the variable survived the column drop: add the "after drop" figures
        missing_dropped_ds = df_dropped[col].sum()
        total_dropped_ds = df_dropped["Total Records"].sum()
        missingness[col]["dropped_ds"] = (
            missing_dropped_ds,
            total_dropped_ds - missing_dropped_ds,
        )
        panels.append(("dropped_ds", f"{col} Missing Rate (Dataset) AFTER Drop"))

    # one 10x10 pie per panel, side by side
    fig, axes = plt.subplots(ncols=len(panels), figsize=(10 * len(panels), 10))
    for ax, (key, panel_title) in zip(axes, panels):
        # https://stackoverflow.com/questions/10351565/how-do-i-fit-long-title
        ax.set_title("\n".join(textwrap.wrap(panel_title, 45)), fontsize=25)
        ax.pie(
            x=missingness[col][key],
            labels=["Missing", "Present"],
            explode=[0.05, 0.05],
            autopct=make_autopct(missingness[col][key]),
            textprops=dict(fontsize=22),
        )
        ax.axis("equal")
    plt.tight_layout()
    plt.savefig(f"./results/missing_rate_plots/{filename}.png")
    plt.close()


def _plot_availability_half(half, title, ylim):
    """Draw one availability bar chart and save it as ``{title}.png``."""
    fig, ax = plt.subplots(figsize=(50, 27))
    sns.barplot(data=half, x="Variables", y="Available %", color="blue", ax=ax)
    ax.grid(axis="y")
    ax.set_title(title, fontsize=35)
    ax.set_ylim(ylim)
    ax.set_xlabel("Variables", fontsize=27)
    ax.set_ylabel("Availability %", fontsize=27)
    # disable scientific notation
    ax.ticklabel_format(style="plain", axis="y")
    # wrap long variable names so they fit below the axis
    # https://stackoverflow.com/questions/10351565/how-do-i-fit-long-title
    ax.set_xticks(list(range(len(half["Variables"]))))
    ax.set_xticklabels(
        ["\n".join(textwrap.wrap(f"{x}", 60)) for x in half["Variables"].tolist()],
        rotation=90,
        fontsize=25,
    )
    ax.set_yticks([x / 10 for x in range(11)])
    ax.tick_params(axis="y", labelsize=25)
    fig.tight_layout()
    fig.savefig(f"./results/missing_rate_plots/{title}.png")
    plt.close(fig)


def plot_total_missingness(data, title, ylim):
    """Save two bar charts of per-variable availability, one per half of the variables.

    Variables are ordered by decreasing "Available %". The first half is saved
    to ``./results/missing_rate_plots/{title} 1st HALF.png`` and the remainder
    to ``./results/missing_rate_plots/{title} 2nd HALF.png``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns "Variables" and "Available %". It is not modified.
    title : str
        Title prefix of both figures; also used in the output file names.
    ylim : tuple
        (bottom, top) limits of the y axis.
    """
    # sort a copy so the caller's frame is left untouched
    ranked = data.sort_values(by="Available %", ascending=False)
    middle = len(ranked) // 2
    _plot_availability_half(ranked.iloc[:middle], f"{title} 1st HALF", ylim)
    _plot_availability_half(ranked.iloc[middle:], f"{title} 2nd HALF", ylim)


# Every entry of `missingness` stores (missing, present) pairs, so availability
# is the share of the *present* element (index 1), not of the missing one.

# share of patients having at least one record of each variable
plot_total_missingness(
    data=pd.DataFrame(
        data=[
            (k, (total_pid - v["pid"][0]) / total_pid) for k, v in missingness.items()
        ],
        columns=["Variables", "Available %"],
    ),
    title="AVAILABILITY BY PATIENTS (PERCENTAGE)",
    ylim=(0, 1),
)
# share of non-missing records of each variable over the whole dataset
plot_total_missingness(
    data=pd.DataFrame(
        data=[
            (k, v["ds"][1] / (v["ds"][0] + v["ds"][1])) for k, v in missingness.items()
        ],
        columns=["Variables", "Available %"],
    ),
    title="AVAILABILITY BY RECORDS (PERCENTAGE)",
    ylim=(0, 1),
)
# same as above, restricted to the variables retained after the drop
plot_total_missingness(
    data=pd.DataFrame(
        data=[
            (k, v["dropped_ds"][1] / (v["dropped_ds"][0] + v["dropped_ds"][1]))
            for k, v in missingness.items()
            if "dropped_ds" in v
        ],
        columns=["Variables", "Available %"],
    ),
    title="AVAILABILITY BY RECORDS (PERCENTAGE) AFTER DROP",
    ylim=(0, 1),
)


# Create Class/Target Labels


In [7]:
# cohort of patient ids: one integer per line of the text file
with open(
    "./results/patients_18+_1d+_2+lactate_measurements_in_first2d_binned.txt", "r"
) as f:
    patientids = [int(line) for line in f]


def get_class(row):
    """Map the "Lactate" value of a row to its class.

    Returns 1 for values below 2, 2 for values in [2, 4), 3 for values of 4 or
    more, and None when the value is missing.
    """
    lactate = row["Lactate"]
    if not pd.notna(lactate):
        return None
    # (exclusive upper bound, class) pairs, checked in ascending order
    for upper_bound, label in ((2, 1), (4, 2)):
        if lactate < upper_bound:
            return label
    return 3


for pid in patientids:
    df = pd.read_csv(f"./results/{pid}/{pid}_dynamic_binned2h_48h_dropped.csv")
    # label current lactate according to class
    df["Lactate_Class"] = df.apply(get_class, axis=1)
    # Carry the last known class forward. The result is assigned back because a
    # chained `df[col].ffill(inplace=True)` works on a temporary Series and does
    # not update `df` when pandas Copy-on-Write is active.
    df["Lactate_Class"] = df["Lactate_Class"].ffill()

    # class of the closest later row that holds an actual lactate measurement
    last_class = None
    last_outcome = None
    outcomes = list()
    # start from the bottom
    df_reversed = df.iloc[::-1].copy()
    # compute outcomes
    for tup in df_reversed.itertuples():
        # None while no later measurement exists; 0 when the later class is
        # lower than the current one or both are 1; 1 in every other case
        last_outcome = (
            None
            if last_class is None
            else (
                0
                if last_class < tup.Lactate_Class
                or (last_class == 1 and tup.Lactate_Class == 1)
                else 1
            )
        )
        outcomes.append(last_outcome)
        # updated only after the outcome is stored, so a row is never compared
        # with its own measurement
        if pd.notna(tup.Lactate):
            last_class = tup.Lactate_Class
    # label current lactate outcome according to previous one (next in terms of time)
    df_reversed["Lactate_Outcome"] = outcomes
    # restore chronological order
    df_labelled = df_reversed.iloc[::-1].copy()
    df_labelled.set_index(
        "Minutes of Stay",
        drop=True,
        inplace=True,
    )
    # fill forward everything except Lactate (this includes Lactate_Outcome)
    columns_no_lactate = df_labelled.columns.tolist()
    columns_no_lactate.remove("Lactate")
    df_labelled[columns_no_lactate] = df_labelled[columns_no_lactate].ffill()
    # drop last row as it cannot be used for prediction
    df_labelled.drop(df_labelled.tail(1).index, inplace=True)
    df_labelled.to_csv(
        f"./results/{pid}/{pid}_dynamic_binned2h_48h_dropped_labelled.csv"
    )


# Merge Everything and Split According to Class


In [8]:
with open(
    "./results/patients_18+_1d+_2+lactate_measurements_in_first2d_binned.txt", "r"
) as f:
    patientids = [int(line) for line in f]

# stack the labelled tables of all patients into a single frame
labelled_frames = [
    pd.read_csv(f"./results/{pid}/{pid}_dynamic_binned2h_48h_dropped_labelled.csv")
    for pid in patientids
]
df_complete = pd.concat(labelled_frames, ignore_index=True)

# one output file per lactate class
for cl, cl_name in ((1, "Normal"), (2, "Mild"), (3, "Severe")):
    is_class = df_complete["Lactate_Class"] == cl
    df_tmp = df_complete.loc[is_class].reset_index(drop=True)
    df_tmp.to_csv(f"./results/class_{cl}_{cl_name}.csv", index=False)


# Add Sequential ID to Lactates' Values and Apply Fill-Forward


In [9]:
# running counter shared by every call of set_sequential_id
i = 0


def set_sequential_id(row):
    """Return the next sequential id for a row that holds a lactate value.

    Rows whose "Lactate" is missing yield None and leave the counter untouched.
    """
    global i
    if not pd.notna(row["Lactate"]):
        return None
    i += 1
    return i


# The counter used by set_sequential_id is not reset between classes, so the
# ids keep increasing from one class file to the next.
for cl, cl_name in ((1, "Normal"), (2, "Mild"), (3, "Severe")):
    df_class = pd.read_csv(f"./results/class_{cl}_{cl_name}.csv")
    # a new id starts at every row holding a lactate value
    df_class["Sequential ID"] = df_class.apply(set_sequential_id, axis=1)
    # Rows without a lactate value inherit the id and the value of the closest
    # preceding row that has one. The results are assigned back because a
    # chained `df[col].ffill(inplace=True)` works on a temporary Series and does
    # not update the frame when pandas Copy-on-Write is active.
    df_class["Sequential ID"] = df_class["Sequential ID"].ffill()
    df_class["Lactate"] = df_class["Lactate"].ffill()
    df_class.to_csv(f"./results/class_{cl}_{cl_name}_sequentialised.csv", index=False)


# Split Datasets into Train-Test and Impute Missing Values


In [10]:
# imputation strategy for categorical variables; possible values: mode
HYPER_categ_fillna = "mode"
# imputation strategy for numerical variables; possible values: mean, median
HYPER_numer_fillna = "median"
print(
    "Imputation Strategies:\n"
    f"Categorical: {HYPER_categ_fillna}\n"
    f"Numerical: {HYPER_numer_fillna}"
)


def custom_mode(x):
    """Return the first mode of the Series ``x``, or None when it has no mode."""
    modes = pd.Series.mode(x)
    if modes.empty:
        return None
    return modes.iloc[0]


def get_fillna_method(x):
    """Return the function computing the fill value for strategy name ``x``.

    "mode" maps to custom_mode, "median" to numpy's nanmedian and "mean" to
    numpy's nanmean; any other name yields None.
    """
    if x == "mode":
        return custom_mode
    if x == "median":
        return np_nanmedian
    if x == "mean":
        return np_nanmean
    return None


os.makedirs("./results/splits", exist_ok=True)

# columns that are never imputed
excluded_vars = (
    "Patient ID",
    "Minutes of Stay",
    "Lactate_Class",
    "Lactate_Outcome",
    "Sequential ID",
)
# columns imputed with the categorical strategy
categorical_vars = (
    "Glasgow Coma Score - Verbal Response",
    "Glasgow Coma Score - Motor Response",
    "Glasgow Coma Score - Eye Opening",
    "Glasgow Coma Score - Total",
    "Circadian rhythm",
    "Richmond agitation-sedation scale",
    "Ventilator Airway Code",
)

for cl, cl_name in ((1, "Normal"), (2, "Mild"), (3, "Severe")):
    df_class = pd.read_csv(f"./results/class_{cl}_{cl_name}_sequentialised.csv")
    grouped = df_class.groupby(df_class["Sequential ID"])

    # split on sequence ids so that all rows of a sequence land in the same split
    train_seq_ids, test_seq_ids = model_selection.train_test_split(
        list(grouped.groups.keys()), train_size=0.75, random_state=666
    )
    in_train = df_class["Sequential ID"].isin(train_seq_ids)
    in_test = df_class["Sequential ID"].isin(test_seq_ids)
    df_train = df_class[in_train]
    df_test = df_class[in_test]

    # features: every column except the target
    X_train = df_train.loc[:, df_train.columns != "Lactate_Outcome"].copy()
    X_test = df_test.loc[:, df_test.columns != "Lactate_Outcome"].copy()

    # targets: one value per sequential id (its first non-missing outcome)
    y_train = df_train.groupby(by="Sequential ID")["Lactate_Outcome"].first()
    y_train.to_csv(f"./results/splits/y_train_{cl}_{cl_name}.csv", index=False)
    y_test = df_test.groupby(by="Sequential ID")["Lactate_Outcome"].first()
    y_test.to_csv(f"./results/splits/y_test_{cl}_{cl_name}.csv", index=False)

    # everything that is neither excluded nor categorical is numerical
    not_numerical = set(excluded_vars) | set(categorical_vars)
    numerical_vars = [x for x in df_class.columns if x not in not_numerical]

    # fill value of every imputed column, computed on the training split only
    impute_categorical = get_fillna_method(HYPER_categ_fillna)
    impute_numerical = get_fillna_method(HYPER_numer_fillna)
    fillna_dict = {x: impute_categorical(X_train[x]) for x in categorical_vars}
    fillna_dict.update({x: impute_numerical(X_train[x]) for x in numerical_vars})

    X_train = X_train.fillna(fillna_dict)
    X_train.to_csv(f"./results/splits/X_train_{cl}_{cl_name}.csv", index=False)

    # impute test with values obtained from train
    X_test = X_test.fillna(fillna_dict)
    X_test.to_csv(f"./results/splits/X_test_{cl}_{cl_name}.csv", index=False)

    print(cl_name)
    print(fillna_dict)


Imputation Strategies:
Categorical: mode
Numerical: median
Normal
{'Glasgow Coma Score - Verbal Response': 5.0, 'Glasgow Coma Score - Motor Response': 6.0, 'Glasgow Coma Score - Eye Opening': 4.0, 'Glasgow Coma Score - Total': 15.0, 'Circadian rhythm': 1.0, 'Richmond agitation-sedation scale': -1.0, 'Ventilator Airway Code': 1.0, 'Alanine aminotransferase': 28.0, 'Anion Gap': 4.800000000000011, 'Aspartate aminotransferase': 43.0, 'Base excess': 0.1, 'Bicarbonate': 24.2, 'Body Height': 170.0, 'Body Weight': 75.0, 'Calcium.ionized': 1.15, 'Carbon dioxide [Partial pressure]': 36.3, 'Central Venous Pressure': 8.183333333333334, 'Chloride': 108.0, 'Creatinine': 80.0, 'Diastolic Arterial Blood Pressure': 55.27272727272727, 'Fraction of Inspired Oxygen': 43.7, 'Glucose': 7.4, 'Heart rate': 84.30471628592483, 'Hemoglobin': 102.0, 'Invasive Diastolic Arterial Blood Pressure': 55.21666666666667, 'Invasive Mean Arterial Blood Pressure': 75.06666666666666, 'Invasive Systolic Arterial Blood Pressur

# Add Padding/Remove Exceeding Rows and Translate to a Single Row


In [11]:
# number of rows every sequence is cut or zero-padded to
HYPER_max_pad = 10

for dataset in ("train", "test"):
    for cl, cl_name in ((1, "Normal"), (2, "Mild"), (3, "Severe")):
        X = pd.read_csv(f"./results/splits/X_{dataset}_{cl}_{cl_name}.csv")
        n_features = len(X.columns)

        # flattened layout: all columns suffixed _0, then all suffixed _1, ...
        new_columns = [
            f"{x}_{step}" for step in range(HYPER_max_pad) for x in X.columns
        ]

        grouped = X.groupby(X["Sequential ID"])

        rows = list()
        for k, group in grouped.groups.items():
            # keep at most HYPER_max_pad rows of the sequence
            kept = group[:HYPER_max_pad]
            X_sub = X.loc[kept]

            # real rows first, laid out one after the other ...
            row = list()
            for pos in range(len(kept)):
                row.extend(X_sub.iloc[pos].tolist())
            # ... then zero rows up to HYPER_max_pad
            row.extend([0] * (n_features * (HYPER_max_pad - len(kept))))
            rows.append(row)

        new_X = pd.DataFrame(data=rows, columns=new_columns)
        new_X.to_csv(
            f"./results/splits/X_{dataset}_{cl}_{cl_name}_padded_translated.csv",
            index=False,
        )
