In [1]:
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import ensemble, linear_model, metrics, model_selection, tree


# Create and Evaluate Models


In [None]:
# Number of positional suffixes (0 .. HYPER_max_pad - 1) generated for every
# categorical variable below.
# NOTE(review): presumably the padded sequence length of the input data - confirm.
HYPER_max_pad = 10

# Make sure the output directory exists (no error if it is already there).
os.makedirs("./results/stats", exist_ok=True)

# Base names of the variables that must be read as pandas "category" columns.
categorical_bases = (
    "Glasgow Coma Score - Verbal Response",
    "Glasgow Coma Score - Motor Response",
    "Glasgow Coma Score - Eye Opening",
    "Glasgow Coma Score - Total",
    "Circadian rhythm",
    "Richmond agitation-sedation scale",
    "Ventilator Airway Code",
)

# Full column names "<base>_<position>", grouped by position: all variables for
# position 0 first, then all variables for position 1, and so on.
categorical_vars = [
    f"{base}_{position}"
    for position in range(HYPER_max_pad)
    for base in categorical_bases
]

# Every estimator below is seeded with the same value so runs are repeatable.
model_seed = 666

# (display name, unfitted estimator) pairs, evaluated in this order.
# Only the logistic regression and the random forest use balanced class weights.
models = (
    (
        "Logistic Regression",
        linear_model.LogisticRegression(
            class_weight="balanced", random_state=model_seed
        ),
    ),
    (
        "Random Forest",
        ensemble.RandomForestClassifier(
            class_weight="balanced", random_state=model_seed
        ),
    ),
    (
        "Decision Trees",
        tree.DecisionTreeClassifier(random_state=model_seed),
    ),
    (
        "Gradient Boosting",
        ensemble.GradientBoostingClassifier(random_state=model_seed),
    ),
)

# Evaluate every model on every class: merge that class's train and test
# splits, run 5-fold stratified cross-validation, and save one figure per
# (model, class) pair with the per-fold ROC curves, the mean ROC curve and a
# +/- 1 standard deviation band.

# dtype mapping for the categorical columns; it is the same for every CSV, so
# build it once instead of twice per iteration.
categorical_dtypes = {k: "category" for k in categorical_vars}

for model_name, model in models:
    for cl, cl_name in ((1, "Normal"), (2, "Mild"), (3, "Severe")):
        print(f"Model: {model_name} - Class: {cl_name}")
        X_train = pd.read_csv(
            f"./results/splits/X_train_{cl}_{cl_name}_padded_translated.csv",
            dtype=categorical_dtypes,
        )
        y_train = pd.read_csv(f"./results/splits/y_train_{cl}_{cl_name}.csv")
        X_test = pd.read_csv(
            f"./results/splits/X_test_{cl}_{cl_name}_padded_translated.csv",
            dtype=categorical_dtypes,
        )
        y_test = pd.read_csv(f"./results/splits/y_test_{cl}_{cl_name}.csv")

        # Stack train on top of test. `DataFrame.append` was removed in
        # pandas 2.0; `pd.concat` gives the same row order and keeps the
        # original index values. All row selection below is positional.
        X = pd.concat([X_train, X_test])
        y = pd.concat([y_train, y_test])

        # Drop identifier columns (every "Patient ID_N" / "Sequential ID_N").
        cols_to_drop = [
            col
            for col in X.columns
            if "Patient ID" in col or "Sequential ID" in col
        ]
        X = X.drop(columns=cols_to_drop)

        # One-hot encode the remaining categorical/object columns.
        X = pd.get_dummies(data=X)
        cv = model_selection.StratifiedKFold(n_splits=5, random_state=666, shuffle=True)

        tprs = []
        aucs = []
        # Common FPR grid onto which each fold's ROC curve is interpolated so
        # that the curves can be averaged point-wise.
        mean_fpr = np.linspace(0, 1, 100)

        fig, ax = plt.subplots(figsize=(20, 10))
        try:
            for i, (train, validation) in enumerate(cv.split(X, y)):
                # `fit` is called again on every fold with that fold's
                # training rows.
                model.fit(X=X.iloc[train], y=y.iloc[train].values.ravel())
                # NOTE(review): label 0 is treated as the positive class -
                # confirm this matches the label encoding in the y_*.csv files.
                viz = metrics.RocCurveDisplay.from_estimator(
                    estimator=model,
                    X=X.iloc[validation],
                    y=y.iloc[validation].values.ravel(),
                    name=f"ROC fold {i}",
                    pos_label=0,
                    alpha=0.3,
                    lw=1,
                    ax=ax,
                )
                interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
                interp_tpr[0] = 0.0  # force the curve to start at (0, 0)
                tprs.append(interp_tpr)
                aucs.append(viz.roc_auc)

            # Diagonal reference line.
            ax.plot(
                [0, 1],
                [0, 1],
                linestyle="--",
                lw=2,
                color="r",
                label="Chance",
                alpha=0.8,
            )

            mean_tpr = np.mean(tprs, axis=0)
            mean_tpr[-1] = 1.0  # force the mean curve to end at (1, 1)
            mean_auc = metrics.auc(mean_fpr, mean_tpr)
            std_auc = np.std(aucs)
            ax.plot(
                mean_fpr,
                mean_tpr,
                color="b",
                label=f"Mean ROC (AUC = {mean_auc:0.2f} $\\pm$ {std_auc:0.2f})",
                lw=2,
                alpha=0.8,
            )

            # Shaded band of +/- 1 standard deviation, clipped to [0, 1].
            std_tpr = np.std(tprs, axis=0)
            tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
            tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
            ax.fill_between(
                mean_fpr,
                tprs_lower,
                tprs_upper,
                color="grey",
                alpha=0.2,
                label="$\\pm$ 1 std. dev.",
            )

            ax.set(
                xlim=[-0.05, 1.05],
                ylim=[-0.05, 1.05],
                title=f"Model: {model_name} - Class: {cl_name}",
            )
            ax.legend(loc="lower right")
            fig.tight_layout()
            fig.savefig(f"./results/stats/{model_name}_{cl}_{cl_name}.png")
        finally:
            # Close this specific figure even if a fold fails, so figures do
            # not pile up in memory across the 12 iterations.
            plt.close(fig)
