In [1]:
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import ensemble, linear_model, metrics, model_selection, tree


# Create and Evaluate Models


In [2]:
HYPER_max_pad = 10

os.makedirs("./results/stats", exist_ok=True)
categorical_vars = []
for i in range(HYPER_max_pad):
    categorical_vars.extend(
        [
            f"{x}_{i}"
            for x in (
                "Glasgow Coma Score - Verbal Response",
                "Glasgow Coma Score - Motor Response",
                "Glasgow Coma Score - Eye Opening",
                "Glasgow Coma Score - Total",
                "Circadian rhythm",
                "Richmond agitation-sedation scale",
                "Ventilator Airway Code",
            )
        ]
    )

proto_models = (
    (
        "Logistic Regression",
        linear_model.LogisticRegression(
            n_jobs=-1, random_state=666, class_weight="balanced"
        ),
    ),
    (
        "Random Forest",
        ensemble.RandomForestClassifier(
            n_jobs=-1, random_state=666, class_weight="balanced"
        ),
    ),
    ("Decision Trees", tree.DecisionTreeClassifier(random_state=666)),
    ("Gradient Boosting", ensemble.GradientBoostingClassifier(random_state=666)),
)

for model_name, proto_model in proto_models:
    for cl, cl_name in ((1, "Normal"), (2, "Mild"), (3, "Severe")):
        print(f"Model: {model_name}\nClass: {cl_name}")
        X = pd.read_csv(
            f"./results/splits/X_train_{cl}_{cl_name}_padded_translated.csv"
        )
        y = pd.read_csv(f"./results/splits/y_train_{cl}_{cl_name}.csv")
        X_test = pd.read_csv(
            f"./results/splits/X_test_{cl}_{cl_name}_padded_translated.csv"
        )
        y_test = pd.read_csv(f"./results/splits/y_test_{cl}_{cl_name}.csv")

        # drop useless columns (both X and X_test have the same columns)
        cols_to_drop = categorical_vars
        for col in X.columns:
            if (
                "Patient ID" in col  # remove all "Patient ID_N"
                or "Sequential ID" in col  # remove all "Sequential ID_N"
            ):
                cols_to_drop.append(col)

        X.drop(labels=cols_to_drop, axis=1, inplace=True)
        X_test.drop(labels=cols_to_drop, axis=1, inplace=True)

        cv = model_selection.StratifiedKFold(
            n_splits=10, random_state=666, shuffle=True
        )
        res = model_selection.cross_validate(
            estimator=proto_model,
            X=X,
            y=y.values.ravel(),
            scoring="roc_auc",
            cv=cv,
            n_jobs=-1,
            return_estimator=True,
        )
        scores = list(res["test_score"])
        models = list(res["estimator"])
        model = models[scores.index(max(scores))]

        y_predicted = model.predict(X=X_test)
        print(
            metrics.classification_report(
                y_true=y_test,
                y_pred=y_predicted,
                labels=[0, 1],
                target_names=["Positive", "Negative"],
            )
        )
        print(
            f"ROC AUC Score: {metrics.roc_auc_score(y_true=y_test, y_score=y_predicted)}"
        )
        metrics.plot_roc_curve(estimator=model, X=X_test, y=y_test, pos_label=0)
        plt.tight_layout()
        plt.savefig(f"./results/stats/{model_name}_{cl}_{cl_name}.png")
        plt.close()


Model: Logistic Regression
Class: Normal
              precision    recall  f1-score   support

    Positive       0.94      0.57      0.71     13031
    Negative       0.12      0.65      0.21      1242

    accuracy                           0.57     14273
   macro avg       0.53      0.61      0.46     14273
weighted avg       0.87      0.57      0.67     14273

ROC AUC Score: 0.6065360552953684
Model: Logistic Regression
Class: Mild
              precision    recall  f1-score   support

    Positive       0.51      0.55      0.53      1982
    Negative       0.74      0.70      0.72      3532

    accuracy                           0.65      5514
   macro avg       0.62      0.63      0.62      5514
weighted avg       0.65      0.65      0.65      5514

ROC AUC Score: 0.6252194152811316
Model: Logistic Regression
Class: Severe
              precision    recall  f1-score   support

    Positive       0.45      0.56      0.50       793
    Negative       0.81      0.73      0.77     