In [1]:
import pandas as pd
import numpy as np
import os
import glob
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, log_loss, hinge_loss
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_rel, wilcoxon, shapiro
from sklearn.model_selection import GridSearchCV
import gc
import itertools
from sklearn.utils import resample
import ast
import json
import re
from sklearn.dummy import DummyClassifier

import utils 
import model_train
from constants import *


In [2]:
# Absolute path to the single input CSV (sepsis patient table) read by this notebook.
# NOTE(review): this is a machine-specific location; the cell only runs where the file exists.
path = '/Users/yusiwei/Library/CloudStorage/OneDrive-Personal/research/Third Year Paper/experiments/PSM-SepsisPatient.csv'
# Load the full CSV into a DataFrame; `df` is the raw input handed to the encoding helper in later cells.
df = pd.read_csv(path)
# Bare last expression: Jupyter renders a (truncated) preview of the frame as the cell output.
df

Unnamed: 0,PatientIdentifier,SepsisFlag,GenderDescription,RaceDescription,EthnicGroupDescription,AgeCategory,LOSDays,NumberofVisits,HX_AIDS,HX_ALCOHOL,...,Antiviral_AdminFlag,0-6HoursToFirstAntiviralAdmin,6-12HoursToFirstAntiviralAdmin,12-24HoursToFirstAntiviralAdmin,>24HoursToFirstAntiviralAdmin,Antifungal_AdminFlag,0-6HoursToFirstAntifungalAdmin,6-12HoursToFirstAntifungalAdmin,12-24HoursToFirstAntifungalAdmin,>24HoursToFirstAntifungalAdmin
0,4,1,Male,White,Non-Hispanic or Latino,83,24,3,0,0,...,0,0,0,0,0,1,0,0,0,1
1,19,0,Female,Black or African American,Non-Hispanic or Latino,47,1,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2,23,0,Male,White,Non-Hispanic or Latino,56,3,1,2,2,...,0,0,0,0,0,0,0,0,0,0
3,26,0,Male,Black or African American,Non-Hispanic or Latino,48,1,2,2,2,...,0,0,0,0,0,0,0,0,0,0
4,30,0,Male,Black or African American,Non-Hispanic or Latino,58,4,16,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23101,119770,1,Male,Black or African American,Non-Hispanic or Latino,37,40,2,0,0,...,0,0,0,0,0,1,0,0,0,1
23102,119787,1,Male,Black or African American,Non-Hispanic or Latino,54,3,5,0,0,...,0,0,0,0,0,0,0,0,0,0
23103,119790,0,Male,Unavailable,Unknown,58,1,1,2,2,...,0,0,0,0,0,0,0,0,0,0
23104,119873,0,Male,Black or African American,Non-Hispanic or Latino,73,1,2,2,2,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Candidate classifiers, keyed by the short label that is printed during training
# and stored in the 'ML_Model' column of the results.
# Every estimator that is given a seed uses random_state=42; GaussianNB is constructed
# with its defaults. The mapping is flattened to an ordered list of (label, estimator)
# tuples, which is the shape the training loop unpacks.
models = list(
    {
        "DT": DecisionTreeClassifier(random_state=42, criterion='entropy'),
        "LR": LogisticRegression(
            solver='lbfgs',
            max_iter=100,
            n_jobs=-1,
            random_state=42,
        ),
        "NB": GaussianNB(),
        "NN": MLPClassifier(random_state=42),
        "RF": RandomForestClassifier(random_state=42, criterion='entropy'),
        # The "SVM" entry is a linear SVM (LinearSVC), not the kernel SVC.
        "SVM": LinearSVC(random_state=42),
    }.items()
)

In [4]:
# Produce the encoded frame consumed by the model_train helpers.
# NOTE(review): presumably this converts the categorical columns of `df` to numeric
# codes — confirm against utils.encode_categorical_from_file.
df_encoded = utils.encode_categorical_from_file(df)

# Reference value reported as the "good loss", obtained with the same bootstrap
# settings (100 resamples, 20% test split) that the per-model training uses.
# NOTE(review): what get_model_standard fits and which loss it returns is not visible here.
good_loss = model_train.get_model_standard(df_encoded, n_bootstrap=100, test_size=0.2)
# Backward-compatible alias: the variable was previously spelled `googd_loss`.
googd_loss = good_loss
print(f"Good loss: {good_loss}")

Good loss: 5.384135749639681


In [None]:
# Start from an empty DataFrame (no rows, no columns); the training cell below
# concatenates the per-model metric rows onto it.
results_df = pd.DataFrame()

In [None]:
# Train every configured model and gather its evaluation metrics.
# The per-model frames are collected in a list and concatenated once after the loop,
# rather than re-copying the growing results_df with pd.concat on every iteration.
# If training fails midway, the frames finished so far remain in `per_model_results`.
per_model_results = []

for name, model in models:
    print(f"Training model: {name}")

    # A freshly encoded frame is built for each model.
    # NOTE(review): if the encoder is pure and train_model_bootstrap does not modify
    # its input, this call could be hoisted above the loop — confirm before changing.
    df_encoded = utils.encode_categorical_from_file(df)

    # The helper returns ten sequences (metrics plus confusion-matrix counts);
    # same bootstrap settings for every model: 100 resamples, 20% test split.
    (
        accuracies, precisions, recalls, f1_scores, aucs,
        losses, tps, tns, fps, fns,
    ) = model_train.train_model_bootstrap(
        df_encoded, name, model, n_bootstrap=100, test_size=0.2
    )

    # One row per returned element; the scalar model label is broadcast to every row.
    result = pd.DataFrame({
        'Accuracy': accuracies,
        'Precision': precisions,
        'Recall': recalls,
        'F1 Score': f1_scores,
        'AUC': aucs,
        'Loss': losses,
        'TP': tps,
        'TN': tns,
        'FP': fps,
        'FN': fns,
        'ML_Model': name
    })
    per_model_results.append(result)

# Append to whatever results_df already holds (empty after the initialisation cell).
# ignore_index=False keeps each frame's own row labels, so labels repeat across models.
results_df = pd.concat([results_df, *per_model_results], ignore_index=False)

In [None]:
# Show the accumulated metrics table (Jupyter renders a cell's last expression).
results_df

In [None]:
# Save the results to a CSV file; index=False leaves the DataFrame row labels out of the file.
# The path is a plain string literal (it has no placeholders, so no f-string is needed).
# NOTE(review): the file name says "LR_rerun" although the training loop iterates over
# every entry in `models` — confirm the name matches what was actually run.
# NOTE(review): absolute, machine-specific output location; the target folder must already exist.
results_df.to_csv("/Users/yusiwei/Library/CloudStorage/OneDrive-Personal/research/Fourth Year Paper/Experiments/2nd experiments/Baseline experiment results/Sepsis_baseline_LR_rerun_results.csv", index=False)