# Supervised Learning

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import tarfile
import glob
from datetime import datetime
import math
import csv
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

### Q2.1 Classic Machine Learning Methods (5 Pts)
#### Q2.1 1.

In [2]:
# Build the training design matrix from set A: zero-fill missing entries, then keep
# the last row of every RecordID group (numeric columns only).
df = pd.read_parquet('final-data/final-set-a.parquet').fillna(0)
X = (
    df.groupby("RecordID")
    .last(numeric_only=True)
    # RecordID is the group index here; discarding the index keeps it out of the features.
    .reset_index(drop=True)
)
# Alphabetical column order, matching how the test matrix is arranged later on.
X = X[sorted(X.columns)]


In [3]:
# Load the set-A outcomes and keep the in-hospital death flag as a 1-D label array.
# NOTE(review): the rows are assumed to be in the same RecordID order as X — confirm.
y_df = pd.read_parquet('processed-data/processed-outcomes-a.parquet')
y = y_df["In-hospital_death"].to_numpy().flatten()
# Number of positive labels first, then the total number of labels.
print(y.sum(), len(y), sep="\n")

554
4000


Small note: We have observed that there is a class imbalance in the dataset. Out of 4000 entries, only 554 are labelled 1, whereas the rest are 0's. In Q2.1 1, we compare the results obtained when taking the class imbalance into account (using SMOTE) with those obtained when simply ignoring it (without SMOTE).

In [4]:
# Taking class imbalance into account (there are far fewer 1's than 0's in the "In-hospital_death" column:
# 554 positives out of 4000 labels)

from imblearn.over_sampling import SMOTE
import numpy as np

# Initialize SMOTE; the fixed random_state keeps the resampling repeatable across runs
smote = SMOTE(sampling_strategy='auto', random_state=42)

# Apply SMOTE on the entire training set (set A); only X and y from set A are resampled here
X_resampled, y_resampled = smote.fit_resample(X, y)

In [5]:
# Models (WITH taking class imbalance into account)
# All three classifiers are fitted on the SMOTE-resampled training data.

# Logistic Regression
model1 = LogisticRegression(max_iter=500)
model1.fit(X_resampled,y_resampled)

# Random Forest
# Bootstrapping and feature sub-sampling are random, so the seed is pinned (same
# value as the SMOTE step) to make the reported scores reproducible on re-run.
model2 = RandomForestClassifier(random_state=42)
model2.fit(X_resampled,y_resampled)

# KNN
model3 = KNeighborsClassifier()
model3.fit(X_resampled,y_resampled)

In [6]:
# Models (WITHOUT taking class imbalance into account)
# All three classifiers are fitted on the original, imbalanced training data.

# Logistic Regression
model4 = LogisticRegression(max_iter=4300)
model4.fit(X,y)

# Random Forest
# Bootstrapping and feature sub-sampling are random, so the seed is pinned to make
# the reported scores reproducible on re-run.
model5 = RandomForestClassifier(random_state=42)
model5.fit(X,y)

# KNN
model6 = KNeighborsClassifier()
model6.fit(X,y)

In [7]:
# Test set C performance (WITH SMOTE)

# Loading test set C: zero-fill gaps, drop ICUType, then keep the last row per RecordID
df = pd.read_parquet('final-data/final-set-c.parquet').fillna(0).drop(columns=["ICUType"])
X_test = df.groupby("RecordID").last(numeric_only=True).reset_index(drop=True)
X_test = X_test[sorted(X_test.columns)]

y_df = pd.read_parquet('processed-data/processed-outcomes-c.parquet')
y_test = y_df["In-hospital_death"].to_numpy().flatten()

# Positive-class probability from each model trained on the resampled data
y_pred1, y_pred2, y_pred3 = (
    clf.predict_proba(X_test)[:, 1] for clf in (model1, model2, model3)
)

# AuROC and AuPRC for every model, printed in the same order and format
for model_name, y_score in (
    ("Logistic Regression", y_pred1),
    ("Random Forests", y_pred2),
    ("KNN", y_pred3),
):
    print(f"{model_name} results WITH SMOTE")
    auroc = roc_auc_score(y_test, y_score)
    print(f"AUROC: {auroc}")
    auprc = average_precision_score(y_test, y_score)
    print(f"AUPRC: {auprc}", end="\n\n")

Logistic Regression results WITH SMOTE
AUROC: 0.8427190249152182
AUPRC: 0.49761338735781435

Random Forests results WITH SMOTE
AUROC: 0.8318366682934766
AUPRC: 0.447774764474586

KNN results WITH SMOTE
AUROC: 0.7509709551876463
AUPRC: 0.31074860527554427



In [8]:
# Test set C performance (WITHOUT SMOTE)

# Loading test set C: zero-fill gaps, drop ICUType, then keep the last row per RecordID
df = pd.read_parquet('final-data/final-set-c.parquet').fillna(0).drop(columns=["ICUType"])
X_test = df.groupby("RecordID").last(numeric_only=True).reset_index(drop=True)
X_test = X_test[sorted(X_test.columns)]

y_df = pd.read_parquet('processed-data/processed-outcomes-c.parquet')
y_test = y_df["In-hospital_death"].to_numpy().flatten()

# Positive-class probability from each model trained on the original data
y_pred1, y_pred2, y_pred3 = (
    clf.predict_proba(X_test)[:, 1] for clf in (model4, model5, model6)
)

# AuROC and AuPRC for every model, printed in the same order and format
for model_name, y_score in (
    ("Logistic Regression", y_pred1),
    ("Random Forests", y_pred2),
    ("KNN", y_pred3),
):
    print(f"{model_name} results WITHOUT SMOTE")
    auroc = roc_auc_score(y_test, y_score)
    print(f"AUROC: {auroc}")
    auprc = average_precision_score(y_test, y_score)
    print(f"AUPRC: {auprc}", end="\n\n")

Logistic Regression results WITHOUT SMOTE
AUROC: 0.8460812654077661
AUPRC: 0.5059308870423979

Random Forests results WITHOUT SMOTE
AUROC: 0.8363216578443519
AUPRC: 0.5017795764695582

KNN results WITHOUT SMOTE
AUROC: 0.7235331806634882
AUPRC: 0.32825811926007525



#### Q2.1 2.

In [9]:
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.feature_extraction import EfficientFCParameters

In [10]:
extraction_settings = MinimalFCParameters()

# Training (set A) and test (set C) features are extracted in two separate calls
# that share the same settings; ICUType is dropped from set C before extraction.
df_train = pd.read_parquet('final-data/final-set-a.parquet')
df_test = pd.read_parquet('final-data/final-set-c.parquet').drop(columns=["ICUType"])

print(df_train.shape, df_test.shape, sep="\n")

# Zero-fill missing measurements before handing the frames to tsfresh
df_train, df_test = df_train.fillna(0), df_test.fillna(0)

# Identical keyword arguments for both calls: group by RecordID, order by Time
tsfresh_kwargs = dict(
    column_id='RecordID',
    column_sort='Time',
    default_fc_parameters=extraction_settings,
    impute_function=impute,
)
X_train = extract_features(df_train, **tsfresh_kwargs)
X_test = extract_features(df_test, **tsfresh_kwargs)

# Alphabetical column order in both matrices
X_train = X_train[sorted(X_train.columns)]
X_test = X_test[sorted(X_test.columns)]

(196000, 42)
(196000, 42)


Feature Extraction: 100%|██████████| 40/40 [00:29<00:00,  1.36it/s]
Feature Extraction: 100%|██████████| 40/40 [00:27<00:00,  1.45it/s]


In [11]:
# In-hospital death labels as 1-D arrays: set A for training, set C for testing
y_train, y_test = (
    pd.read_parquet(outcomes_path)["In-hospital_death"].to_numpy().flatten()
    for outcomes_path in (
        'processed-data/processed-outcomes-a.parquet',
        'processed-data/processed-outcomes-c.parquet',
    )
)

In [12]:
# Models
# All three classifiers are fitted on the tsfresh features extracted from set A.

# Logistic Regression
model1 = LogisticRegression(max_iter=10000)
model1.fit(X_train,y_train)

# Random Forest
# Bootstrapping and feature sub-sampling are random, so the seed is pinned to make
# the reported scores reproducible on re-run.
model2 = RandomForestClassifier(random_state=42)
model2.fit(X_train,y_train)

# KNN
model3 = KNeighborsClassifier()
model3.fit(X_train,y_train)

In [13]:
# Positive-class probability from each model on the test features
y_pred1, y_pred2, y_pred3 = (
    clf.predict_proba(X_test)[:, 1] for clf in (model1, model2, model3)
)

# AuROC and AuPRC for every model, printed in the same order and format
for model_name, y_score in (
    ("Logistic Regression", y_pred1),
    ("Random Forests", y_pred2),
    ("KNN", y_pred3),
):
    print(f"{model_name} results")
    auroc = roc_auc_score(y_test, y_score)
    print(f"AUROC: {auroc}")
    auprc = average_precision_score(y_test, y_score)
    print(f"AUPRC: {auprc}", end="\n\n")

Logistic Regression results
AUROC: 0.8192384027230294
AUPRC: 0.4438866902255371

Random Forests results
AUROC: 0.8221003366244948
AUPRC: 0.48587095378111955

KNN results
AUROC: 0.6948109772121486
AUPRC: 0.2946395655359523



### Q2.2 Recurrent Neural Networks (4 Pts)

LSTM approach

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [None]:
class PatientDataset(Dataset):
    """Dataset pairing each sample's feature array with its label.

    Args:
        features: Array-like accepted by ``torch.tensor``; stored as float32.
            The first axis indexes samples (the notebook passes an array of
            shape (num_patients, 49, 42)).
        labels: Array-like with one entry per sample; stored as float32.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length.
    """

    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)  # Shape: (num_patients, 49, 42)
        self.labels = torch.tensor(labels, dtype=torch.float32)      # Shape: (num_patients,)
        # Fail fast: without this check a mismatch would only surface later as an
        # IndexError, or leave surplus labels silently unused.
        if len(self.features) != len(self.labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(self.features)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.features[idx], self.labels[idx]

In [None]:
class LSTMModel(nn.Module):
    """Stacked LSTM whose final-timestep output is passed through a linear layer.

    Args:
        input_size: Number of features per timestep.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values produced per sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Map a batch-first sequence tensor to the raw linear output (no sigmoid is applied)."""
        sequence_out, _ = self.lstm(x)
        # Only the output at the last timestep feeds the linear layer.
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

In [None]:
# Hyperparameters for the LSTM model, its optimizer and the data loaders
input_size = 42  # Features per timestep; must equal the last axis of the reshaped sequences
hidden_size = 54  # Size of the LSTM hidden layer
num_layers = 2  # Number of stacked LSTM layers
output_size = 1  # One output value per sequence
learning_rate = 0.001  # Step size passed to the Adam optimizer
batch_size = 64  # Batch size used by both the training and the test DataLoader
num_epochs = 50  # Number of full passes over the training loader

In [None]:
# Build the LSTM inputs: sequences of shape (num_patients, 49, 42), with set A used
# for training and set C for testing.
df_train = pd.read_parquet('final-data/final-set-a.parquet').fillna(0)
df_test = pd.read_parquet('final-data/final-set-c.parquet').fillna(0).drop(columns=["ICUType"])

# NOTE(review): RecordID and Time are not dropped here, so they are among the columns
# fed to the LSTM — confirm that using the identifier as an input feature is intended.
df = pd.concat([df_train, df_test], ignore_index=True)

seq_len = 49  # rows (timesteps) stored per patient
# The reshape below is only meaningful when each frame consists of complete blocks of
# seq_len rows. NOTE(review): it also assumes rows are ordered by RecordID, then Time — confirm.
assert len(df_train) % seq_len == 0 and len(df_test) % seq_len == 0, "incomplete patient sequences"

# Infer the number of patients instead of hard-coding 8000.
features = df.to_numpy().reshape(-1, seq_len, df.shape[1])
assert features.shape[2] == input_size, "feature count does not match input_size"

y_train = pd.read_parquet('processed-data/processed-outcomes-a.parquet')["In-hospital_death"].to_numpy().flatten()
y_test = pd.read_parquet('processed-data/processed-outcomes-c.parquet')["In-hospital_death"].to_numpy().flatten()
y = np.concatenate((y_train,y_test))
labels = y

# Split exactly at the set-A / set-C boundary. A 50/50 unshuffled split only lands on
# this boundary when both sets contain the same number of patients.
n_train = len(df_train) // seq_len
assert len(y_train) == n_train and len(y_test) == len(features) - n_train, "labels do not match sequences"
X_train, X_test = features[:n_train], features[n_train:]

train_dataset = PatientDataset(X_train, y_train)
test_dataset = PatientDataset(X_test, y_test)

# Shuffle only the training batches; keep the test order fixed for evaluation.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Seed torch's global RNG so weight initialisation and the shuffling order of the
# training DataLoader are repeatable (GPU kernels may still be non-deterministic).
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMModel(input_size, hidden_size, num_layers, output_size).to(device)
# The model returns the raw linear output (no sigmoid) and the evaluation cells apply
# torch.sigmoid themselves, so the loss has to operate on logits. nn.BCELoss expects
# probabilities in [0, 1] and rejects inputs outside that range.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Train for num_epochs passes over train_loader, printing the mean batch loss per epoch.
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    running_loss = 0.0

    # The loop variable is named `targets` so the notebook-level `labels` array is not overwritten.
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        # Forward pass
        outputs = model(inputs)
        # Drop only the trailing output dimension: (batch, 1) -> (batch,). A bare
        # squeeze() would also remove the batch dimension for a single-sample batch
        # and no longer match the shape of `targets`.
        loss = criterion(outputs.squeeze(-1), targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader)}")

In [None]:
# Accuracy on the test loader: sigmoid outputs are rounded to 0/1 and compared with the labels.
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        # Treat the model output as a logit and round its sigmoid to a binary prediction
        predicted = torch.round(torch.sigmoid(outputs))
        batch_hits = (predicted.squeeze() == labels).sum().item()
        correct += batch_hits
        total += labels.size(0)

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy}%")

In [None]:
import torch
from sklearn.metrics import roc_auc_score, average_precision_score

# Ranking metrics on the test loader, computed from sigmoid probabilities.
model.eval()
label_batches, prob_batches = [], []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)

        # Sigmoid turns the model output into a positive-class probability
        probs = torch.sigmoid(outputs).cpu().numpy()
        label_batches.append(labels.cpu().numpy())   # ground-truth labels of this batch
        prob_batches.append(probs)                   # predicted probabilities of this batch

# Stack the per-batch arrays along the sample axis
all_labels = np.concatenate(label_batches)
all_preds = np.concatenate(prob_batches)

# Area under the ROC curve and area under the precision-recall curve
auroc = roc_auc_score(all_labels, all_preds)
auprc = average_precision_score(all_labels, all_preds)

print(f"AUROC: {auroc}")
print(f"AUPRC: {auprc}")

### Q2.3a Transformers (3 Pts)

### Q2.3b Tokenizing Time-Series Data and Transformers (4 Pts)