# Supervised Learning

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import tarfile
import glob
from datetime import datetime
import math
import csv
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

### Q2.1 Classic Machine Learning Methods (5 Pts)
#### Q2.1 1.

In [3]:
# Retrieving the X matrix (can use)
df = pd.read_parquet('imputed-data/imputed-set-a.parquet')
X_normal = df.groupby("RecordID").max(numeric_only=True).reset_index()
X_normal = X_normal.drop(columns=["RecordID"])
X_normal = X_normal[sorted(X_normal.columns)]

df = pd.read_parquet('scaled-data/scaled-set-a.parquet')
X_scaled = df.groupby("RecordID").max(numeric_only=True).reset_index()
X_scaled = X_scaled.drop(columns=["RecordID"])
X_scaled = X_scaled[sorted(X_scaled.columns)]

In [4]:
# Retrieving the label vector
y_df = pd.read_parquet('processed-data/processed-outcomes-a.parquet')
y = y_df["In-hospital_death"].to_numpy().flatten()
y

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [5]:
# Models
# Logistic Regression (best scores using max_iter=4300)
model1 = LogisticRegression(max_iter=4300)
model1.fit(X_normal,y)

# Random Forest
model2 = RandomForestClassifier()
model2.fit(X_normal,y)

# KNN
model3 = KNeighborsClassifier()
model3.fit(X_normal,y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
# Test set C performance

# Loading test set C
df = pd.read_parquet('imputed-data/imputed-set-c.parquet')
X_test = df.groupby("RecordID").mean(numeric_only=True).reset_index()
X_test = X_test.drop(columns=["RecordID"])
X_test= X_test[sorted(X_test.columns)]

y_df = pd.read_parquet('processed-data/processed-outcomes-c.parquet')
y_test = y_df["In-hospital_death"].to_numpy().flatten()

y_pred1 = model1.predict(X_test)
y_pred2 = model2.predict(X_test)
y_pred3 = model3.predict(X_test)

# Calculation of AuROC and AuPRC for Logistic Regression
print("Logistic Regression results")
auroc = roc_auc_score(y_test, y_pred1)
print(f"AUROC: {auroc}")
auprc = average_precision_score(y_test, y_pred1)
print(f"AUPRC: {auprc}")
accuracy = accuracy_score(y_test, y_pred1)
print(f"Accuracy: {accuracy}", end="\n\n")

# Calculation of AuROC and AuPRC for Random Forests
print("Random Forests results")
auroc = roc_auc_score(y_test, y_pred2)
print(f"AUROC: {auroc}")
auprc = average_precision_score(y_test, y_pred2)
print(f"AUPRC: {auprc}")
accuracy = accuracy_score(y_test, y_pred2)
print(f"Accuracy: {accuracy}", end="\n\n")

# Calculation of AuROC and AuPRC for KNN
print("KNN results")
auroc = roc_auc_score(y_test, y_pred3)
print(f"AUROC: {auroc}")
auprc = average_precision_score(y_test, y_pred3)
print(f"AUPRC: {auprc}")
accuracy = accuracy_score(y_test, y_pred3)
print(f"Accuracy: {accuracy}", end="\n\n")

Logistic Regression results
AUROC: 0.7200060066824341
AUPRC: 0.2747037297075142
Accuracy: 0.765

Random Forests results
AUROC: 0.5801979702418941
AUPRC: 0.19447753389141564
Accuracy: 0.825

KNN results
AUROC: 0.5913441203338714
AUPRC: 0.19653379203379204
Accuracy: 0.80775



#### Q2.1 2.

In [7]:
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.feature_extraction import EfficientFCParameters

In [8]:
def time_to_minutes(time_str):
    hour, minute = map(int, time_str.split(':'))
    return (hour * 60 + minute) / 60

In [9]:
extraction_settings = MinimalFCParameters()

# Extracting features of concatenated training and test dataset (need to do this in one go so the feature extraction is consistent)
df_train = pd.read_parquet('imputed-data/imputed-set-a.parquet')
df_test = pd.read_parquet('imputed-data/imputed-set-c.parquet')
df = pd.concat([df_train, df_test], ignore_index=True)
numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].apply(lambda col: col.fillna(col.median()))
df['Time'] = df['Time'].apply(time_to_minutes)
X = extract_features(df, column_id='RecordID', column_sort='Time', default_fc_parameters=extraction_settings, impute_function=impute)

Feature Extraction: 100%|██████████| 40/40 [00:52<00:00,  1.30s/it]


In [10]:
y_train = pd.read_parquet('processed-data/processed-outcomes-a.parquet')["In-hospital_death"].to_numpy().flatten()
y_test = pd.read_parquet('processed-data/processed-outcomes-c.parquet')["In-hospital_death"].to_numpy().flatten()
y = np.concatenate((y_train,y_test))

In [11]:
X_full_train, X_full_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=False)
X_filtered_train = select_features(X_full_train, y_train)
X_filtered_train.head()
X_filtered_train, X_filtered_test = X_full_train[X_filtered_train.columns], X_full_test[X_filtered_train.columns]

In [12]:
# Models
# Logistic Regression (best scores using max_iter=4300)
model1 = LogisticRegression(max_iter=4300)
model1.fit(X_full_train,y_train)

# Random Forest
model2 = RandomForestClassifier()
model2.fit(X_full_train,y_train)

# KNN
model3 = KNeighborsClassifier()
model3.fit(X_full_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
y_pred1 = model1.predict(X_full_test)
y_pred2 = model2.predict(X_full_test)
y_pred3 = model3.predict(X_full_test)

# Calculation of AuROC and AuPRC for Logistic Regression
print("Logistic Regression results")
auroc = roc_auc_score(y_test, y_pred1)
print(f"AUROC: {auroc}")
auprc = average_precision_score(y_test, y_pred1)
print(f"AUPRC: {auprc}")
accuracy = accuracy_score(y_test, y_pred1)
print(f"Accuracy: {accuracy}", end="\n\n")

# Calculation of AuROC and AuPRC for Random Forests
print("Random Forests results")
auroc = roc_auc_score(y_test, y_pred2)
print(f"AUROC: {auroc}")
auprc = average_precision_score(y_test, y_pred2)
print(f"AUPRC: {auprc}")
accuracy = accuracy_score(y_test, y_pred2)
print(f"Accuracy: {accuracy}", end="\n\n")

# Calculation of AuROC and AuPRC for KNN
print("KNN results")
auroc = roc_auc_score(y_test, y_pred3)
print(f"AUROC: {auroc}")
auprc = average_precision_score(y_test, y_pred3)
print(f"AUPRC: {auprc}")
accuracy = accuracy_score(y_test, y_pred3)
print(f"Accuracy: {accuracy}", end="\n\n")

Logistic Regression results
AUROC: 0.575197407115416
AUPRC: 0.21951755275484086
Accuracy: 0.86

Random Forests results
AUROC: 0.5656217542015493
AUPRC: 0.21786035601825077
Accuracy: 0.863

KNN results
AUROC: 0.53031372401797
AUPRC: 0.16606463675213673
Accuracy: 0.84625



#### Q2.1 3.
Weirdly enough, Logistic Regression performed the best even in the extracting features method. It might be noted that while accuracy is quite high for all classifiers, they all have a relatively low AUROC score, except for Logistic Regression when not extracting features.

### Q2.2 Recurrent Neural Networks (4 Pts)

LSTM approach

In [35]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [None]:
class PatientDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)  # Shape: (num_patients, 49, 42)
        self.labels = torch.tensor(labels, dtype=torch.float32)      # Shape: (num_patients,)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [None]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        lstm_out, (h_n, c_n) = self.lstm(x)
        # Last timestep output (maybe change this to aggregation)
        out = self.fc(lstm_out[:, -1, :])  
        return out

In [None]:
# Hyperparameters
input_size = 42 
hidden_size = 512  # Size of the LSTM hidden layer
num_layers = 2
output_size = 1
learning_rate = 0.01
batch_size = 64
num_epochs = 50

In [58]:
# Assume you have the data as numpy arrays: features (4000, 49, 41) and labels (4000,)
df_train = pd.read_parquet('scaled-data/scaled-set-a.parquet')
df_test = pd.read_parquet('scaled-data/scaled-set-c.parquet')
df = pd.concat([df_train, df_test], ignore_index=True)
numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].apply(lambda col: col.fillna(col.median()))
df = df.drop(columns=["RecordID"])
df['Time'] = df['Time'].apply(time_to_minutes)
df = df.to_numpy()
features = df.reshape(8000, 49, 42)

y_train = pd.read_parquet('processed-data/processed-outcomes-a.parquet')["In-hospital_death"].to_numpy().flatten()
y_test = pd.read_parquet('processed-data/processed-outcomes-c.parquet')["In-hospital_death"].to_numpy().flatten()
y = np.concatenate((y_train,y_test))
labels = y

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.5, shuffle=False)

train_dataset = PatientDataset(X_train, y_train)
test_dataset = PatientDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [53]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMModel(input_size, hidden_size, num_layers, output_size).to(device)
criterion = nn.BCEWithLogitsLoss()  # Binary Cross-Entropy with logits
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [54]:
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    running_loss = 0.0
    
    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader)}")

Epoch [1/50], Loss: 0.43228728761748664
Epoch [2/50], Loss: 0.40827573812197127
Epoch [3/50], Loss: 0.42071858355923303
Epoch [4/50], Loss: 0.4086183716380407
Epoch [5/50], Loss: 0.4115395834521642
Epoch [6/50], Loss: 0.4135092927349938
Epoch [7/50], Loss: 0.41392394711100866
Epoch [8/50], Loss: 0.40767944142931983
Epoch [9/50], Loss: 0.40826429710501716
Epoch [10/50], Loss: 0.41173911709634087
Epoch [11/50], Loss: 0.40925208822129266
Epoch [12/50], Loss: 0.41431342467429144
Epoch [13/50], Loss: 0.4092601260968617
Epoch [14/50], Loss: 0.40590367052290177
Epoch [15/50], Loss: 0.41040246779956513
Epoch [16/50], Loss: 0.4077978020622617
Epoch [17/50], Loss: 0.40864242849841953
Epoch [18/50], Loss: 0.40970384885394384
Epoch [19/50], Loss: 0.4109579400410728
Epoch [20/50], Loss: 0.4062851296057777
Epoch [21/50], Loss: 0.41201475309947183
Epoch [22/50], Loss: 0.412938275980571
Epoch [23/50], Loss: 0.40856470143984236
Epoch [24/50], Loss: 0.4082251905448853
Epoch [25/50], Loss: 0.411178142068

In [55]:
model.eval()  # Set model to evaluation mode
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        predicted = torch.round(torch.sigmoid(outputs))  # Convert logits to binary predictions
        total += labels.size(0)
        correct += (predicted.squeeze() == labels).sum().item()

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy}%")

Test Accuracy: 85.375%


### Q2.3a Transformers (3 Pts)

### Q2.3b Tokenizing Time-Series Data and Transformers (4 Pts)