# Supervised Learning

In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import MinimalFCParameters
from sklearn.preprocessing import StandardScaler

### Q2.1 Classic Machine Learning Methods (5 Pts)
#### Q2.1 1.

In [2]:
# Retrieving the X matrix (can use)
df = pd.read_parquet('final-data/final-set-a.parquet')
df = df.fillna(0)
X = df.groupby("RecordID").last(numeric_only=True).reset_index()
X = X.drop(columns=["RecordID"])
X = X[sorted(X.columns)]

In [3]:
# Retrieving the label vector
y_df = pd.read_parquet('processed-data/processed-outcomes-a.parquet')
y = y_df["In-hospital_death"].to_numpy().flatten()
print(y.sum())
print(len(y))

554
4000


Small note: We have observed that there is a class imbalance in the dataset. Out of 4000 entries, only 554 contain a 1, whereas the rest consists of 0's. In Q2.1 1. We compare results from taking class imbalance into account (using SMOTE) and simply igroning it (without SMOTE).

In [4]:
# Taking class imbalance into account (there are far more 1's than 0's in the "In-hospital_death" column)

from imblearn.over_sampling import SMOTE
import numpy as np

# Initialize SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)

# Apply SMOTE on the entire dataset
X_resampled, y_resampled = smote.fit_resample(X, y)

In [5]:
# Models (WITH taking class imbalance into account)
# Logistic Regression
model1 = LogisticRegression(max_iter=500)
model1.fit(X_resampled,y_resampled)

# Random Forest
model2 = RandomForestClassifier()
model2.fit(X_resampled,y_resampled)

# KNN
model3 = KNeighborsClassifier()
model3.fit(X_resampled,y_resampled)

In [6]:
# Models (WITHOUT taking class imbalance into account)
# Logistic Regression
model4 = LogisticRegression(max_iter=4300)
model4.fit(X,y)

# Random Forest
model5 = RandomForestClassifier()
model5.fit(X,y)

# KNN
model6 = KNeighborsClassifier()
model6.fit(X,y)

In [7]:
# Test set C performance (WITH SMOTE)

# Loading test set C
df = pd.read_parquet('final-data/final-set-c.parquet')
df = df.fillna(0)
df = df.drop(columns=["ICUType"])
X_test = df.groupby("RecordID").last(numeric_only=True).reset_index()
X_test = X_test.drop(columns=["RecordID"])
X_test= X_test[sorted(X_test.columns)]

y_df = pd.read_parquet('processed-data/processed-outcomes-c.parquet')
y_test = y_df["In-hospital_death"].to_numpy().flatten()

y_pred1 = model1.predict_proba(X_test)[:,1]
y_pred2 = model2.predict_proba(X_test)[:,1]
y_pred3 = model3.predict_proba(X_test)[:,1]

# Calculation of AuROC and AuPRC for Logistic Regression
print("Logistic Regression results WITH SMOTE")
auroc = roc_auc_score(y_test, y_pred1)
print(f"AUROC: {auroc}")
auprc = average_precision_score(y_test, y_pred1)
print(f"AUPRC: {auprc}", end="\n\n")

# Calculation of AuROC and AuPRC for Random Forests
print("Random Forests results WITH SMOTE")
auroc = roc_auc_score(y_test, y_pred2)
print(f"AUROC: {auroc}")
auprc = average_precision_score(y_test, y_pred2)
print(f"AUPRC: {auprc}", end="\n\n")

# Calculation of AuROC and AuPRC for KNN
print("KNN results WITH SMOTE")
auroc = roc_auc_score(y_test, y_pred3)
print(f"AUROC: {auroc}")
auprc = average_precision_score(y_test, y_pred3)
print(f"AUPRC: {auprc}", end="\n\n")

Logistic Regression results WITH SMOTE
AUROC: 0.8427190249152182
AUPRC: 0.49761338735781435

Random Forests results WITH SMOTE
AUROC: 0.8353615897686175
AUPRC: 0.45286384600588625

KNN results WITH SMOTE
AUROC: 0.7509709551876463
AUPRC: 0.31074860527554427



In [8]:
# Test set C performance (WITHOUT SMOTE)

# Loading test set C
df = pd.read_parquet('final-data/final-set-c.parquet')
df = df.fillna(0)
df = df.drop(columns=["ICUType"])
X_test = df.groupby("RecordID").last(numeric_only=True).reset_index()
X_test = X_test.drop(columns=["RecordID"])
X_test= X_test[sorted(X_test.columns)]

y_df = pd.read_parquet('processed-data/processed-outcomes-c.parquet')
y_test = y_df["In-hospital_death"].to_numpy().flatten()

y_pred1 = model4.predict_proba(X_test)[:,1]
y_pred2 = model5.predict_proba(X_test)[:,1]
y_pred3 = model6.predict_proba(X_test)[:,1]

# Calculation of AuROC and AuPRC for Logistic Regression
print("Logistic Regression results WITHOUT SMOTE")
auroc = roc_auc_score(y_test, y_pred1)
print(f"AUROC: {auroc}")
auprc = average_precision_score(y_test, y_pred1)
print(f"AUPRC: {auprc}", end="\n\n")

# Calculation of AuROC and AuPRC for Random Forests
print("Random Forests results WITHOUT SMOTE")
auroc = roc_auc_score(y_test, y_pred2)
print(f"AUROC: {auroc}")
auprc = average_precision_score(y_test, y_pred2)
print(f"AUPRC: {auprc}", end="\n\n")

# Calculation of AuROC and AuPRC for KNN
print("KNN results WITHOUT SMOTE")
auroc = roc_auc_score(y_test, y_pred3)
print(f"AUROC: {auroc}")
auprc = average_precision_score(y_test, y_pred3)
print(f"AUPRC: {auprc}", end="\n\n")

Logistic Regression results WITHOUT SMOTE
AUROC: 0.8460812654077661
AUPRC: 0.5059308870423979

Random Forests results WITHOUT SMOTE
AUROC: 0.8340791630689142
AUPRC: 0.49815259826665925

KNN results WITHOUT SMOTE
AUROC: 0.7235331806634882
AUPRC: 0.32825811926007525



#### Q2.1 2.

In [9]:
extraction_settings = MinimalFCParameters()

# Extracting features of concatenated training and test dataset (need to do this in one go so the feature extraction is consistent)
df_train = pd.read_parquet('final-data/final-set-a.parquet')
df_test = pd.read_parquet('final-data/final-set-c.parquet').drop(columns=["ICUType"])

print(df_train.shape)
print(df_test.shape)

df_train = df_train.fillna(0)
df_test = df_test.fillna(0)

X_train = extract_features(df_train, column_id='RecordID', column_sort='Time', default_fc_parameters=extraction_settings, impute_function=impute)
X_test = extract_features(df_test, column_id='RecordID', column_sort='Time', default_fc_parameters=extraction_settings, impute_function=impute)

X_train= X_train[sorted(X_train.columns)]
X_test= X_test[sorted(X_test.columns)]

(196000, 42)
(196000, 42)


Feature Extraction: 100%|██████████| 40/40 [00:31<00:00,  1.28it/s]
Feature Extraction: 100%|██████████| 40/40 [00:28<00:00,  1.42it/s]


In [10]:
y_train = pd.read_parquet('processed-data/processed-outcomes-a.parquet')["In-hospital_death"].to_numpy().flatten()
y_test = pd.read_parquet('processed-data/processed-outcomes-c.parquet')["In-hospital_death"].to_numpy().flatten()

In [11]:
# Models
# Logistic Regression
model1 = LogisticRegression(max_iter=10000)
model1.fit(X_train,y_train)

# Random Forest
model2 = RandomForestClassifier()
model2.fit(X_train,y_train)

# KNN
model3 = KNeighborsClassifier()
model3.fit(X_train,y_train)

In [12]:
y_pred1 = model1.predict_proba(X_test)[:,1]
y_pred2 = model2.predict_proba(X_test)[:,1]
y_pred3 = model3.predict_proba(X_test)[:,1]

# Calculation of AuROC and AuPRC for Logistic Regression
print("Logistic Regression results")
auroc = roc_auc_score(y_test, y_pred1)
print(f"AUROC: {auroc}")
auprc = average_precision_score(y_test, y_pred1)
print(f"AUPRC: {auprc}", end="\n\n")

# Calculation of AuROC and AuPRC for Random Forests
print("Random Forests results")
auroc = roc_auc_score(y_test, y_pred2)
print(f"AUROC: {auroc}")
auprc = average_precision_score(y_test, y_pred2)
print(f"AUPRC: {auprc}", end="\n\n")

# Calculation of AuROC and AuPRC for KNN
print("KNN results")
auroc = roc_auc_score(y_test, y_pred3)
print(f"AUROC: {auroc}")
auprc = average_precision_score(y_test, y_pred3)
print(f"AUPRC: {auprc}", end="\n\n")

Logistic Regression results
AUROC: 0.8192384027230294
AUPRC: 0.4438866902255371

Random Forests results
AUROC: 0.8172892342731288
AUPRC: 0.48000540085246357

KNN results
AUROC: 0.6948109772121486
AUPRC: 0.2946395655359523



### Q2.2 Recurrent Neural Networks (4 Pts)

LSTM approach

In [3]:
# Run in case CUDA_LAUNCH_BLOCKING error occurs
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [4]:
class PatientDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [15]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=0.7)
        self.bn = nn.LayerNorm(hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        lstm_out, (h_n, c_n) = self.lstm(x)
        # Extracting output of last timestep
        lstm_out = lstm_out[:, -1, :]

        lstm_out = self.bn(lstm_out)

        out = self.fc(lstm_out)  
        return out
    
class BidirectionalLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(BidirectionalLSTM, self).__init__()
        
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=0.7, bidirectional=True)

        # Adjusted because of bidirectional LSTM
        self.bn = nn.LayerNorm(hidden_size * 2)  
        self.fc = nn.Linear(hidden_size * 2, output_size)
        
    def forward(self, x):
        lstm_out, (h_n, c_n) = self.lstm(x)
        # Extracting output of last timestep
        lstm_out = lstm_out[:, -1, :]

        lstm_out = self.bn(lstm_out)

        out = self.fc(lstm_out)
        return out

In [6]:
# Hyperparameters
input_size = 41
hidden_size = 256
num_layers = 8
output_size = 1
learning_rate = 0.001
batch_size = 64
num_epochs = 10

##### NORMAL LSTM
# AUROC 0.8204 (Only set a)
""" input_size = 41
hidden_size = 256
num_layers = 8
output_size = 1
learning_rate = 0.0005
batch_size = 64
num_epochs = 10 """

# AUROC 0.7483 (set a and set b)
""" input_size = 41
hidden_size = 256
num_layers = 8
output_size = 1
learning_rate = 0.0005
batch_size = 64
num_epochs = 10 """

##### BIDIRECTIONAL LSTM
# AUROC 0.8226 (Only set a)
""" input_size = 41
hidden_size = 256
num_layers = 8
output_size = 1
learning_rate = 0.0003
batch_size = 64
num_epochs = 3 """

# AUROC 0.7773 (set a and set b)
""" input_size = 41
hidden_size = 256
num_layers = 8
output_size = 1
learning_rate = 0.001
batch_size = 64
num_epochs = 2 """

' input_size = 41\nhidden_size = 256\nnum_layers = 8\noutput_size = 1\nlearning_rate = 0.001\nbatch_size = 64\nnum_epochs = 2 '

In [7]:
# NOTE: In the scaled-data set the time column is not scaled. However, due to better performance, we still scale it for this application. RecordID can be dropped, since it doesn't convey any further information and each patient is assigned one of the 4000 dimensions.
scaler = StandardScaler()

# Training Data
df = pd.read_parquet('final-data/final-set-a.parquet')
X = df.fillna(0)
X = X.groupby("RecordID").tail(49).reset_index(drop=True)
X = X.sort_values(by="RecordID", ascending=True)
X["Time"] = scaler.fit_transform(X[["Time"]])
X = X.drop(columns=["RecordID"])
X = X[sorted(X.columns)]
X = X.to_numpy()
X = X.reshape(4000,49,41)

y_df = pd.read_parquet('processed-data/processed-outcomes-a.parquet')
y = y_df["In-hospital_death"].to_numpy().flatten()

train_dataset = PatientDataset(X, y)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


# Validation data
df = pd.read_parquet('final-data/final-set-b.parquet')
X = df.fillna(0)
X = X.groupby("RecordID").tail(49).reset_index(drop=True)
X = X.sort_values(by="RecordID", ascending=True)
X["Time"] = scaler.fit_transform(X[["Time"]])
X = X.drop(columns=["RecordID"]).drop(columns=["ICUType"])
X = X[sorted(X.columns)]
X = X.to_numpy()
X = X.reshape(4000,49,41)

y_df = pd.read_parquet('processed-data/processed-outcomes-b.parquet')
y = y_df["In-hospital_death"].to_numpy().flatten()

val_dataset = PatientDataset(X, y)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)


# Test data
df = pd.read_parquet('final-data/final-set-c.parquet')
X = df.fillna(0)
X = X.groupby("RecordID").tail(49).reset_index(drop=True)
X = X.sort_values(by="RecordID", ascending=True)
X["Time"] = scaler.fit_transform(X[["Time"]])
X = X.drop(columns=["RecordID"]).drop(columns=["ICUType"])
X = X[sorted(X.columns)]
X = X.to_numpy()
X = X.reshape(4000,49,41)

y_df = pd.read_parquet('processed-data/processed-outcomes-c.parquet')
y = y_df["In-hospital_death"].to_numpy().flatten()

test_dataset = PatientDataset(X, y)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)


# Full training (use after having found best model with lowest valildation error)
df = pd.read_parquet('final-data/final-set-a.parquet')
X1 = df.fillna(0)
X1 = X1.groupby("RecordID").tail(49).reset_index(drop=True)
X1 = X1.sort_values(by="RecordID", ascending=True)
X1["Time"] = scaler.fit_transform(X1[["Time"]])
X1 = X1[sorted(X1.columns)]
df = pd.read_parquet('final-data/final-set-b.parquet')
X2 = df.fillna(0)
X2 = X2.groupby("RecordID").tail(49).reset_index(drop=True)
X2 = X2.sort_values(by="RecordID", ascending=True)
X2["Time"] = scaler.fit_transform(X2[["Time"]])
X2 = X2.drop(columns=["ICUType"])
X2 = X2[sorted(X2.columns)]

X_full = pd.concat([X1, X2], axis=0).drop(columns="RecordID")

X_full = X_full.to_numpy()
X_full = X_full.reshape(8000,49,41)

y1 = pd.read_parquet('processed-data/processed-outcomes-b.parquet')
y1 = y1["In-hospital_death"].to_numpy().flatten()
y2 = pd.read_parquet('processed-data/processed-outcomes-b.parquet')
y2 = y2["In-hospital_death"].to_numpy().flatten()

y_full = np.concatenate((y1,y2), axis=0)

full_dataset = PatientDataset(X_full, y_full)
full_loader = DataLoader(full_dataset, batch_size=batch_size, shuffle=True)

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Uncomment/comment line depending on if you want to use normal LSTM or bidirectional LSTM
#model = LSTMModel(input_size, hidden_size, num_layers, output_size).to(device)
model = BidirectionalLSTM(input_size, hidden_size, num_layers, output_size).to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [19]:
# Training loop which prints current training loss and validation loss

# NOTE: This is for validation error analysis

total_val_loss = 0.0
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    train_loss = running_loss / len(train_loader)

    # Validation phase
    model.eval() 
    val_loss = 0.0
    
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), labels)
            val_loss += loss.item()
    
    val_loss /= len(test_loader)
    total_val_loss += val_loss
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

total_val_loss /= num_epochs
print(f"Avg val loss: {total_val_loss:.4f}")

Epoch [1/10], Loss: 0.4141, Val Loss: 0.3589
Epoch [2/10], Loss: 0.3566, Val Loss: 0.3524
Epoch [3/10], Loss: 0.3345, Val Loss: 0.3512
Epoch [4/10], Loss: 0.3228, Val Loss: 0.3463
Epoch [5/10], Loss: 0.3385, Val Loss: 0.3586
Epoch [6/10], Loss: 0.3248, Val Loss: 0.3668
Epoch [7/10], Loss: 0.3261, Val Loss: 0.3733
Epoch [8/10], Loss: 0.3235, Val Loss: 0.3599
Epoch [9/10], Loss: 0.3223, Val Loss: 0.3418
Epoch [10/10], Loss: 0.3126, Val Loss: 0.3853
Avg val loss: 0.3595


In [20]:
# Training loop which prints current training loss and AUROC/AUPRC

# NOTE: Change train_loader with full_loader if we want to train the model on set a and set b (after validation error analysis)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    train_loss = running_loss / len(train_loader)

    # AUROC & AUPRC calculation
    model.eval()
    all_labels = []
    all_probs = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)  # Raw logits
            probs = torch.sigmoid(outputs).squeeze()  # Convert logits to probabilities (since we use BCEWithLogitsLoss() loss function)

            all_labels.extend(labels.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())

    auroc = roc_auc_score(all_labels, all_probs)
    auprc = average_precision_score(all_labels, all_probs)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}, Test AUROC: {auroc:.4f}, Test AUPRC: {auprc:.4f}")

Epoch [1/10], Loss: 0.3170, Test AUROC: 0.7778, Test AUPRC: 0.4243
Epoch [2/10], Loss: 0.3051, Test AUROC: 0.7917, Test AUPRC: 0.4183
Epoch [3/10], Loss: 0.3004, Test AUROC: 0.8007, Test AUPRC: 0.4530
Epoch [4/10], Loss: 0.2999, Test AUROC: 0.7992, Test AUPRC: 0.4403
Epoch [5/10], Loss: 0.2997, Test AUROC: 0.8037, Test AUPRC: 0.4468
Epoch [6/10], Loss: 0.2896, Test AUROC: 0.7971, Test AUPRC: 0.4304
Epoch [7/10], Loss: 0.2913, Test AUROC: 0.7897, Test AUPRC: 0.4260
Epoch [8/10], Loss: 0.2969, Test AUROC: 0.8008, Test AUPRC: 0.4444
Epoch [9/10], Loss: 0.2985, Test AUROC: 0.7943, Test AUPRC: 0.4298
Epoch [10/10], Loss: 0.2890, Test AUROC: 0.7945, Test AUPRC: 0.4304


### Q2.3a Transformers (3 Pts) (MUCH FASTER TO TRAIN)

In [16]:
class Transformer(nn.Module):
    def __init__(self, input_size, num_classes, d_model=128, num_heads=8, num_layers=3, dim_feedforward=512, dropout=0.1):
        super(Transformer, self).__init__()
        
        self.embedding = nn.Linear(input_size, d_model)
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, # Dimensionality of embeddings
            nhead=num_heads, 
            dim_feedforward=dim_feedforward, 
            dropout=dropout, 
            batch_first=True
        )
        
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        # FC layer for classification
        self.fc = nn.Linear(d_model, num_classes)
    
    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer_encoder(x)
        x = x.mean(dim=1)
        x = self.fc(x).squeeze()
        return x

In [22]:
class Transformer(nn.Module):
    def __init__(self, input_size, num_classes, num_heads=8, num_layers=3, dim_feedforward=512, dropout=0.1):
        super(Transformer, self).__init__()

        # Set d_model equal to input_size to avoid embedding projection
        d_model = input_size  

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, 
            nhead=num_heads, 
            dim_feedforward=dim_feedforward, 
            dropout=dropout, 
            batch_first=True
        )

        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # FC layer for classification
        self.fc = nn.Linear(d_model, num_classes)
    
    def forward(self, x):
        x = self.transformer_encoder(x)  # Process directly without embedding
        x = x.mean(dim=1)  # Global pooling over time steps
        x = self.fc(x).squeeze()  # Final classification layer
        return x

In [None]:
# Model hyperparameters
input_size = 41
num_classes = 1 
d_model = 128
num_heads = 41
num_layers = 8
dim_feedforward = 512
dropout = 0.1
learning_rate = 0.0004
num_epochs = 25

# AUROC 0.8396 (Only set a)
""" input_size = 41
num_classes = 1 
d_model = 128
num_heads = 16
num_layers = 8
dim_feedforward = 512
dropout = 0.1
learning_rate = 0.0005
num_epochs = 6 """

# AUROC 0.8383 (set a and set b)
""" input_size = 41
num_classes = 1 
d_model = 128
num_heads = 8
num_layers = 4 
dim_feedforward = 512
dropout = 0.1
learning_rate = 0.0005
num_epochs = 4 """

' input_size = 41\nnum_classes = 1 \nd_model = 128\nnum_heads = 8\nnum_layers = 4 \ndim_feedforward = 512\ndropout = 0.1\nnum_epochs = 4 '

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model = Transformer(input_size, num_classes, d_model, num_heads, num_layers, dim_feedforward, dropout).to(device)
model = Transformer(input_size, num_classes, num_heads, num_layers, dim_feedforward, dropout).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([7.0]).to(device))  # Handle class imbalance
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-5)



In [None]:
# Training loop which prints current training loss and AUROC/AUPRC

# NOTE: Change train_loader with full_loader if we want to train the model on set a and set b (after validation error analysis)

for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    running_loss = 0.0
    
    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    train_loss = running_loss / len(train_loader)

    # Validation phase
    model.eval()  # Set model to evaluation mode
    all_labels = []
    all_probs = []  # Store probabilities for AUROC & AUPRC calculation

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)  # Raw logits
            probs = torch.sigmoid(outputs).squeeze()  # Convert logits to probabilities

            all_labels.extend(labels.cpu().numpy())  # Store true labels
            all_probs.extend(probs.cpu().numpy())  # Store predicted probabilities

    # Compute AUROC & AUPRC
    auroc = roc_auc_score(all_labels, all_probs)
    auprc = average_precision_score(all_labels, all_probs)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}, Test AUROC: {auroc:.4f}, Test AUPRC: {auprc:.4f}")

Epoch [1/25], Loss: 1.0599, Test AUROC: 0.8044, Test AUPRC: 0.4032
Epoch [2/25], Loss: 0.9635, Test AUROC: 0.8168, Test AUPRC: 0.4210
Epoch [3/25], Loss: 0.9327, Test AUROC: 0.8178, Test AUPRC: 0.4284
Epoch [4/25], Loss: 0.8732, Test AUROC: 0.8176, Test AUPRC: 0.4377
Epoch [5/25], Loss: 0.8437, Test AUROC: 0.8193, Test AUPRC: 0.4205
Epoch [6/25], Loss: 0.8143, Test AUROC: 0.8211, Test AUPRC: 0.4162
Epoch [7/25], Loss: 0.8073, Test AUROC: 0.8213, Test AUPRC: 0.4341
Epoch [8/25], Loss: 0.7403, Test AUROC: 0.8224, Test AUPRC: 0.4598
Epoch [9/25], Loss: 0.7095, Test AUROC: 0.8220, Test AUPRC: 0.4323
Epoch [10/25], Loss: 0.6718, Test AUROC: 0.8205, Test AUPRC: 0.4547
Epoch [11/25], Loss: 0.6524, Test AUROC: 0.8023, Test AUPRC: 0.3812
Epoch [12/25], Loss: 0.6210, Test AUROC: 0.8082, Test AUPRC: 0.4185
Epoch [13/25], Loss: 0.6175, Test AUROC: 0.8135, Test AUPRC: 0.4516


### Q2.3b Tokenizing Time-Series Data and Transformers (4 Pts)