In [1]:
import os
import sys
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve,
    confusion_matrix,
    classification_report,
)

# Resolve the project root as the parent of the current working directory.
# This only works when the kernel's cwd is the evaluation/ folder; from any
# other location the project-local imports that follow will not resolve.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append so re-running this cell does not stack duplicate entries
# onto sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from utils.mura_dataset import MURADataset
from utils.transforms import get_train_transforms, get_val_transforms

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)



Using device: cpu


In [2]:
# Loader over the held-out evaluation split
def get_test_loader(batch_size: int = 32):
    """Build a non-shuffled DataLoader over the labelled validation studies.

    The split CSV is ``data/splits/valid_labeled_studies.csv`` under
    ``PROJECT_ROOT``; ``data/raw`` is handed to ``MURADataset`` as ``root_dir``
    and ``get_val_transforms()`` as its transform. The dataset size is printed
    as a side effect.

    Args:
        batch_size: Number of samples per batch.

    Returns:
        The ``DataLoader`` wrapping the dataset, with ``shuffle=False``.
    """
    image_root = os.path.join(PROJECT_ROOT, "data", "raw")
    split_csv = os.path.join(PROJECT_ROOT, "data", "splits", "valid_labeled_studies.csv")

    dataset = MURADataset(csv_file=split_csv, transform=get_val_transforms(), root_dir=image_root)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    print("Test samples:", len(dataset))
    return loader

test_loader = get_test_loader(32)


Test samples: 3197


In [3]:
# Baseline CNN definition (same as Su's training code)
class BaselineCNN(nn.Module):
    """Two-convolution binary classifier that emits one probability per image.

    The first linear layer expects 32 channels at 56x56, which corresponds to
    a single-channel 224x224 input halved by each of the two pooling steps.
    """

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are kept fixed: they determine the
        # state_dict keys and the order in which parameters are initialised.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2)

        # Two 2x poolings take each side 224 -> 112 -> 56, with 32 feature maps
        self.fc1 = nn.Linear(in_features=32 * 56 * 56, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=1)

    def forward(self, x):
        """Return sigmoid probabilities of shape [B, 1] for the batch ``x``."""
        features = self.conv1(x)
        features = self.pool(torch.relu(features))    # [B, 16, 112, 112] for 224x224 input
        features = self.conv2(features)
        features = self.pool(torch.relu(features))    # [B, 32, 56, 56]
        batch = features.size(0)
        flat = features.view(batch, -1)               # one row per sample
        hidden = torch.relu(self.fc1(flat))
        logits = self.fc2(hidden)
        return torch.sigmoid(logits)                  # prob in [0, 1]



In [None]:
# Paths and helper to evaluate one model on the test set

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve,
    confusion_matrix,
    classification_report,
)
import torch

# Weights and training-history locations for each model (adjust file names if needed).
# Each path component is passed separately so os.path.join inserts the
# platform's own separator instead of mixing it with a hard-coded "/".
BASELINE_WEIGHTS_PATH = os.path.join(PROJECT_ROOT, "models", "baseline_cnn.pt")
BASELINE_HISTORY_PATH = os.path.join(PROJECT_ROOT, "evaluation", "baseline_history.npz")

MODEL10_WEIGHTS_PATH = os.path.join(PROJECT_ROOT, "models", "model1_10ep.pt")
MODEL10_HISTORY_PATH = os.path.join(PROJECT_ROOT, "evaluation", "model1_10ep_history.npz")


def load_history(path):
    """Load a .npz training history file.

    Args:
        path: Location of the ``.npz`` archive. It must contain the arrays
            ``train_loss``, ``val_acc``, ``val_auc`` and the single-valued
            entries ``test_acc`` and ``test_auc``.

    Returns:
        dict with the arrays under "train_loss", "val_acc" and "val_auc", and
        the two test metrics as Python floats under "test_acc_hist" and
        "test_auc_hist".

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        KeyError: If one of the expected entries is missing from the archive.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"History file not found: {path}")
    # np.load returns an NpzFile that keeps the archive open; the context
    # manager closes it once every entry has been read into memory.
    with np.load(path) as archive:
        return {
            "train_loss": archive["train_loss"],
            "val_acc":   archive["val_acc"],
            "val_auc":   archive["val_auc"],
            # .item() accepts both 0-d and one-element arrays, whereas float()
            # on a one-element 1-d array is deprecated in recent NumPy.
            "test_acc_hist": float(archive["test_acc"].item()),
            "test_auc_hist": float(archive["test_auc"].item()),
        }


def evaluate_checkpoint(label, weights_path, history_path, batch_size=32, model_cls=None):
    """
    Load weights + history for one model, run it on the test set,
    and compute metrics + ROC + confusion matrix.

    Parameters
    ----------
    label : str
        Display name printed in the header and returned under "label".
    weights_path : str
        Path to a state_dict checkpoint readable by ``torch.load``.
    history_path : str
        Path to the ``.npz`` training history read by ``load_history``.
    batch_size : int, default 32
        Batch size passed to ``get_test_loader``.
    model_cls : callable, optional
        Zero-argument callable returning the ``nn.Module`` the weights are
        loaded into. Defaults to ``BaselineCNN``. Pass the matching
        architecture for checkpoints trained with a different network;
        otherwise ``load_state_dict`` will reject the state dict.

    Returns
    -------
    dict
        Keys "label", "hist", "test_acc", "test_prec", "test_rec", "test_f1",
        "test_auc", "cm" (2x2, rows are the true class in the order 0, 1),
        "roc_fpr" and "roc_tpr".

    Raises
    ------
    FileNotFoundError
        If the history file or the weights file does not exist.
    ValueError
        If the test loader yields no samples.
    """
    print(f"\n===== Evaluating {label} =====")
    print(f"  Weights: {weights_path}")
    print(f"  History: {history_path}")

    # Load training history (raises FileNotFoundError when it is missing)
    hist = load_history(history_path)

    # Fail before the dataset is built if the checkpoint is missing
    if not os.path.exists(weights_path):
        raise FileNotFoundError(f"Weights file not found: {weights_path}")

    # Build DataLoader for the test set
    test_loader = get_test_loader(batch_size=batch_size)

    # Build the model and load its weights
    if model_cls is None:
        model_cls = BaselineCNN
    model = model_cls().to(device)
    # NOTE(security): torch.load unpickles the file, so only load checkpoints
    # from a trusted source (or pass weights_only=True on PyTorch versions
    # that support it).
    state = torch.load(weights_path, map_location=device)
    model.load_state_dict(state)
    model.eval()

    # Forward pass on the test set, collecting one flat array per batch
    label_batches = []
    prob_batches = []
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            probs = model(images)  # predicted probabilities
            prob_batches.append(probs.cpu().numpy().reshape(-1))
            label_batches.append(labels.cpu().numpy().reshape(-1))

    if not prob_batches:
        raise ValueError("Test loader produced no samples; cannot compute metrics.")

    # Labels are treated as the binary classes 0/1 named in the report below
    all_labels = np.concatenate(label_batches).astype(int)
    all_probs = np.concatenate(prob_batches)

    # Binary predictions with 0.5 threshold
    y_pred = (all_probs >= 0.5).astype(int)

    # Metrics. zero_division=0 keeps the score at 0 without a warning when a
    # class is never predicted; labels=[0, 1] pins the matrix and the report to
    # both classes even if one of them is absent from the predictions.
    acc = accuracy_score(all_labels, y_pred)
    prec = precision_score(all_labels, y_pred, zero_division=0)
    rec = recall_score(all_labels, y_pred, zero_division=0)
    f1 = f1_score(all_labels, y_pred, zero_division=0)
    auc_val = roc_auc_score(all_labels, all_probs)
    fpr, tpr, _ = roc_curve(all_labels, all_probs)
    cm = confusion_matrix(all_labels, y_pred, labels=[0, 1])
    report = classification_report(
        all_labels,
        y_pred,
        labels=[0, 1],
        target_names=["Normal (0)", "Abnormal (1)"],
        zero_division=0,
    )

    print(f"Test Accuracy (from history):   {hist['test_acc_hist']:.4f}")
    print(f"Test AUC (from history):        {hist['test_auc_hist']:.4f}")
    print("--- Recomputed on test set ---")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print(f"ROC AUC:   {auc_val:.4f}")
    print("\nConfusion matrix:\n", cm)
    print("\nClassification report:\n", report)

    return {
        "label": label,
        "hist": hist,
        "test_acc": acc,
        "test_prec": prec,
        "test_rec": rec,
        "test_f1": f1,
        "test_auc": auc_val,
        "cm": cm,
        "roc_fpr": fpr,
        "roc_tpr": tpr,
    }


In [7]:
# Run the evaluation for both checkpoints: the 5-epoch baseline, then the 10-epoch model
baseline_results = evaluate_checkpoint(
    "Baseline CNN (5 epochs)",
    BASELINE_WEIGHTS_PATH,
    BASELINE_HISTORY_PATH,
)
model10_results = evaluate_checkpoint(
    "Deeper CNN (10 epochs)",
    MODEL10_WEIGHTS_PATH,
    MODEL10_HISTORY_PATH,
)



===== Evaluating Baseline CNN (5 epochs) =====
  Weights: c:\Users\ENOPARA\code\MachineLearning2025Project\cnnradiographproject-n\models\baseline_cnn.pt
  History: c:\Users\ENOPARA\code\MachineLearning2025Project\cnnradiographproject-n\evaluation\baseline_history.npz
Test samples: 3197
Test Accuracy (from history):   0.5962
Test AUC (from history):        0.6679
--- Recomputed on test set ---
Accuracy:  0.5962
Precision: 0.7057
Recall:    0.2680
F1-score:  0.3884
ROC AUC:   0.6679

Confusion matrix:
 [[1496  171]
 [1120  410]]

Classification report:
               precision    recall  f1-score   support

  Normal (0)       0.57      0.90      0.70      1667
Abnormal (1)       0.71      0.27      0.39      1530

    accuracy                           0.60      3197
   macro avg       0.64      0.58      0.54      3197
weighted avg       0.64      0.60      0.55      3197


===== Evaluating Deeper CNN (10 epochs) =====
  Weights: c:\Users\ENOPARA\code\MachineLearning2025Project\cnnradi

FileNotFoundError: History file not found: c:\Users\ENOPARA\code\MachineLearning2025Project\cnnradiographproject-n\models\evaluation\model1_10ep_history.npz