In [None]:
import os
import sys
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve,
    confusion_matrix,
    classification_report,
)

# Go from evaluation/ -> project root: the root is taken to be the parent of the
# current working directory, so the notebook must be launched from evaluation/.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Re-running this cell must not keep stacking duplicate entries onto sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from utils.mura_dataset import MURADataset
from utils.transforms import get_train_transforms, get_val_transforms

# Run on the GPU whenever CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)



Working dir: C:\Users\ENOPARA\code\MachineLearning2025Project\cnnradiographproject


In [None]:
# Dataloader for the TEST set
def get_test_loader(batch_size: int = 32):
    """Build the DataLoader used for test-set evaluation.

    The dataset is a MURADataset created from
    <PROJECT_ROOT>/data/splits/valid_labeled_studies.csv, with
    <PROJECT_ROOT>/data/raw passed as its root_dir and the validation
    transforms applied. Batches are produced without shuffling.

    NOTE(review): the "test" split is read from valid_labeled_studies.csv;
    confirm this is the intended held-out set.

    Args:
        batch_size: Number of samples per batch.

    Returns:
        A DataLoader over the dataset described above.
    """
    data_dir = os.path.join(PROJECT_ROOT, "data")

    eval_set = MURADataset(
        csv_file=os.path.join(data_dir, "splits", "valid_labeled_studies.csv"),
        transform=get_val_transforms(),
        root_dir=os.path.join(data_dir, "raw"),
    )

    # shuffle=False keeps the sample order fixed from run to run.
    loader = DataLoader(eval_set, batch_size=batch_size, shuffle=False)
    print("Test samples:", len(eval_set))
    return loader

test_loader = get_test_loader(batch_size=32)


In [None]:
# Baseline CNN definition (same as Su's training code)
class BaselineCNN(nn.Module):
    """Two conv + max-pool stages followed by a two-layer classifier head.

    The network takes single-channel images. The first linear layer is sized
    for 224x224 inputs: two 2x poolings leave a 32 x 56 x 56 feature map.
    The forward pass returns one sigmoid probability per sample, shape [B, 1].
    """

    def __init__(self):
        super().__init__()
        # Attribute names double as state_dict keys, so they must stay as-is
        # for saved weights to load.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2)

        # 224x224 -> pool -> 112x112 -> pool -> 56x56, with 32 channels
        self.fc1 = nn.Linear(in_features=32 * 56 * 56, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=1)

    def forward(self, x):
        """Map a batch of images to per-sample probabilities in [0, 1]."""
        features = x
        # Each stage: conv -> ReLU -> 2x max-pool
        # ([B, 16, 112, 112], then [B, 32, 56, 56] for 224x224 inputs).
        for conv in (self.conv1, self.conv2):
            features = self.pool(torch.relu(conv(features)))
        flat = features.view(features.size(0), -1)  # one row per sample
        hidden = torch.relu(self.fc1(flat))
        return torch.sigmoid(self.fc2(hidden))



In [None]:
# Load training history & model weights
# np.load on an .npz returns a lazy NpzFile that keeps the archive open, so read
# everything inside a `with` block and let the file handle close afterwards.
# The path is relative to the current working directory.
with np.load("baseline_history.npz") as hist:
    train_loss = hist["train_loss"]
    val_acc_hist = hist["val_acc"]
    val_auc_hist = hist["val_auc"]
    test_acc_hist = float(hist["test_acc"])
    test_auc_hist = float(hist["test_auc"])

print("Saved-from-training test metrics:")
print(f"  Test Accuracy (history): {test_acc_hist:.4f}")
print(f"  Test AUC (history):      {test_auc_hist:.4f}")

# Load trained weights
weights_path = os.path.join(PROJECT_ROOT, "models", "baseline_cnn.pt")
model = BaselineCNN().to(device)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source; consider weights_only=True if the installed torch supports it.
state = torch.load(weights_path, map_location=device)
model.load_state_dict(state)
model.eval()  # switch to inference mode before evaluating
print("Loaded weights from:", weights_path)


In [None]:
# Run evaluation on the TEST set (fresh, for sanity + metrics)
prob_batches = []
label_batches = []

with torch.no_grad():  # inference only, no gradients needed
    for batch_images, batch_labels in test_loader:
        batch_probs = model(batch_images.to(device))  # the model's sigmoid outputs
        prob_batches.append(batch_probs.cpu().numpy().flatten())
        label_batches.append(batch_labels.numpy())

# Stitch the per-batch arrays together once the loop is done.
all_probs = np.concatenate(prob_batches)
all_labels = np.concatenate(label_batches).astype(int)

# 0.5 threshold for class prediction
y_pred = (all_probs >= 0.5).astype(int)

# Metrics on the thresholded predictions
acc = accuracy_score(all_labels, y_pred)
prec = precision_score(all_labels, y_pred)
rec = recall_score(all_labels, y_pred)
f1 = f1_score(all_labels, y_pred)
cm = confusion_matrix(all_labels, y_pred)
report = classification_report(all_labels, y_pred, digits=3)

# AUC is computed from the raw probabilities, not the thresholded labels
auc = roc_auc_score(all_labels, all_probs)

print("=== Test-set metrics (recomputed) ===")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"AUC      : {auc:.4f}")
print("\nClassification report:\n")
print(report)
print("Confusion matrix:\n", cm)


In [None]:
# Plot training curves (loss, val acc, val AUC)
epochs = np.arange(1, len(train_loss) + 1)  # 1-based epoch numbers for the x-axis

# Figure 1: training loss per epoch
fig_loss, ax_loss = plt.subplots(figsize=(6, 4))
ax_loss.plot(epochs, train_loss, marker="o")
ax_loss.set_xlabel("Epoch")
ax_loss.set_ylabel("Training Loss")
ax_loss.set_title("Baseline CNN – Training Loss")
ax_loss.grid(True)
plt.show()

# Figure 2: validation accuracy and AUC on shared axes
fig_val, ax_val = plt.subplots(figsize=(6, 4))
ax_val.plot(epochs, val_acc_hist, marker="o", label="Val Accuracy")
ax_val.plot(epochs, val_auc_hist, marker="s", label="Val AUC")
ax_val.set_xlabel("Epoch")
ax_val.set_ylabel("Score")
ax_val.set_title("Baseline CNN – Validation Metrics")
ax_val.legend()
ax_val.grid(True)
plt.show()


In [None]:
# ROC curve + Confusion matrix plots
fpr, tpr, _ = roc_curve(all_labels, all_probs)

plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f"ROC (AUC = {auc:.3f})")
plt.plot([0, 1], [0, 1], "k--")  # diagonal reference line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Baseline CNN – ROC Curve (Test Set)")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

# Confusion matrix as heatmap
plt.figure(figsize=(4, 4))
plt.imshow(cm, interpolation="nearest", cmap="Blues")
plt.title("Baseline CNN – Confusion Matrix (Test Set)")
plt.colorbar()
tick_marks = np.arange(2)
plt.xticks(tick_marks, ["Normal (0)", "Abnormal (1)"], rotation=45)
plt.yticks(tick_marks, ["Normal (0)", "Abnormal (1)"])

# Write the count into every cell; cells above half the maximum count get
# white text, the rest black.
thresh = cm.max() / 2.0
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(
            j,
            i,
            str(cm[i, j]),
            ha="center",
            va="center",
            color="white" if cm[i, j] > thresh else "black",
        )

plt.ylabel("True label")
plt.xlabel("Predicted label")
# tight_layout only accounts for artists that already exist, so it has to run
# after the axis labels are set or they can be clipped.
plt.tight_layout()
plt.show()
