# Section 06 — Final Evaluation and Model Comparison

This section evaluates all models:

1. Custom CNN  
2. Fine-tuned ResNet50  
3. Fine-tuned DenseNet121  
4. Fine-tuned VGG16  
5. Hybrid CNN + Logistic Regression  
6. Hybrid CNN + KNN  
7. Hybrid CNN + Random Forest  

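Metrics computed (precision, recall, F1-score, and specificity are reported as support-weighted averages over the classes):
- Accuracy
- Precision
- Recall
- F1-score
- Specificity (one-vs-rest per class)
- Confusion Matrix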

All models are evaluated on the test set using the same pipeline.  
A comparison table is produced to summarize performance and identify the strongest method.  
This section completes the full machine learning workflow.

In [54]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report
)
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models

In [55]:
# Evaluation-time input settings: square resize edge (pixels) and batch size.
IMAGE_SIZE = 224
BATCH_SIZE = 32

# Deterministic test-time preprocessing (no augmentation): collapse to a
# single channel, resize to IMAGE_SIZE x IMAGE_SIZE, convert to a tensor,
# then normalize with mean 0.5 / std 0.5.
test_transforms = transforms.Compose(
    [
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ]
)

# One sub-directory of data/test per class.
test_dataset = datasets.ImageFolder(root="data/test", transform=test_transforms)

# shuffle=False keeps the sample order fixed, so predictions and features
# collected in separate passes over the loader stay aligned with the labels.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

# Shared by the model builders and the confusion-matrix plots.
class_names = test_dataset.classes
num_classes = len(class_names)

print("✔ Test DataLoader ready")
print("Classes:", class_names)
print("Total test samples:", len(test_dataset))

✔ Test DataLoader ready
Classes: ['COVID', 'Lung_Opacity', 'Normal', 'Viral Pneumonia']
Total test samples: 3176


In [56]:
# Define CNN model
class CNNModel(nn.Module):
    """Three-stage convolutional classifier for single-channel images.

    Each stage is a 3x3 convolution (padding 1), a ReLU, and a shared 2x2
    max-pool; channel width grows 1 -> 16 -> 32 -> 64. The flattened feature
    map feeds a 512-unit hidden layer followed by a linear output layer.

    ``fc1`` is sized for 64 * 28 * 28 inputs, which corresponds to 224x224
    images after three halvings (224 / 2**3 = 28).

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=4):
        super().__init__()
        # Attribute names double as state_dict keys, so they must stay as-is
        # for saved checkpoints to load.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)

        self.fc1 = nn.Linear(in_features=64 * 28 * 28, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=num_classes)

    def forward(self, x):
        """Return raw (un-normalized) class logits for the image batch ``x``."""
        # conv -> ReLU -> pool, once per convolutional stage.
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.pool(F.relu(conv(x)))

        flat = x.view(x.size(0), -1)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

# Load Custom CNN
# map_location="cpu" lets a checkpoint that was saved from a GPU load on a
# CPU-only host (and matches the other loaders in this notebook); the model is
# moved to the evaluation device later. weights_only=True restricts
# unpickling to tensors and plain containers.
custom = CNNModel(num_classes=num_classes)
state_dict = torch.load(
    "models/cnn_model.pth",
    map_location="cpu",
    weights_only=True,
)
custom.load_state_dict(state_dict)
custom.eval()

# Load Pretrained Models
def load_resnet50():
    """Rebuild the fine-tuned ResNet50 and load its saved weights.

    The backbone is created without pretrained weights and patched so that
    its layers match the checkpoint: a single-channel stem convolution and a
    final linear layer with ``num_classes`` outputs (module-level global).

    Returns:
        The model in eval mode, with weights loaded onto the CPU.
    """
    model = models.resnet50(weights=None)
    # Fix conv1 to accept 1-channel (must match training)
    model.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)

    # Replace classifier
    model.fc = nn.Linear(model.fc.in_features, num_classes)

    # Load trained weights. weights_only=True limits unpickling to tensors
    # and plain containers, consistent with how the custom CNN checkpoint is
    # loaded, and avoids executing arbitrary pickled code.
    state = torch.load(
        "models/resnet50_model.pth",
        map_location="cpu",
        weights_only=True,
    )
    model.load_state_dict(state)
    model.eval()
    return model

def load_densenet121():
    """Rebuild the fine-tuned DenseNet121 and load its saved weights.

    The backbone is created without pretrained weights and patched so that
    its layers match the checkpoint: a single-channel first convolution and a
    classifier with ``num_classes`` outputs (module-level global).

    Returns:
        The model in eval mode, with weights loaded onto the CPU.
    """
    model = models.densenet121(weights=None)

    # Fix input 1-channel
    model.features.conv0 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)

    # Replace classifier
    model.classifier = nn.Linear(model.classifier.in_features, num_classes)

    # weights_only=True limits unpickling to tensors and plain containers,
    # consistent with how the custom CNN checkpoint is loaded.
    state = torch.load(
        "models/densenet121_model.pth",
        map_location="cpu",
        weights_only=True,
    )
    model.load_state_dict(state)
    model.eval()
    return model

def load_vgg16():
    """Rebuild the fine-tuned VGG16 and load its saved weights.

    The backbone is created without pretrained weights and patched so that
    its layers match the checkpoint: a single-channel first convolution and a
    last classifier layer with ``num_classes`` outputs (module-level global).

    Returns:
        The model in eval mode, with weights loaded onto the CPU.
    """
    model = models.vgg16(weights=None)

    # Fix 1-channel input
    model.features[0] = nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1)

    # Replace classifier
    model.classifier[-1] = nn.Linear(model.classifier[-1].in_features, num_classes)

    # weights_only=True limits unpickling to tensors and plain containers,
    # consistent with how the custom CNN checkpoint is loaded.
    state = torch.load(
        "models/vgg16_model.pth",
        map_location="cpu",
        weights_only=True,
    )
    model.load_state_dict(state)

    model.eval()
    return model


# NOTE: at this point only the custom CNN has actually been instantiated and
# loaded; the ResNet50 / DenseNet121 / VGG16 loader functions are defined here
# but are not called until evaluation.
print("✔ ALL CNN models loaded")

✔ ALL CNN models loaded


In [57]:
# Calculate specificity
def compute_specificity(cm):
    """Compute one-vs-rest specificity from a multi-class confusion matrix.

    For class ``i`` (rows are true labels, columns are predictions):
    ``TP = cm[i, i]``, ``FN = row_sum - TP``, ``FP = col_sum - TP``,
    ``TN = total - (TP + FN + FP)`` and ``specificity = TN / (TN + FP)``.

    Args:
        cm: Square confusion matrix as a NumPy array or any array-like
            (e.g. a list of lists).

    Returns:
        A ``(spec_per_class, weighted_spec)`` tuple: a list with one float
        per class, and their average weighted by each class's support (row
        sum). A class with no negative samples gets specificity 0.0, and the
        weighted value is 0.0 when the matrix holds no samples at all.
    """
    cm = np.asarray(cm, dtype=float)
    if cm.size == 0:
        return [], 0.0

    true_pos = np.diag(cm)
    support = cm.sum(axis=1)              # TP + FN per class
    false_pos = cm.sum(axis=0) - true_pos
    true_neg = cm.sum() - support - false_pos
    negatives = true_neg + false_pos      # every sample not in the class

    # Exact division; an epsilon in the denominator would bias every value
    # (a perfect classifier would score slightly below 1.0).
    spec = np.divide(
        true_neg,
        negatives,
        out=np.zeros_like(negatives),
        where=negatives > 0,
    )

    total_support = support.sum()
    if total_support > 0:
        weighted_spec = float(np.dot(spec, support) / total_support)
    else:
        weighted_spec = 0.0

    return spec.tolist(), weighted_spec

In [58]:
# Evaluate CNN model
def evaluate_cnn(model, test_loader, name="Model"):
    """Evaluate a classifier on the test loader and report its metrics.

    Runs ``model`` in eval mode over every batch, takes the arg-max logit as
    the prediction, computes support-weighted precision/recall/F1 plus
    accuracy and weighted specificity, saves the confusion-matrix heatmap to
    ``results/confusion_matrices/{name}_cm.png`` and prints a summary.

    Relies on the module-level ``class_names`` and ``compute_specificity``.

    Args:
        model: ``nn.Module`` returning one logit per class.
        test_loader: Iterable of ``(images, labels)`` batches.
        name: Label used in the printout, plot title, and output file name.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``specificity`` and ``cm`` (the confusion matrix).
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    model.eval()

    y_true, y_pred = [], []

    with torch.no_grad():
        for imgs, labels in test_loader:
            # Only the images need to be on the model's device; labels are
            # just collected for the metrics.
            preds = model(imgs.to(device)).argmax(1)
            y_true.append(labels.cpu().numpy())
            y_pred.append(preds.cpu().numpy())

    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)

    # METRICS
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average="weighted", zero_division=0)
    rec = recall_score(y_true, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0)

    # Pin the label set so the matrix always has one row/column per known
    # class, even if a class never occurs in y_true or y_pred; otherwise the
    # heatmap tick labels would not line up with the matrix.
    cm = confusion_matrix(y_true, y_pred, labels=np.arange(len(class_names)))
    _, spec_weighted = compute_specificity(cm)

    # SAVE CONFUSION MATRIX
    out_dir = os.path.join("results", "confusion_matrices")
    os.makedirs(out_dir, exist_ok=True)  # savefig does not create directories

    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.title(f"{name} — Confusion Matrix")
    plt.savefig(os.path.join(out_dir, f"{name}_cm.png"))
    plt.close()

    print(f"\n===== {name} =====")
    print(f"Accuracy:     {acc:.4f}")
    print(f"Precision:    {prec:.4f}")
    print(f"Recall:       {rec:.4f}")
    print(f"F1 Score:     {f1:.4f}")
    print(f"Specificity:  {spec_weighted:.4f}")

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "specificity": spec_weighted,
        "cm": cm
    }

In [None]:
# Metrics for every evaluated model, keyed by model name.
results = {}

# The custom CNN is already in memory.
results["CustomCNN"] = evaluate_cnn(custom, test_loader, "CustomCNN")

# Fine-tuned backbones are built and loaded one at a time, right before they
# are evaluated, so only one of them is referenced at any moment.
for model_name, build_model in (
    ("ResNet50", load_resnet50),
    ("DenseNet121", load_densenet121),
    ("VGG16", load_vgg16),
):
    results[model_name] = evaluate_cnn(build_model(), test_loader, model_name)

print("✔ CNN evaluation complete.")


===== CustomCNN =====
Accuracy:     0.8835
Precision:    0.8840
Recall:       0.8835
F1 Score:     0.8832
Specificity:  0.9321


  state = torch.load("models/resnet50_model.pth", map_location="cpu")



===== ResNet50 =====
Accuracy:     0.9424
Precision:    0.9425
Recall:       0.9424
F1 Score:     0.9423
Specificity:  0.9637


  state = torch.load("models/densenet121_model.pth", map_location="cpu")



===== DenseNet121 =====
Accuracy:     0.9254
Precision:    0.9254
Recall:       0.9254
F1 Score:     0.9251
Specificity:  0.9514


  state = torch.load("models/vgg16_model.pth", map_location="cpu")



===== VGG16 =====
Accuracy:     0.9229
Precision:    0.9236
Recall:       0.9229
F1 Score:     0.9230
Specificity:  0.9520
✔ CNN evaluation complete.


In [60]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): pickle can execute arbitrary code while loading; only use
    # this on artifacts produced by this project's own training notebooks.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Random Forest is loaded without a scaler.
rf = _load_pickle("models/rf_hybrid.pkl")

# Logistic Regression and KNN each come with their own saved scaler.
lr = _load_pickle("models/lr_hybrid.pkl")
lr_scaler = _load_pickle("models/lr_scaler.pkl")

knn = _load_pickle("models/knn_hybrid.pkl")
knn_scaler = _load_pickle("models/knn_scaler.pkl")

print("✔ Hybrid ML models loaded")

✔ Hybrid ML models loaded


In [None]:
# Feature extractor (same as training)
class CustomCNNFeatureExtractor(nn.Module):
    """Expose a trained CNN's hidden-layer activations as feature vectors.

    Reuses (does not copy) the ``conv1``/``conv2``/``conv3``, ``pool`` and
    ``fc1`` modules of ``trained_model`` and stops after the ReLU that
    follows ``fc1``, i.e. just before the classification layer.

    Args:
        trained_model: Object providing the five layer attributes above.
    """

    _SHARED_LAYERS = ("conv1", "conv2", "conv3", "pool", "fc1")

    def __init__(self, trained_model):
        super().__init__()
        # Same registration order as the attribute list; the module objects
        # are shared with trained_model, so weights stay in sync.
        for layer_name in self._SHARED_LAYERS:
            setattr(self, layer_name, getattr(trained_model, layer_name))

    def forward(self, x):
        """Return the post-ReLU ``fc1`` activations for the image batch ``x``."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.pool(torch.relu(conv(x)))

        flattened = x.view(x.size(0), -1)
        return torch.relu(self.fc1(flattened))

# Wrap the already-loaded custom CNN; the extractor reuses its layer objects,
# so no weights are reloaded or copied.
extractor = CustomCNNFeatureExtractor(custom)

def extract_features(model, dataloader):
    """Run ``model`` over every batch and collect its outputs as NumPy arrays.

    The model is moved to the GPU when one is available (otherwise the CPU),
    switched to eval mode, and run without gradient tracking.

    Args:
        model: Module mapping an image batch to a batch of feature vectors.
        dataloader: Iterable of ``(images, labels)`` tensor pairs.

    Returns:
        A ``(features, labels)`` tuple: the per-batch outputs stacked
        row-wise, and the labels concatenated in iteration order.
    """
    use_gpu = torch.cuda.is_available()
    device = "cuda" if use_gpu else "cpu"
    model = model.to(device)
    model.eval()

    feature_chunks = []
    label_chunks = []

    with torch.no_grad():
        for batch_imgs, batch_labels in dataloader:
            outputs = model(batch_imgs.to(device))
            feature_chunks.append(outputs.cpu().numpy())
            label_chunks.append(batch_labels.numpy())

    features = np.vstack(feature_chunks)
    targets = np.hstack(label_chunks)
    return features, targets

# Run the whole test set through the extractor once; X_test holds the
# post-ReLU fc1 activations and y_test the labels in loader order. Both are
# reused by every hybrid classifier.
X_test, y_test = extract_features(extractor, test_loader)
print("✔ Extracted CNN features for hybrid test evaluation")

✔ Extracted CNN features for hybrid test evaluation


In [None]:
# Evaluate Hybrid Models
def evaluate_hybrid_model(name, model, X_test, y_test, scaler=None):
    """Evaluate a classical classifier on CNN-extracted test features.

    Optionally scales the features, predicts with ``model``, computes
    accuracy, support-weighted precision/recall/F1 and weighted specificity,
    saves the confusion-matrix heatmap to
    ``results/confusion_matrices/Hybrid_{name}_cm.png`` and prints a summary.

    Relies on the module-level ``class_names`` and ``compute_specificity``.

    Args:
        name: Short model label used in the printout, title, and file name.
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix, one row per test sample.
        y_test: True labels aligned with ``X_test``.
        scaler: Optional fitted transformer exposing ``transform``; when
            ``None`` the features are used unchanged.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``specificity`` and ``cm`` (the confusion matrix).
    """
    # Compare against None explicitly: the truthiness of an arbitrary scaler
    # object is not a reliable "was one supplied" signal.
    if scaler is not None:
        X_test_scaled = scaler.transform(X_test)
    else:
        X_test_scaled = X_test

    preds = model.predict(X_test_scaled)

    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds, average="weighted", zero_division=0)
    rec = recall_score(y_test, preds, average="weighted", zero_division=0)
    f1 = f1_score(y_test, preds, average="weighted", zero_division=0)

    # Pin the label set so the matrix always has one row/column per known
    # class, keeping it aligned with the heatmap tick labels.
    cm = confusion_matrix(y_test, preds, labels=np.arange(len(class_names)))
    _, spec_weighted = compute_specificity(cm)

    # save cm
    out_dir = os.path.join("results", "confusion_matrices")
    os.makedirs(out_dir, exist_ok=True)  # savefig does not create directories

    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Greens",
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.title(f"Hybrid {name} — Confusion Matrix")
    plt.savefig(os.path.join(out_dir, f"Hybrid_{name}_cm.png"))
    plt.close()

    print(f"\n===== Hybrid {name} =====")
    print(f"Accuracy:     {acc:.4f}")
    print(f"Precision:    {prec:.4f}")
    print(f"Recall:       {rec:.4f}")
    print(f"F1 Score:     {f1:.4f}")
    print(f"Specificity:  {spec_weighted:.4f}")

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "specificity": spec_weighted,
        "cm": cm
    }

# Random Forest is evaluated on the raw CNN features; Logistic Regression and
# KNN each use the scaler that was saved alongside them.
results["Hybrid_RF"]  = evaluate_hybrid_model("RF",  rf,  X_test, y_test)
results["Hybrid_LR"]  = evaluate_hybrid_model("LR",  lr,  X_test, y_test, lr_scaler)
results["Hybrid_KNN"] = evaluate_hybrid_model("KNN", knn, X_test, y_test, knn_scaler)

# Completion marker, matching the CNN evaluation cell and the recorded output.
print("✔ Hybrid evaluation complete.")


===== Hybrid RF =====
Accuracy:     0.8829
Precision:    0.8849
Recall:       0.8829
F1 Score:     0.8829
Specificity:  0.9234

===== Hybrid LR =====
Accuracy:     0.8816
Precision:    0.8820
Recall:       0.8816
F1 Score:     0.8817
Specificity:  0.9278

===== Hybrid KNN =====
Accuracy:     0.8611
Precision:    0.8618
Recall:       0.8611
F1 Score:     0.8612
Specificity:  0.9164
✔ Hybrid evaluation complete.
