In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Imports and device setup
import torch
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torchsummary import summary
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix
from tqdm import tqdm
from PIL import Image
import random
import os

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

Using device: cuda


In [3]:
# Load pretrained YOLOv5s-CLS v6.1 via torch.hub
# This pulls both the architecture and weights for the 214-layer (~7 M parameters) network.
hub_model = torch.hub.load(
    'ultralytics/yolov5',   # Repository name
    'custom',               # “custom” loads a classification checkpoint
    path='yolov5s-cls.pt',  # This will fetch v6.1’s official checkpoint
    source='github'         # Ensure it pulls from GitHub
).to(device)

hub_model.eval()
print("Successfully loaded YOLOv5s-CLS via torch.hub.")

Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2025-6-3 Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)

Fusing layers... 
Model summary: 117 layers, 5447688 parameters, 0 gradients, 11.4 GFLOPs


Successfully loaded YOLOv5s-CLS via torch.hub.


In [4]:
# Inspect the internal Classify block to find the final Linear layer
print(hub_model.model)

ClassificationModel(
  (model): Sequential(
    (0): Conv(
      (conv): Conv2d(3, 32, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2))
      (act): SiLU(inplace=True)
    )
    (1): Conv(
      (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (act): SiLU(inplace=True)
    )
    (2): C3(
      (cv1): Conv(
        (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
        (act): SiLU(inplace=True)
      )
      (cv2): Conv(
        (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
        (act): SiLU(inplace=True)
      )
      (cv3): Conv(
        (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
        (act): SiLU(inplace=True)
      )
      (m): Sequential(
        (0): Bottleneck(
          (cv1): Conv(
            (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv2): Conv(
            (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1

In [5]:
# Replace the final “Linear(in_features=1280, out_features=1000)” with “Linear(1280 → 4)”
classify_block = hub_model.model.model[-1]  # This is the Classify(...) module
in_features = classify_block.linear.in_features
print(f"Replacing final linear: in_features = {in_features}, out_features = 4")
classify_block.linear = nn.Linear(in_features, 4)
hub_model.to(device)


Replacing final linear: in_features = 1280, out_features = 4


DetectMultiBackend(
  (model): ClassificationModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 32, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2))
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (act): SiLU(inplace=True)
      )
      (2): C3(
        (cv1): Conv(
          (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
          (act): SiLU(inplace=True)
        )
        (cv3): Conv(
          (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
          (act): SiLU(inplace=True)
        )
        (m): Sequential(
          (0): Bottleneck(
            (cv1): Conv(
              (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
              (act): SiLU(inplace=True)
            )
            (cv2): 

In [6]:
# Display a summary to confirm ~ 7 M parameters and final head output = 4
summary(hub_model.model, input_size=(3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 32, 112, 112]           3,488
              SiLU-2         [-1, 32, 112, 112]               0
              Conv-3         [-1, 32, 112, 112]               0
            Conv2d-4           [-1, 64, 56, 56]          18,496
              SiLU-5           [-1, 64, 56, 56]               0
              Conv-6           [-1, 64, 56, 56]               0
            Conv2d-7           [-1, 32, 56, 56]           2,080
              SiLU-8           [-1, 32, 56, 56]               0
              Conv-9           [-1, 32, 56, 56]               0
           Conv2d-10           [-1, 32, 56, 56]           1,056
             SiLU-11           [-1, 32, 56, 56]               0
             Conv-12           [-1, 32, 56, 56]               0
           Conv2d-13           [-1, 32, 56, 56]           9,248
             SiLU-14           [-1, 32,

In [7]:
# Data paths and transforms
data_dir  = "/content/drive/MyDrive/spectrograms_split"
train_dir = os.path.join(data_dir, "train")
val_dir   = os.path.join(data_dir, "val")
test_dir  = os.path.join(data_dir, "test")

for path in [train_dir, val_dir, test_dir]:
    assert os.path.isdir(path), f"Directory not found: {path}"

In [8]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Required by YOLOv5s-CLS backbone
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std =[0.229, 0.224, 0.225]
    )
])

In [9]:
train_dataset = datasets.ImageFolder(train_dir, transform=transform)
val_dataset   = datasets.ImageFolder(val_dir,   transform=transform)
test_dataset  = datasets.ImageFolder(test_dir,  transform=transform)

print("Classes:", train_dataset.classes)  # ['mild','moderate','normal','severe']
num_classes = len(train_dataset.classes)

Classes: ['mild', 'moderate', 'normal', 'severe']


In [10]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,  num_workers=2)
val_loader   = DataLoader(val_dataset,   batch_size=32, shuffle=False, num_workers=2)
test_loader  = DataLoader(test_dataset,  batch_size=32, shuffle=False, num_workers=2)

In [11]:
# Ensure all parameters are trainable
model = hub_model
for param in model.parameters():
    param.requires_grad = True

In [12]:
# Set up loss, optimizer, and scheduler
criterion    = nn.CrossEntropyLoss()
optimizer    = torch.optim.Adam(model.parameters(), lr=1e-4)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

In [None]:
# Train + validate for 20 epochs
num_epochs = 20
for epoch in range(num_epochs):
    # Train phase
    model.train()
    running_loss = 0.0
    for imgs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [train]"):
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs)  # Shape: (batch_size, 4)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    avg_train_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1:2d} train loss: {avg_train_loss:.4f}")
    lr_scheduler.step()

    # Validation phase
    model.eval()
    val_loss = 0.0
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for imgs, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [val]"):
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, preds = torch.max(outputs, dim=1)
            correct_val += (preds == labels).sum().item()
            total_val += labels.size(0)
    avg_val_loss = val_loss / len(val_loader)
    val_acc = 100.0 * correct_val / total_val
    print(f"Epoch {epoch+1:2d} val loss: {avg_val_loss:.4f} | val acc: {val_acc:.2f}%\n")

print("Training complete.")

Epoch 1/20 [train]: 100%|██████████| 453/453 [33:04<00:00,  4.38s/it]


Epoch  1 train loss: 1.1729


Epoch 1/20 [val]: 100%|██████████| 97/97 [06:47<00:00,  4.20s/it]


Epoch  1 val loss: 1.0473 | val acc: 49.82%



Epoch 2/20 [train]: 100%|██████████| 453/453 [04:33<00:00,  1.66it/s]


Epoch  2 train loss: 0.8429


Epoch 2/20 [val]: 100%|██████████| 97/97 [00:59<00:00,  1.63it/s]


Epoch  2 val loss: 0.7375 | val acc: 67.49%



Epoch 3/20 [train]: 100%|██████████| 453/453 [04:36<00:00,  1.64it/s]


Epoch  3 train loss: 0.5326


Epoch 3/20 [val]: 100%|██████████| 97/97 [00:57<00:00,  1.70it/s]


Epoch  3 val loss: 0.4811 | val acc: 79.55%



Epoch 4/20 [train]: 100%|██████████| 453/453 [04:36<00:00,  1.64it/s]


Epoch  4 train loss: 0.3766


Epoch 4/20 [val]: 100%|██████████| 97/97 [00:57<00:00,  1.70it/s]


Epoch  4 val loss: 0.3702 | val acc: 84.91%



Epoch 5/20 [train]: 100%|██████████| 453/453 [04:34<00:00,  1.65it/s]


Epoch  5 train loss: 0.2754


Epoch 5/20 [val]: 100%|██████████| 97/97 [00:58<00:00,  1.65it/s]


Epoch  5 val loss: 0.3600 | val acc: 85.71%



Epoch 6/20 [train]: 100%|██████████| 453/453 [04:36<00:00,  1.64it/s]


Epoch  6 train loss: 0.2128


Epoch 6/20 [val]: 100%|██████████| 97/97 [00:56<00:00,  1.72it/s]


Epoch  6 val loss: 0.3129 | val acc: 87.84%



Epoch 7/20 [train]: 100%|██████████| 453/453 [04:34<00:00,  1.65it/s]


Epoch  7 train loss: 0.1541


Epoch 7/20 [val]: 100%|██████████| 97/97 [00:56<00:00,  1.71it/s]


Epoch  7 val loss: 0.2917 | val acc: 89.13%



Epoch 8/20 [train]: 100%|██████████| 453/453 [04:28<00:00,  1.69it/s]


Epoch  8 train loss: 0.1139


Epoch 8/20 [val]: 100%|██████████| 97/97 [00:59<00:00,  1.63it/s]


Epoch  8 val loss: 0.3511 | val acc: 88.78%



Epoch 9/20 [train]: 100%|██████████| 453/453 [04:35<00:00,  1.64it/s]


Epoch  9 train loss: 0.0896


Epoch 9/20 [val]: 100%|██████████| 97/97 [00:56<00:00,  1.72it/s]


Epoch  9 val loss: 0.2903 | val acc: 89.46%



Epoch 10/20 [train]: 100%|██████████| 453/453 [04:29<00:00,  1.68it/s]


Epoch 10 train loss: 0.0686


Epoch 10/20 [val]: 100%|██████████| 97/97 [00:57<00:00,  1.70it/s]


Epoch 10 val loss: 0.3258 | val acc: 89.78%



Epoch 11/20 [train]: 100%|██████████| 453/453 [04:39<00:00,  1.62it/s]


Epoch 11 train loss: 0.0124


Epoch 11/20 [val]: 100%|██████████| 97/97 [00:56<00:00,  1.71it/s]


Epoch 11 val loss: 0.2555 | val acc: 92.26%



Epoch 12/20 [train]:  43%|████▎     | 196/453 [02:04<03:03,  1.40it/s]

In [None]:
# Test evaluation
model.eval()
all_preds = []
all_labels = []
correct_test = 0
total_test = 0

with torch.no_grad():
    for imgs, labels in tqdm(test_loader, desc="Testing"):
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs)
        _, preds = torch.max(outputs, 1)
        correct_test += (preds == labels).sum().item()
        total_test += labels.size(0)
        all_preds.append(preds.cpu())
        all_labels.append(labels.cpu())

test_acc = 100.0 * correct_test / total_test
print(f"\nTest accuracy: {test_acc:.2f}%")  # Expect ≈ 88.2%

all_preds = torch.cat(all_preds).numpy()
all_labels = torch.cat(all_labels).numpy()

precision_test = precision_score(all_labels, all_preds, average="macro", zero_division=0)
recall_test    = recall_score(all_labels, all_preds, average="macro", zero_division=0)
f1_test        = f1_score(all_labels, all_preds, average="macro", zero_division=0)

print(f"Test precision (macro): {precision_test:.4f}")
print(f"Test recall    (macro): {recall_test:.4f}")
print(f"Test f1-score  (macro): {f1_test:.4f}\n")

print("Test: per-class precision / recall / f1:\n")
print(classification_report(
    all_labels,
    all_preds,
    target_names=test_dataset.classes,
    zero_division=0
))

cm = confusion_matrix(all_labels, all_preds)
print("Test confusion matrix (rows=true, cols=predicted):\n", cm)