In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Imports and device setup
import torch
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torchsummary import summary
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix
from tqdm import tqdm
from PIL import Image
import random
import os

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

Using device: cuda


In [3]:
# Load pretrained YOLOv5s-CLS v6.1 via torch.hub
# This pulls both the architecture and weights for the 214-layer (~7 M parameters) network.
hub_model = torch.hub.load(
    'ultralytics/yolov5',   # Repository name
    'custom',               # “custom” loads a classification checkpoint
    path='yolov5s-cls.pt',  # This will fetch v6.1’s official checkpoint
    source='github'         # Ensure it pulls from GitHub
).to(device)

hub_model.eval()
print("Successfully loaded YOLOv5s-CLS via torch.hub.")

Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2025-6-3 Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)

Fusing layers... 
Model summary: 117 layers, 5447688 parameters, 0 gradients, 11.4 GFLOPs


Successfully loaded YOLOv5s-CLS via torch.hub.


In [4]:
# Inspect the internal Classify block to find the final Linear layer
print(hub_model.model)

ClassificationModel(
  (model): Sequential(
    (0): Conv(
      (conv): Conv2d(3, 32, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2))
      (act): SiLU(inplace=True)
    )
    (1): Conv(
      (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (act): SiLU(inplace=True)
    )
    (2): C3(
      (cv1): Conv(
        (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
        (act): SiLU(inplace=True)
      )
      (cv2): Conv(
        (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
        (act): SiLU(inplace=True)
      )
      (cv3): Conv(
        (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
        (act): SiLU(inplace=True)
      )
      (m): Sequential(
        (0): Bottleneck(
          (cv1): Conv(
            (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv2): Conv(
            (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1

In [5]:
# Replace the final “Linear(in_features=1280, out_features=1000)” with “Linear(1280 → 4)”
classify_block = hub_model.model.model[-1]  # This is the Classify(...) module
in_features = classify_block.linear.in_features
print(f"Replacing final linear: in_features = {in_features}, out_features = 4")
classify_block.linear = nn.Linear(in_features, 4)
hub_model.to(device)


Replacing final linear: in_features = 1280, out_features = 4


DetectMultiBackend(
  (model): ClassificationModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 32, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2))
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (act): SiLU(inplace=True)
      )
      (2): C3(
        (cv1): Conv(
          (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
          (act): SiLU(inplace=True)
        )
        (cv3): Conv(
          (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
          (act): SiLU(inplace=True)
        )
        (m): Sequential(
          (0): Bottleneck(
            (cv1): Conv(
              (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
              (act): SiLU(inplace=True)
            )
            (cv2): 

In [6]:
# Display a summary to confirm ~ 7 M parameters and final head output = 4
summary(hub_model.model, input_size=(3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 32, 112, 112]           3,488
              SiLU-2         [-1, 32, 112, 112]               0
              Conv-3         [-1, 32, 112, 112]               0
            Conv2d-4           [-1, 64, 56, 56]          18,496
              SiLU-5           [-1, 64, 56, 56]               0
              Conv-6           [-1, 64, 56, 56]               0
            Conv2d-7           [-1, 32, 56, 56]           2,080
              SiLU-8           [-1, 32, 56, 56]               0
              Conv-9           [-1, 32, 56, 56]               0
           Conv2d-10           [-1, 32, 56, 56]           1,056
             SiLU-11           [-1, 32, 56, 56]               0
             Conv-12           [-1, 32, 56, 56]               0
           Conv2d-13           [-1, 32, 56, 56]           9,248
             SiLU-14           [-1, 32,

In [7]:
# Data paths and transforms
data_dir  = "/content/drive/MyDrive/spectrograms_split"
train_dir = os.path.join(data_dir, "train")
val_dir   = os.path.join(data_dir, "val")
test_dir  = os.path.join(data_dir, "test")

for path in [train_dir, val_dir, test_dir]:
    assert os.path.isdir(path), f"Directory not found: {path}"

In [8]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Required by YOLOv5s-CLS backbone
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std =[0.229, 0.224, 0.225]
    )
])

In [9]:
train_dataset = datasets.ImageFolder(train_dir, transform=transform)
val_dataset   = datasets.ImageFolder(val_dir,   transform=transform)
test_dataset  = datasets.ImageFolder(test_dir,  transform=transform)

print("Classes:", train_dataset.classes)  # ['mild','moderate','normal','severe']
num_classes = len(train_dataset.classes)

Classes: ['mild', 'moderate', 'normal', 'severe']


In [10]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,  num_workers=2)
val_loader   = DataLoader(val_dataset,   batch_size=32, shuffle=False, num_workers=2)
test_loader  = DataLoader(test_dataset,  batch_size=32, shuffle=False, num_workers=2)

In [11]:
# Ensure all parameters are trainable
model = hub_model
for param in model.parameters():
    param.requires_grad = True

In [12]:
# Set up loss, optimizer, and scheduler
criterion    = nn.CrossEntropyLoss()
optimizer    = torch.optim.Adam(model.parameters(), lr=1e-4)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)