In [None]:
import os
import torch
import torch.nn as nn
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
from torchsummary import summary

In [None]:
# Mount Google Drive when running on Colab. Outside Colab the `google.colab`
# package is not installed, so skip mounting instead of aborting the notebook run.
try:
    from google.colab import drive
except ImportError:
    drive = None  # keep the name defined on non-Colab kernels
else:
    drive.mount('/content/drive')

In [None]:
# Pick the compute device: use CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Root folder that contains the `train/` and `val/` spectrogram splits.
# Defaults to the Colab Google Drive location; set the SPECTROGRAMS_DIR environment
# variable to use another location, e.g. "../data/spectrograms_split" for a local run.
data_dir = os.environ.get("SPECTROGRAMS_DIR", "/content/drive/MyDrive/spectrograms_split")

In [None]:
# Preprocessing pipeline applied to every image (resize, convert to tensor, normalize).
#
# 1. Resize((224, 224)): bring every image to 224x224, the input size used for the
#    ImageNet-pretrained ResNet weights this notebook fine-tunes.
# 2. ToTensor(): convert a PIL image (integer pixel values 0-255, laid out as
#    Height x Width x Channels) into a floating-point PyTorch tensor scaled to
#    0.0-1.0 and laid out as Channels x Height x Width, the format PyTorch models
#    such as ResNet expect.
# 3. Normalize(mean, std): per channel, subtract the mean and divide by the standard
#    deviation. The numbers are the ImageNet channel statistics, i.e. the statistics
#    of the data ResNet was pretrained on; centered and scaled inputs make training
#    more stable and efficient.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [None]:
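# Optional cleanup for a local copy of the data — run in a shell, not as Python.
# Deletes macOS `.DS_Store` files from the dataset tree:
# find ../data/spectrograms_split -name '.DS_Store' -delete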

In [None]:
# Load train and validation datasets.
# ImageFolder treats every sub-folder of the given root as one class and labels each
# image with the integer index of the folder it sits in, so the model can learn which
# class each image belongs to.
train_dataset = datasets.ImageFolder(os.path.join(data_dir, 'train'), transform=transform)
val_dataset   = datasets.ImageFolder(os.path.join(data_dir, 'val'), transform=transform)

# Labels are derived independently for each split. If the two splits do not contain
# exactly the same class folders, the same integer would mean different classes in
# train and val, and validation accuracy would be silently wrong — fail loudly instead.
if train_dataset.class_to_idx != val_dataset.class_to_idx:
    raise ValueError(
        "Class folders differ between train and val splits: "
        f"{train_dataset.class_to_idx} vs {val_dataset.class_to_idx}"
    )

In [None]:
# Wrap the datasets in DataLoaders that serve mini-batches of 32 images.
# `num_workers=4` loads images in 4 background worker processes.
# Training batches are reshuffled every epoch; validation order stays fixed.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=4,
)

In [None]:
# Define the ResNet64 model (based on resnet50)
class ResNet64(nn.Module):
    """ImageNet-pretrained ResNet-50 with its final layer replaced for this task.

    Note: despite the class name, the backbone is torchvision's ``resnet50``.

    Args:
        num_classes: Number of output logits produced by the replaced final
            fully-connected layer (default 4).
    """

    def __init__(self, num_classes=4):
        super().__init__()  # initialize nn.Module so parameters/sub-modules are registered

        # `pretrained=True` is deprecated since torchvision 0.13 in favour of `weights=`.
        # IMAGENET1K_V1 is the checkpoint that `pretrained=True` loaded; fall back to the
        # legacy flag on torchvision versions that predate the weights enums.
        weights_enum = getattr(models, "ResNet50_Weights", None)
        if weights_enum is not None:
            self.model = models.resnet50(weights=weights_enum.IMAGENET1K_V1)
        else:
            self.model = models.resnet50(pretrained=True)

        # Swap the 1000-way ImageNet classifier for a fresh layer with `num_classes` outputs.
        self.model.fc = nn.Linear(self.model.fc.in_features, num_classes)

    def forward(self, x):
        """Pass the input batch `x` through the ResNet-50 backbone and return its output."""
        return self.model(x)

In [None]:
# Build the 4-class network, then move it to the selected device.
model = ResNet64(num_classes=4)
model = model.to(device)

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /Users/alinakurliantseva/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:04<00:00, 21.2MB/s]


In [None]:
# Print a layer-by-layer summary (output shapes, parameter counts) for a 3x224x224 input.
summary(model, (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]           4,096
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          16,384
      BatchNorm2d-12          [-1, 256, 56, 56]             512
           Conv2d-13          [-1, 256, 56, 56]          16,384
      BatchNorm2d-14          [-1, 256,

In [None]:
# Loss: cross-entropy between the model's class logits and the integer class labels.
criterion = nn.CrossEntropyLoss()

# Optimizer: Adam over every model parameter, learning rate 1e-4.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

In [None]:
# Training loop
num_epochs = 20  # the model sees each training example 20 times, in a different shuffled order each epoch (shuffle=True on the DataLoader)
# Each epoch is split into mini-batches (32 images per batch); the weights are updated after every batch.
# With each epoch, the model gradually learns better patterns from the data by adjusting its weights based on the loss.
for epoch in range(num_epochs):
    model.train()  # training mode
    total_loss = 0.0  # sum of per-sample losses seen this epoch
    num_samples = 0   # number of training samples seen this epoch
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()  # clears out old gradients (otherwise they accumulate)
        loss.backward()  # computes the gradients (derivatives of the loss)
        optimizer.step()  # uses the gradients to adjust the model's weights

        # `loss` is the mean over the batch, so weight it by the batch size. A smaller
        # final batch then counts proportionally instead of as much as a full batch.
        batch_count = labels.size(0)
        total_loss += loss.item() * batch_count
        num_samples += batch_count

    avg_loss = total_loss / num_samples  # per-sample average loss for the epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

Epoch [1/20], Loss: 0.1856
Epoch [2/20], Loss: 0.0156
Epoch [3/20], Loss: 0.0178
Epoch [4/20], Loss: 0.0012
Epoch [5/20], Loss: 0.0002
Epoch [6/20], Loss: 0.0002
Epoch [7/20], Loss: 0.0001
Epoch [8/20], Loss: 0.0001
Epoch [9/20], Loss: 0.0320
Epoch [10/20], Loss: 0.0141
Epoch [11/20], Loss: 0.0004
Epoch [12/20], Loss: 0.0002


In [None]:
# Measure top-1 accuracy on the validation set.
model.eval()  # evaluation mode

correct = 0  # number of validation images classified correctly
total = 0    # number of validation images seen

with torch.no_grad():  # no gradients are needed for evaluation
    for images, labels in val_loader:
        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)
        # Predicted class = index of the largest logit in each row.
        predicted = torch.max(outputs, dim=1)[1]

        total += labels.shape[0]
        correct += torch.eq(predicted, labels).sum().item()

accuracy = (100 * correct) / total
print(f"Validation Accuracy: {accuracy:.2f}%")