In [None]:
# import sys
# sys.path.append('/kaggle/input/project-541/project')

import os
import random
from typing import List

import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt

import torch
import torchaudio
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models

from augment import WaveformAugment  

In [None]:
# configurations
# Every tunable for this notebook lives here: paths, audio framing,
# mel-spectrogram settings and training hyper-parameters.

DATA_DIR = "./data/UrbanSound8K" # dataset root (metadata/ and audio/ are read from below it)
MODEL_DIR = "./model/model-b" # output directory for trained weights and learning-curve figures
os.makedirs(MODEL_DIR, exist_ok=True) # created up front so later saves cannot fail on a missing directory

SAMPLE_RATE = 22050 # target sample rate (Hz) every clip is resampled to
CLIP_DURATION = 4.0 # clip length in seconds; clips are padded / truncated to this
N_CLASSES = 10 # number of output classes of the classifier head

N_MELS = 128 # number of mel bands
FMIN = 0 # lowest mel filter frequency (Hz)
FMAX = SAMPLE_RATE // 2 # highest mel filter frequency (Hz) = Nyquist frequency

BATCH_SIZE = 128 # batch size used by all three DataLoaders
# NOTE(review): get_stage_for_epoch only leaves stage 1 after epoch 10, so with
# NUM_EPOCHS = 10 the stage-2 / stage-3 fine-tuning never runs - confirm intended.
NUM_EPOCHS = 10 # number of training epochs per fold

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)


def set_seed(seed: int = 38):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random`` module, NumPy's global RNG, PyTorch's CPU
    generator and the generators of all CUDA devices, in that order.

    Args:
        seed: Seed value handed unchanged to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


set_seed(38)

Using device: cuda


In [3]:
# Dataset construct

class UrbanSound8KWaveDataset(Dataset):
    """Waveform dataset over a subset of the UrbanSound8K folds.

    Reads ``<data_dir>/metadata/UrbanSound8K.csv``, keeps the rows whose
    ``fold`` column is in ``folds`` and serves, per item, the fixed-length,
    standardized mono waveform together with its ``classID`` label.

    Args:
        data_dir: Dataset root containing ``metadata/`` and ``audio/fold<k>/``.
        folds: Fold ids to include. A single scalar id or any iterable of ids
            (list, tuple, range, ...) is accepted.
        sample_rate: Sample rate (Hz) every clip is resampled to.
        duration: Clip length in seconds; clips are zero-padded at the end or
            truncated to ``int(sample_rate * duration)`` samples.
        waveform_augment: Optional callable applied to the ``[1, T]`` waveform
            tensor in ``__getitem__``; ``None`` disables augmentation.
    """

    def __init__(
        self,
        data_dir: str,
        folds: List[int],
        sample_rate: int = SAMPLE_RATE,
        duration: float = CLIP_DURATION,
        waveform_augment: WaveformAugment = None,
    ):
        super().__init__()
        self.data_dir = data_dir
        # A scalar is wrapped as before; any other iterable is materialised as
        # a list. Previously a tuple / range was wrapped as a single element,
        # so `isin` matched no rows and the dataset was silently empty.
        self.folds = [folds] if np.isscalar(folds) else list(folds)
        self.sample_rate = sample_rate
        self.duration = duration
        self.n_samples = int(sample_rate * duration)
        self.waveform_augment = waveform_augment

        meta_path = os.path.join(data_dir, "metadata", "UrbanSound8K.csv")
        df = pd.read_csv(meta_path)
        self.df = df[df["fold"].isin(self.folds)].reset_index(drop=True)
        self.labels = self.df["classID"].astype(int).to_numpy()

    def __len__(self):
        """Return the number of clips in the selected folds."""
        return len(self.df)

    def _load_waveform(self, index: int) -> torch.Tensor:
        """Load clip ``index`` as a standardized float32 tensor of shape ``[1, T]``.

        ``T`` equals ``self.n_samples``: shorter clips are zero-padded at the
        end, longer clips are truncated.
        """
        row = self.df.iloc[index]
        # int() keeps the directory name "fold5" even if pandas parsed the
        # column as float (which would otherwise format as "fold5.0").
        fold = int(row["fold"])
        filename = row["slice_file_name"]
        file_path = os.path.join(self.data_dir, "audio", f"fold{fold}", filename)

        # decode, down-mix to mono and resample to the target rate
        wav, _ = librosa.load(file_path, sr=self.sample_rate, mono=True)

        # pad / truncate to the fixed clip length
        if len(wav) < self.n_samples:
            wav = np.pad(wav, (0, self.n_samples - len(wav)), mode="constant")
        elif len(wav) > self.n_samples:
            wav = wav[: self.n_samples]

        # per-clip standardization; (near-)silent clips are left untouched to
        # avoid dividing by a vanishing standard deviation
        std = wav.std()
        if std > 1e-6:
            wav = (wav - wav.mean()) / std

        wav = wav.astype(np.float32)
        return torch.from_numpy(wav).unsqueeze(0)  # [1, T]

    def __getitem__(self, index: int):
        """Return ``(waveform [1, T], label)`` with the label as a ``torch.long`` scalar."""
        label = int(self.labels[index])
        wav = self._load_waveform(index)

        # data augmentation (only configured for the training set)
        if self.waveform_augment is not None:
            wav = self.waveform_augment(wav)

        return wav, torch.tensor(label, dtype=torch.long)

In [4]:
# pretrained VGG-13-BN model (pretrained on imagenet1k)

def create_pretrained_vgg13(num_classes: int = N_CLASSES) -> nn.Module:
    """Build a VGG-13-BN with ImageNet weights and a fresh output layer.

    Args:
        num_classes: Output size of the replacement final linear layer.

    Returns:
        The torchvision VGG-13-BN model whose last classifier layer has been
        swapped for a newly initialised ``nn.Linear`` with ``num_classes``
        outputs; all other layers keep their pretrained weights.
    """
    try:
        # weights-enum API
        net = models.vgg13_bn(weights=models.VGG13_BN_Weights.IMAGENET1K_V1)
    except AttributeError:
        # fall back to the `pretrained` flag when the enum is not available
        net = models.vgg13_bn(pretrained=True)

    # swap the final classifier layer for one with `num_classes` outputs
    old_head = net.classifier[-1]
    net.classifier[-1] = nn.Linear(old_head.in_features, num_classes)
    return net

In [5]:
# mel transformation (change waveform to mel spectrogram)
# Both transforms are built once at module level and moved to DEVICE, so the
# tensors passed to them must live on the same device.

mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=1024,       # FFT window size in samples
    hop_length=512,   # hop between consecutive frames in samples
    n_mels=N_MELS,
    f_min=FMIN,
    f_max=FMAX,
    power=2.0,        # exponent applied to the magnitude spectrogram
).to(DEVICE)

# converts the mel spectrogram to a decibel scale (library default settings)
db_transform = torchaudio.transforms.AmplitudeToDB().to(DEVICE)


def waveform_batch_to_mel(x_wave: torch.Tensor) -> torch.Tensor:
    """Turn a batch of waveforms into normalized 3-channel log-mel "images".

    Args:
        x_wave: Waveform batch of shape ``[B, 1, T]``, on the same device as
            the module-level ``mel_transform`` / ``db_transform``.

    Returns:
        Tensor of shape ``[B, 3, n_mels, frames]``: the dB-scaled mel
        spectrogram, standardized per sample and copied into three identical
        channels.
    """
    # drop the channel axis -> [B, T], then mel spectrogram -> dB scale
    spec_db = db_transform(mel_transform(x_wave.squeeze(1)))

    # per-sample standardization over the (n_mels, frames) axes
    per_sample_axes = (1, 2)
    mu = spec_db.mean(dim=per_sample_axes, keepdim=True)
    sigma = spec_db.std(dim=per_sample_axes, keepdim=True)
    normalized = (spec_db - mu) / (sigma + 1e-6)

    # insert a channel axis and replicate it three times -> [B, 3, n_mels, frames]
    return normalized.unsqueeze(1).repeat(1, 3, 1, 1)

In [6]:
# 3-stage Fine-tuning

def _final_conv_block_start(layers) -> int:
    """Return the index in ``layers`` where the last conv block begins.

    Blocks are taken to be delimited by ``nn.MaxPool2d`` layers: trailing
    pooling layers are skipped, then the scan walks backwards to the previous
    pooling layer. Returns 0 when no earlier pooling layer exists.
    """
    idx = len(layers) - 1
    # skip the pooling layer(s) that close the final block
    while idx >= 0 and isinstance(layers[idx], nn.MaxPool2d):
        idx -= 1
    # walk back to the pooling layer that closes the previous block
    while idx >= 0 and not isinstance(layers[idx], nn.MaxPool2d):
        idx -= 1
    return idx + 1


def set_trainable_stage(model: nn.Module, stage: int):
    """Set ``requires_grad`` on ``model`` for one of the fine-tuning stages.

    stage 1: only train classifier
    stage 2: train the final conv block and classifier
    stage 3: train the whole model

    Args:
        model: Network exposing ``features`` and ``classifier`` sub-modules.
        stage: Fine-tuning stage, one of 1, 2 or 3.

    Raises:
        ValueError: If ``stage`` is not 1, 2 or 3. The check happens before
            any flag is touched, so the model is left unchanged.
    """
    if stage not in (1, 2, 3):
        raise ValueError(f"Unknown stage: {stage}")

    # start from a fully frozen model
    for p in model.parameters():
        p.requires_grad = False

    if stage == 3:
        # unfreeze everything
        for p in model.parameters():
            p.requires_grad = True
        return

    # stages 1 and 2 both train the classifier
    for p in model.classifier.parameters():
        p.requires_grad = True

    if stage == 2:
        # Also unfreeze the whole final conv block. A fixed `[-6:]` slice
        # skipped the first conv of the block in a conv-bn-relu x2 + pool
        # layout (7 layers), leaving it frozen; locating the block by its
        # pooling boundaries includes every layer of it.
        layers = list(model.features.children())
        for layer in layers[_final_conv_block_start(layers):]:
            for p in layer.parameters():
                p.requires_grad = True


def create_optimizer_for_stage(model: nn.Module, stage: int):
    """Create the Adam optimizer used during a given fine-tuning stage.

    The learning rate shrinks as more of the network is unfrozen:
    stage 1 -> 1e-5, stage 2 -> 5e-6, stage 3 -> 1e-6.

    Args:
        model: Model whose currently trainable parameters
            (``requires_grad=True``) are handed to the optimizer.
        stage: Fine-tuning stage, one of 1, 2 or 3.

    Returns:
        Tuple ``(optimizer, lr)``.

    Raises:
        ValueError: If ``stage`` is not 1, 2 or 3.
    """
    stage_rates = ((1, 1e-5), (2, 5e-6), (3, 1e-6))
    for known_stage, stage_lr in stage_rates:
        if stage == known_stage:
            lr = stage_lr
            break
    else:
        raise ValueError(f"Unknown stage: {stage}")

    # only parameters that are currently unfrozen get optimized
    trainable = list(filter(lambda p: p.requires_grad, model.parameters()))
    # Adam with weight decay (L2 regularization)
    optimizer = torch.optim.Adam(trainable, lr=lr, weight_decay=1e-4)
    return optimizer, lr

def get_stage_for_epoch(epoch: int):
    """Map a 1-based epoch number to its fine-tuning stage.

    epochs up to 10 -> stage 1
    epochs 11-12    -> stage 2
    epochs above 12 -> stage 3
    """
    # (stage, last epoch that still belongs to that stage)
    stage_limits = ((1, 10), (2, 12))
    for stage, last_epoch in stage_limits:
        if epoch <= last_epoch:
            return stage
    return 3

In [7]:
# Functions for train, validate and plot

def train_one_epoch(model, loader, criterion, optimizer, device): # train one epoch
    """Run one optimization pass over ``loader``.

    Args:
        model: Network fed with the 3-channel mel batches.
        loader: Iterable yielding ``(waveform [B, 1, T], label [B])`` batches.
        criterion: Loss taking ``(logits, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(mean loss per sample, accuracy)`` over the whole epoch.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for waves, targets in loader:
        waves = waves.to(device)      # [B, 1, T]
        targets = targets.to(device)

        # waveform -> normalized mel "image", [B, 3, n_mels, frames]
        feats = waveform_batch_to_mel(waves)

        optimizer.zero_grad()
        outputs = model(feats)
        batch_loss = criterion(outputs, targets)
        batch_loss.backward()
        optimizer.step()

        # weight the batch loss by batch size so the epoch figure is per sample
        loss_sum += batch_loss.item() * feats.size(0)
        predicted = outputs.max(dim=1).indices
        n_correct += (predicted == targets).sum().item()
        n_seen += targets.size(0)

    return loss_sum / n_seen, n_correct / n_seen


@torch.no_grad() # no gradients are tracked while evaluating
def evaluate(model, loader, criterion, device):
    """Compute loss and accuracy of ``model`` over ``loader`` without training.

    Args:
        model: Network to evaluate; switched to eval mode.
        loader: Iterable yielding ``(waveform [B, 1, T], label [B])`` batches.
        criterion: Loss taking ``(logits, labels)``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(mean loss per sample, accuracy)`` over the whole loader.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for waves, targets in loader:
        waves = waves.to(device)
        targets = targets.to(device)

        feats = waveform_batch_to_mel(waves)
        outputs = model(feats)
        batch_loss = criterion(outputs, targets)

        # weight the batch loss by batch size so the final figure is per sample
        loss_sum += batch_loss.item() * feats.size(0)
        predicted = outputs.max(dim=1).indices
        n_correct += (predicted == targets).sum().item()
        n_seen += targets.size(0)

    return loss_sum / n_seen, n_correct / n_seen


def plot_learning_curve(history, fold_id: int, save_dir: str = MODEL_DIR): # plot
    """Save a two-panel loss / accuracy learning curve for one fold.

    Args:
        history: Dict with equally long lists under ``"train_loss"``,
            ``"val_loss"``, ``"train_acc"`` and ``"val_acc"``.
        fold_id: Fold number used in the panel titles and the file name.
        save_dir: Directory the PNG is written to.
    """
    epochs = range(1, len(history["train_loss"]) + 1)

    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(10, 4))

    # (axes, train key, train label, val key, val label, y label, title)
    panels = (
        (loss_ax, "train_loss", "Train Loss", "val_loss", "Val Loss",
         "Loss", f"b - Fold {fold_id} - Loss"),
        (acc_ax, "train_acc", "Train Acc", "val_acc", "Val Acc",
         "Accuracy", f"b - Fold {fold_id} - Accuracy"),
    )
    for ax, train_key, train_label, val_key, val_label, y_label, title in panels:
        ax.plot(epochs, history[train_key], label=train_label)
        ax.plot(epochs, history[val_key], label=val_label)
        ax.set_xlabel("Epoch")
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.legend()

    fig.tight_layout()
    save_path = os.path.join(save_dir, f"vgg13_b_fold{fold_id}_curve.png")
    fig.savefig(save_path, dpi=150)
    # close the figure so repeated calls (one per fold) do not accumulate figures
    plt.close(fig)
    print(f"Saved learning curve to: {save_path}")

In [8]:
# 10-fold Cross-Validation train
# For each fold k: hold fold k out as the test set, use the last of the
# remaining nine folds for validation and the other eight for training, train
# a fresh pretrained VGG-13-BN for NUM_EPOCHS epochs, save the learning curve,
# evaluate on the test fold and save the weights.

all_folds = list(range(1, 11))

for test_fold in all_folds:
    print("=" * 80)
    print(f"Fold {test_fold} as TEST fold")
    print("=" * 80)

    # current fold as test set, remaining 9 folds as train set and val set
    trainval_folds = [f for f in all_folds if f != test_fold]

    # last of the remaining 9 folds is the val set, the other 8 are the train set
    val_fold = trainval_folds[-1]
    train_folds = trainval_folds[:-1]

    print(f"Train folds: {train_folds}")
    print(f"Val fold:   {val_fold}")
    print(f"Test fold:  {test_fold}")

    # augmentation
    # NOTE(review): wave_aug is constructed but never passed to any dataset
    # below (all three use waveform_augment=None) - confirm it can be dropped.
    wave_aug = WaveformAugment(sample_rate=SAMPLE_RATE)

    train_dataset = UrbanSound8KWaveDataset(
        data_dir=DATA_DIR,
        folds=train_folds,
        sample_rate=SAMPLE_RATE,
        duration=CLIP_DURATION,
        waveform_augment=None, # no augmentation in this run
    )

    val_dataset = UrbanSound8KWaveDataset(
        data_dir=DATA_DIR,
        folds=[val_fold],
        sample_rate=SAMPLE_RATE,
        duration=CLIP_DURATION,
        waveform_augment=None, # evaluation data is never augmented
    )

    test_dataset = UrbanSound8KWaveDataset(
        data_dir=DATA_DIR,
        folds=[test_fold],
        sample_rate=SAMPLE_RATE,
        duration=CLIP_DURATION,
        waveform_augment=None, # evaluation data is never augmented
    )

    # only the training loader shuffles; val / test keep the metadata order
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, num_workers=2)
    val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE,
                              shuffle=False, num_workers=2)
    test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE,
                              shuffle=False, num_workers=2)

    # create a fresh pretrained VGG-13-BN for this fold
    model = create_pretrained_vgg13(num_classes=N_CLASSES).to(DEVICE)
    criterion = nn.CrossEntropyLoss()

    # per-epoch metrics, consumed by plot_learning_curve
    history = {
        "train_loss": [],
        "val_loss": [],
        "train_acc": [],
        "val_acc": [],
    }

    # stage bookkeeping: None forces the stage setup on the first epoch
    current_stage = None
    optimizer = None
    current_lr = None

    for epoch in range(1, NUM_EPOCHS + 1):
        stage = get_stage_for_epoch(epoch)
        if stage != current_stage:
            # reset requires_grad and optimizer when entering new stage
            # (a new optimizer also discards the previous stage's Adam state)
            current_stage = stage
            set_trainable_stage(model, stage)
            optimizer, current_lr = create_optimizer_for_stage(model, stage)
            print(f"\nSwitch to stage {stage} at epoch {epoch}, lr = {current_lr:g}")

        print(f"\n[Fold {test_fold}] Epoch {epoch}/{NUM_EPOCHS} (Stage {current_stage}, lr={current_lr:g})")

        train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, DEVICE) # train one epoch
        val_loss, val_acc = evaluate(model, val_loader, criterion, DEVICE) # evaluate on val set

        history["train_loss"].append(train_loss)
        history["val_loss"].append(val_loss)
        history["train_acc"].append(train_acc)
        history["val_acc"].append(val_acc)

        print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
        print(f"Val   Loss: {val_loss:.4f}, Val   Acc: {val_acc:.4f}")

    # plot learning curve
    plot_learning_curve(history, fold_id=test_fold, save_dir=MODEL_DIR)

    # test on the held-out fold after training finishes
    # (this is the last-epoch model; validation metrics are only logged, not used for selection)
    test_loss, test_acc = evaluate(model, test_loader, criterion, DEVICE)
    print(f"[Fold {test_fold}] Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")

    # store the model
    model_path = os.path.join(MODEL_DIR, f"vgg13_b_fold{test_fold}.pth") # the fold number in the name is the fold held out for testing
    torch.save(model.state_dict(), model_path)
    print(f"Saved model for fold {test_fold} to: {model_path}")

Fold 1 as TEST fold
Train folds: [2, 3, 4, 5, 6, 7, 8, 9]
Val fold:   10
Test fold:  1


Downloading: "https://download.pytorch.org/models/vgg13_bn-abd245e5.pth" to /root/.cache/torch/hub/checkpoints/vgg13_bn-abd245e5.pth
100%|██████████| 508M/508M [00:02<00:00, 255MB/s] 



Switch to stage 1 at epoch 1, lr = 1e-05

[Fold 1] Epoch 1/10 (Stage 1, lr=1e-05)
Train Loss: 2.0321, Train Acc: 0.2985
Val   Loss: 1.6469, Val   Acc: 0.5579

[Fold 1] Epoch 2/10 (Stage 1, lr=1e-05)
Train Loss: 1.4052, Train Acc: 0.5689
Val   Loss: 1.1612, Val   Acc: 0.6547

[Fold 1] Epoch 3/10 (Stage 1, lr=1e-05)
Train Loss: 1.0593, Train Acc: 0.6659
Val   Loss: 0.9862, Val   Acc: 0.7025

[Fold 1] Epoch 4/10 (Stage 1, lr=1e-05)
Train Loss: 0.8675, Train Acc: 0.7257
Val   Loss: 0.9060, Val   Acc: 0.7228

[Fold 1] Epoch 5/10 (Stage 1, lr=1e-05)
Train Loss: 0.7407, Train Acc: 0.7640
Val   Loss: 0.8465, Val   Acc: 0.7360

[Fold 1] Epoch 6/10 (Stage 1, lr=1e-05)
Train Loss: 0.6563, Train Acc: 0.7888
Val   Loss: 0.8307, Val   Acc: 0.7419

[Fold 1] Epoch 7/10 (Stage 1, lr=1e-05)
Train Loss: 0.5667, Train Acc: 0.8218
Val   Loss: 0.8186, Val   Acc: 0.7443

[Fold 1] Epoch 8/10 (Stage 1, lr=1e-05)
Train Loss: 0.5172, Train Acc: 0.8345
Val   Loss: 0.7993, Val   Acc: 0.7539

[Fold 1] Epoch 9/10 (