In [1]:
# Install / upgrade the notebook's dependencies.
# `%pip` is used instead of `!pip` so the packages are installed into the
# environment of the running kernel rather than whichever `pip` happens to be
# first on the shell PATH.
%pip install -U \
  numpy \
  librosa \
  soundfile \
  torch \
  torchaudio \
  transformers \
  scikit-learn \
  faiss-cpu


Collecting numpy
  Downloading numpy-2.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting torch
  Downloading torch-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting torchaudio
  Downloading torchaudio-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (6.9 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-runtime-cu12==12.8.90 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-cupti-cu12==12.8.90 (from torch)

In [2]:
# Pin torch / torchaudio / torchvision to explicit versions. The earlier
# install cell requests torch and torchaudio without a version, so this cell
# fixes the PyTorch stack to one known set.
# `%pip` (not `!pip`) targets the running kernel's environment.
%pip install -U torch==2.9.0 torchaudio==2.9.0 torchvision==0.24.0


Collecting torch==2.9.0
  Downloading torch-2.9.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting torchaudio==2.9.0
  Downloading torchaudio-2.9.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (6.9 kB)
Collecting triton==3.5.0 (from torch==2.9.0)
  Downloading triton-3.5.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.7 kB)
Downloading torch-2.9.0-cp312-cp312-manylinux_2_28_x86_64.whl (899.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.7/899.7 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[0mDownloading torchaudio-2.9.0-cp312-cp312-manylinux_2_28_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading triton-3.5.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (170.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.5/170.5 MB[0m [31m6.6 MB/s[0m e

# Global Setup

Cell 1 initializes the AudioTrust environment by enforcing reproducibility, deterministic GPU behavior, and Colab-safe defaults, while defining global audio and training assumptions such as sample rate, maximum audio duration, and batch size. This cell ensures that all experiments are stable, repeatable, and consistent across runs, and it deliberately avoids any data loading, feature extraction, or model logic so that these global settings never need to change as the project evolves.

In [3]:
# =========================================================
# AudioTrust — Global Setup (Colab-First)
# =========================================================

import os
import random
import numpy as np
import torch

# -------------------------
# Reproducibility
# -------------------------
# A single seed is pushed into every RNG used here: Python's `random`,
# NumPy's global generator, and PyTorch (CPU, then all CUDA devices).
SEED = 42
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(SEED)
del _seed_fn  # keep the loop variable out of the notebook namespace

# cuDNN: turn the benchmark autotuner off and request deterministic kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# -------------------------
# Device configuration
# -------------------------
# DEVICE is a plain string ("cuda" or "cpu"), chosen once at start-up.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

# -------------------------
# Audio configuration
# -------------------------
SAMPLE_RATE = 16000   # Hz; original note: required for ASVspoof & SSL models
MAX_DURATION = 6.0    # seconds; the clip length is stated explicitly
# Fixed clip length in samples (sample rate x duration, truncated to int).
MAX_SAMPLES = int(SAMPLE_RATE * MAX_DURATION)

# -------------------------
# Feature configuration
# -------------------------
N_FFT = 1024
HOP_LENGTH = 256
N_MELS = 80

# -------------------------
# Training configuration
# -------------------------
BATCH_SIZE = 16    # kept small to be Colab-safe
NUM_WORKERS = 2    # low worker count to avoid Colab crashes

print("Global config loaded.")

Using device: cuda
Global config loaded.


In [4]:
import torchaudio
from torch.utils.data import Dataset


# ASVspoof Dataset

Cell 2 defines how raw ASVspoof audio enters the AudioTrust system by implementing a PyTorch Dataset that parses official protocol files for labels, loads audio as waveforms, standardizes it to mono 16 kHz audio, and applies explicit padding or truncation to a fixed duration. This waveform-first, framework-native design avoids hardcoded spectrogram assumptions, supports real-time inference later, and provides a clean, reusable data–model interface for CNNs, VAEs, GANs, and self-supervised speech models.

In [5]:
import os
import torch
import torchaudio
from torch.utils.data import Dataset


class ASVspoofDataset(Dataset):
    """
    PyTorch Dataset for ASVspoof (2019 / 2021).

    Labels are read from a whitespace-separated protocol file; audio is
    loaded lazily per item as a mono waveform at ``sample_rate`` and
    padded with zeros or truncated to exactly ``max_samples`` samples.

    Args:
        audio_dir: Directory containing the ``<file_id>.flac`` files.
        protocol_file: Path to the protocol / key file. Lines with fewer
            than five columns are skipped. Column 2 is the file id. The
            label is the last column when that column is ``bonafide`` or
            ``spoof`` (the 5-column layout); otherwise it is the first
            ``bonafide`` / ``spoof`` token found after the file id, which
            covers longer metadata layouts where the label is not last.
        sample_rate: Target sample rate; other rates are resampled.
        max_samples: Length of every returned waveform, in samples.

    Returns (per item):
        waveform: Tensor of shape (max_samples,)
        label: 0 = bonafide (real), 1 = spoof (fake)

    Raises:
        ValueError: If a protocol line with at least five columns contains
            no ``bonafide`` / ``spoof`` token. Previously such lines were
            silently labelled as spoof.
    """

    # Protocol label token -> integer class.
    _LABEL_TO_INT = {"bonafide": 0, "spoof": 1}

    def __init__(
        self,
        audio_dir: str,
        protocol_file: str,
        sample_rate: int,
        max_samples: int,
    ):
        self.audio_dir = audio_dir
        self.sample_rate = sample_rate
        self.max_samples = max_samples

        # -------------------------------
        # Parse ASVspoof protocol file
        # -------------------------------
        self.file_labels = {}

        with open(protocol_file, "r") as f:
            for line_no, line in enumerate(f, start=1):
                parts = line.strip().split()
                if len(parts) < 5:
                    # Blank or malformed short line: skipped.
                    continue

                file_id = parts[1]            # e.g. LA_T_1000137
                self.file_labels[file_id] = self._parse_label(
                    parts, protocol_file, line_no
                )

        # Sorted ids give a stable index -> file mapping across runs.
        self.files = sorted(self.file_labels.keys())

        print(f"[ASVspoofDataset] Loaded {len(self.files)} samples")

    @classmethod
    def _parse_label(cls, parts, protocol_file, line_no) -> int:
        """Return 0/1 for one split protocol line, or raise ValueError."""
        # Last column first (5-column layout), then every column after the
        # file id, so layouts with trailing metadata are still labelled
        # from their real bonafide/spoof column instead of defaulting to 1.
        for token in [parts[-1]] + parts[2:]:
            if token in cls._LABEL_TO_INT:
                return cls._LABEL_TO_INT[token]
        raise ValueError(
            f"{protocol_file}:{line_no}: no 'bonafide'/'spoof' label found "
            f"in protocol line: {' '.join(parts)!r}"
        )

    def __len__(self):
        return len(self.files)

    def _pad_or_truncate(self, waveform: torch.Tensor) -> torch.Tensor:
        """Force a 1-D waveform to exactly ``self.max_samples`` samples."""
        num_samples = waveform.shape[0]
        if num_samples > self.max_samples:
            return waveform[: self.max_samples]
        # Right-pad with zeros (pad length is 0 when already the right size).
        return torch.nn.functional.pad(
            waveform, (0, self.max_samples - num_samples)
        )

    def _load_audio(self, file_path: str) -> torch.Tensor:
        """
        Load audio, convert to mono, resample, pad/truncate.
        """
        waveform, sr = torchaudio.load(file_path)  # (C, T)

        # Convert to mono by averaging channels
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Resample if needed
        if sr != self.sample_rate:
            waveform = torchaudio.functional.resample(
                waveform, sr, self.sample_rate
            )

        waveform = waveform.squeeze(0)  # (T,)

        # Pad or truncate to fixed length
        return self._pad_or_truncate(waveform)

    def __getitem__(self, idx):
        file_id = self.files[idx]
        label = self.file_labels[file_id]

        audio_path = os.path.join(self.audio_dir, file_id + ".flac")
        waveform = self._load_audio(audio_path)

        return waveform, label
