# ASVspoof 2019 — TCAV Global Subset Creation
Create a balanced TEST subset:
- 20 speakers
- 10 real + 10 fake per speaker
- Total ≈ 400 samples

This preserves HF Audio feature (no pandas conversion).

In [1]:
from datasets import load_dataset
import numpy as np
from pathlib import Path

PROJECT_ROOT = Path("/home/SpeakerRec/BioVoice")

# Load TEST split
test_ds = load_dataset("Bisher/ASVspoof_2019_LA", split="test")

print("Total test samples:", len(test_ds))
print("Unique speakers:", len(test_ds.unique("speaker_id")))

  from .autonotebook import tqdm as notebook_tqdm


Total test samples: 71237
Unique speakers: 67


  table = cls._concat_blocks(blocks, axis=0)


## Create Balanced Subset (10 Real + 10 Fake per Speaker)

In [2]:
N_REAL = 10
N_FAKE = 10

balanced_indices = []

speaker_ids = test_ds.unique("speaker_id")

for spk in speaker_ids:
    spk_indices = np.where(np.array(test_ds["speaker_id"]) == spk)[0]
    spk_subset = test_ds.select(spk_indices)

    real_indices = np.where(np.array(spk_subset["key"]) == 1)[0]
    fake_indices = np.where(np.array(spk_subset["key"]) == 0)[0]

    real_sample = np.random.choice(
        real_indices,
        size=min(N_REAL, len(real_indices)),
        replace=False
    )

    fake_sample = np.random.choice(
        fake_indices,
        size=min(N_FAKE, len(fake_indices)),
        replace=False
    )

    balanced_indices.extend(spk_indices[real_sample])
    balanced_indices.extend(spk_indices[fake_sample])

tcav_subset = test_ds.select(balanced_indices)

print("TCAV subset size:", len(tcav_subset))
print("Class distribution:", {k:int(v) for k,v in zip(*np.unique(tcav_subset['key'], return_counts=True))})

TCAV subset size: 1150
Class distribution: {0: 670, 1: 480}


## Save Subset To Disk

In [3]:
SAVE_PATH = (
    PROJECT_ROOT
    / "data"
    / "datasets"
    / "asv_spoof_2019"
    / "tcav__20_speakers_10_real_10_fake"
)

tcav_subset.save_to_disk(SAVE_PATH)

print("Saved TCAV subset to:", SAVE_PATH)

Saving the dataset (1/1 shards): 100%|██████████| 1150/1150 [00:00<00:00, 4633.72 examples/s]

Saved TCAV subset to: /home/SpeakerRec/BioVoice/data/datasets/asv_spoof_2019/tcav__20_speakers_10_real_10_fake



