
# ASVspoof5 100-Speakers Plan + Selective Extraction (train/val/test)

Design in this notebook:

- `100` speakers total
- split groups: `train/val/test = 60/20/20`
- partition mix per split:
  - `train`: 15 (train partition) / 15 (dev partition) / 30 (eval partition)
  - `val`: 5 / 5 / 10
  - `test`: 5 / 5 / 10
- per speaker quota: `32 bonafide + 48 spoof`
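- spoof quota by partition (each choice yields 48 spoof utterances per speaker):
  - `train` partition (`A01-A08`): `6` per system (8 systems × 6 = 48)
  - `dev` partition (`A09-A16`): `6` per system (8 systems × 6 = 48)
  - `eval` partition (`A17-A32`): `3` per system (16 systems × 3 = 48)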

This notebook does two things:
1. Builds the selection manifest CSV used by the 100-speaker logistic notebooks
2. Selectively extracts only needed audio from tar shards


In [None]:

from pathlib import Path
import json
import tarfile
import pandas as pd
import numpy as np


def _find_project_root_from_cwd() -> Path | None:
    cwd = Path.cwd().resolve()
    for cand in [cwd, *cwd.parents]:
        if (cand / 'ASVspoof5_protocols').exists() and (cand / 'redimnet').exists():
            return cand
    return None

# Resolve the project root: prefer a checkout detected from the working
# directory, otherwise fall back to the fixed default location.
_detected_root = _find_project_root_from_cwd()
if _detected_root is None:
    PROJECT_ROOT = Path('/home/SpeakerRec/BioVoice')
else:
    PROJECT_ROOT = _detected_root

_PROTOCOL_DIR = PROJECT_ROOT / 'ASVspoof5_protocols'

# Protocol file for each source partition.
PROTOCOL_PATHS = {
    'train': _PROTOCOL_DIR / 'ASVspoof5.train.tsv',
    'dev': _PROTOCOL_DIR / 'ASVspoof5.dev.track_1.tsv',
    'eval': _PROTOCOL_DIR / 'ASVspoof5.eval.track_1.tsv',
}

# Adjust these to your local tar folders if needed.
TAR_DIRS = {
    name: PROJECT_ROOT / f'ASVspoof5_audio_{name}_tars'
    for name in ('train', 'dev', 'eval')
}

# Output locations; each directory is created as soon as it is defined.
OUT_DIR = _PROTOCOL_DIR / 'subset_100_speakers_outputs'
OUT_DIR.mkdir(parents=True, exist_ok=True)

NOTEBOOK_DIR = PROJECT_ROOT.joinpath('redimnet', 'logistic_regression', 'asvspoof5', '100_speakers')
NOTEBOOK_DIR.mkdir(parents=True, exist_ok=True)

MANIFEST_OUT = NOTEBOOK_DIR / 'asvspoof5_100_speakers_selected_utterances_plan.csv'
SPEAKER_PLAN_OUT = NOTEBOOK_DIR / 'asvspoof5_100_speakers_selected_speakers_plan.csv'
AUDIT_OUT = NOTEBOOK_DIR / 'asvspoof5_100_speakers_selection_audit.csv'
SUMMARY_OUT = NOTEBOOK_DIR / 'asvspoof5_100_speakers_plan_summary.json'

EXTRACT_DIR = PROJECT_ROOT.joinpath('data', 'datasets', 'asvspoof5_100_speakers_32_real_48_spoof')
EXTRACT_DIR.mkdir(parents=True, exist_ok=True)

# Sampling configuration.
SEED = 42
BONA_PER_SPK = 32

# Speakers per output group (outer key) drawn from each source partition
# (inner key).
GROUP_PARTITION_SPEAKERS = {
    'train': {'train': 15, 'dev': 15, 'eval': 30},
    'val': {'train': 5, 'dev': 5, 'eval': 10},
    'test': {'train': 5, 'dev': 5, 'eval': 10},
}

# Inclusive attack-id ranges per partition, expanded to 'A01'-style labels.
_SYSTEM_RANGES = {'train': (1, 8), 'dev': (9, 16), 'eval': (17, 32)}
SYSTEMS_BY_PARTITION = {
    name: [f'A{num:02d}' for num in range(first, last + 1)]
    for name, (first, last) in _SYSTEM_RANGES.items()
}

# Spoof utterances drawn per system for a speaker of the given partition.
SPOOF_QUOTA_BY_PARTITION = {'train': 6, 'dev': 6, 'eval': 3}

# Pre-create EXTRACT_DIR/<group>/<label> for every group/label combination.
for split in ['train', 'val', 'test']:
    for lbl in ['bonafide', 'spoof']:
        EXTRACT_DIR.joinpath(split, lbl).mkdir(parents=True, exist_ok=True)

# Echo the resolved configuration so path problems are visible up front.
for label, value in (
    ('PROJECT_ROOT', PROJECT_ROOT),
    ('OUT_DIR', OUT_DIR),
    ('NOTEBOOK_DIR', NOTEBOOK_DIR),
    ('MANIFEST_OUT', MANIFEST_OUT),
    ('EXTRACT_DIR', EXTRACT_DIR),
):
    print(f'{label} =', value)
for tag, table in (('PROTOCOL', PROTOCOL_PATHS), ('TAR_DIR', TAR_DIRS)):
    for part, p in table.items():
        print(f'{tag}[{part}] =', p, '| exists =', p.exists())


In [None]:

# Field names assigned, in order, to the whitespace-separated tokens of a
# protocol line.
cols = ['speaker_id','utt_id','gender','codec_id','codec_q','source_utt_id','attack_codec_id','system_id','label','unused']


def load_protocol(partition: str, path: Path) -> pd.DataFrame:
    """Read one whitespace-separated protocol file into a DataFrame.

    Args:
        partition: Value stored in the added ``partition`` column of every
            returned row.
        path: Location of the protocol file.

    Returns:
        A frame with one string column per entry of ``cols`` plus
        ``partition``. The columns are present even when the file contains
        no usable line, so downstream column access does not fail.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    import warnings

    # An explicit raise (not ``assert``) so the check survives ``python -O``.
    if not path.exists():
        raise FileNotFoundError(f'Missing protocol file: {path}')

    rows = []
    skipped = 0
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            t = line.split()
            if len(t) != len(cols):
                # Blank lines are ignored quietly; anything else is counted
                # so a wrong file format does not go unnoticed.
                if t:
                    skipped += 1
                continue
            rows.append(dict(zip(cols, t)))

    if skipped:
        warnings.warn(
            f'{path}: skipped {skipped} line(s) that do not have {len(cols)} fields',
            stacklevel=2,
        )

    # Passing ``columns`` keeps the schema stable for an empty ``rows`` list.
    df = pd.DataFrame(rows, columns=cols)
    df['partition'] = partition
    return df

# Load the three protocol partitions and stack them into a single frame.
all_parts = [load_protocol(part, PROTOCOL_PATHS[part]) for part in ['train', 'dev', 'eval']]
df = pd.concat(all_parts, ignore_index=True)

# Headline statistics for a quick plausibility check.
print('Total rows:', len(df))
print('Total unique speakers:', df['speaker_id'].nunique())
speakers_per_partition = df.groupby('partition')['speaker_id'].nunique()
print(speakers_per_partition.rename('speakers'))
print('Label counts by partition:')
label_table = df.groupby(['partition','label']).size().unstack(fill_value=0)
print(label_table)
print('Spoof systems by partition:')
spoof_rows = df[df['label'] == 'spoof']
for part in ['train', 'dev', 'eval']:
    in_partition = spoof_rows['partition'] == part
    systems = sorted(spoof_rows.loc[in_partition, 'system_id'].unique().tolist())
    print(part, systems)


In [None]:


def sample_gender_balanced(eligible_df: pd.DataFrame, n_pick: int, seed: int) -> pd.DataFrame:
    """Pick ``n_pick`` speakers, splitting as evenly as possible between F and M.

    The rows are shuffled once with ``seed``; all picks are then taken from
    the front of that shuffled order.

    Args:
        eligible_df: One row per candidate speaker, with at least the
            ``speaker_id`` and ``gender`` columns.
        n_pick: Number of speakers wanted.
        seed: Random state for the shuffle.

    Returns:
        A copy holding the chosen rows: the 'F' picks first, then the 'M'
        picks, then any top-up rows. Contains fewer than ``n_pick`` rows only
        when the input has fewer distinct speakers than requested.
    """
    shuffled = eligible_df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
    females = shuffled.loc[shuffled['gender'] == 'F'].copy()
    males = shuffled.loc[shuffled['gender'] == 'M'].copy()

    # Start from an even split; an odd remainder goes to 'M'.
    want_f = n_pick // 2
    want_m = n_pick - want_f

    # When one gender is short, shift the deficit onto the other one.
    if want_f > len(females):
        want_f = len(females)
        want_m = n_pick - want_f
    if want_m > len(males):
        want_m = len(males)
        want_f = n_pick - want_m

    chosen = pd.concat([females.head(want_f), males.head(want_m)], ignore_index=True)

    # Still short (e.g. rows whose gender is neither 'F' nor 'M'): top up
    # with not-yet-chosen speakers in shuffled order.
    shortfall = n_pick - len(chosen)
    if shortfall > 0:
        taken = set(chosen['speaker_id'])
        leftovers = shuffled[~shuffled['speaker_id'].isin(taken)]
        chosen = pd.concat([chosen, leftovers.head(shortfall)], ignore_index=True)

    deduped = chosen.drop_duplicates('speaker_id')
    return deduped.head(n_pick).copy()


# Number of speakers to draw from each protocol partition: the per-partition
# counts of GROUP_PARTITION_SPEAKERS summed over the train/val/test groups.
required_by_partition = {
    part: sum(GROUP_PARTITION_SPEAKERS[g][part] for g in ['train','val','test'])
    for part in ['train','dev','eval']
}
print('Required speakers by partition:', required_by_partition)

selected_rows = []

for part in ['train','dev','eval']:
    # Work on this partition's protocol rows only.
    dfp = df[df['partition'].eq(part)].copy()
    systems = SYSTEMS_BY_PARTITION[part]
    spoof_q = SPOOF_QUOTA_BY_PARTITION[part]

    # Per-speaker availability: bonafide count and spoof count per system.
    bona_counts = dfp[dfp['label'].eq('bonafide')].groupby('speaker_id').size().rename('bonafide_n')
    spoof_piv = (
        dfp[dfp['label'].eq('spoof')]
        .groupby(['speaker_id','system_id']).size()
        .unstack(fill_value=0)
    )
    # Make sure every expected system has a column, then keep only those
    # systems, in the configured order.
    for s in systems:
        if s not in spoof_piv.columns:
            spoof_piv[s] = 0
    spoof_piv = spoof_piv[systems]

    # The gender of a speaker is taken from the first of that speaker's rows.
    gender_ser = dfp.groupby('speaker_id')['gender'].agg(lambda x: x.iloc[0]).rename('gender')
    # Speakers missing from one of the tables get 0 for the missing counts.
    summary = pd.concat([bona_counts, spoof_piv, gender_ser], axis=1).fillna(0)
    summary['n_systems'] = (summary[systems] > 0).sum(axis=1)
    summary['min_system_n'] = summary[systems].min(axis=1)

    # Eligible = enough bonafide utterances AND every system of the partition
    # is present with at least the per-system spoof quota.
    eligible = summary[
        (summary['bonafide_n'] >= BONA_PER_SPK) &
        (summary['n_systems'] == len(systems)) &
        (summary['min_system_n'] >= spoof_q)
    ].reset_index()

    need = required_by_partition[part]
    print(f'[{part}] eligible speakers:', len(eligible), '| need:', need)
    assert len(eligible) >= need, f'Not enough eligible speakers in {part}'

    # A distinct seed offset per partition keeps the three draws independent.
    picked = sample_gender_balanced(eligible, need, seed=SEED + {'train': 11, 'dev': 22, 'eval': 33}[part]).copy()
    picked['partition'] = part

    # assign split groups inside each partition according to plan
    # (reshuffle first, because sample_gender_balanced returns the 'F' picks
    # ahead of the 'M' picks; then cut consecutive slices of the planned sizes)
    picked = picked.sample(frac=1.0, random_state=SEED + {'train': 101, 'dev': 202, 'eval': 303}[part]).reset_index(drop=True)
    start = 0
    chunks = []
    for g in ['train','val','test']:
        k = GROUP_PARTITION_SPEAKERS[g][part]
        chunk = picked.iloc[start:start+k].copy()
        chunk['group'] = g
        chunks.append(chunk)
        start += k
    part_sel = pd.concat(chunks, ignore_index=True)
    selected_rows.append(part_sel)

# One row per selected speaker, reduced to the planning columns and sorted
# for a stable on-disk order.
sel = pd.concat(selected_rows, ignore_index=True)
sel = sel[['group','partition','speaker_id','gender','bonafide_n','min_system_n','n_systems']]
sel = sel.sort_values(['group','partition','speaker_id']).reset_index(drop=True)

# Summary tables of the resulting speaker plan.
print('Selected speakers total:', sel['speaker_id'].nunique())
print('By group:', sel['group'].value_counts().sort_index().to_dict())
print('By partition:', sel['partition'].value_counts().sort_index().to_dict())
print('By group+partition:')
print(sel.groupby(['group','partition']).size().unstack(fill_value=0))
print('Gender by group:')
print(sel.groupby(['group','gender']).size().unstack(fill_value=0))

display(sel.head(20))


In [None]:

# Build the utterance-level manifest: for every selected speaker draw the
# bonafide quota plus the per-system spoof quota, and record one audit row
# per (speaker, bonafide/system) draw.
selected_rows = []
audit_rows = []

for idx, r in sel.reset_index(drop=True).iterrows():
    split_group = r['group']
    part = r['partition']
    spk = r['speaker_id']
    systems = SYSTEMS_BY_PARTITION[part]
    spoof_q = SPOOF_QUOTA_BY_PARTITION[part]

    pool_spk = df[(df['partition'].eq(part)) & (df['speaker_id'].eq(spk))].copy()

    # Bonafide draw; the seed depends on the speaker's row position in `sel`.
    bona_pool = pool_spk[pool_spk['label'].eq('bonafide')].copy()
    bona_pick = bona_pool.sample(n=BONA_PER_SPK, random_state=SEED + 5000 + idx)
    bona_pick = bona_pick.copy()
    bona_pick['group'] = split_group
    bona_pick['selected_reason'] = 'bonafide_quota'
    selected_rows.append(bona_pick)
    audit_rows.append({
        'group': split_group,
        'partition': part,
        'speaker_id': spk,
        'label': 'bonafide',
        'system_id': 'bonafide',
        'target_n': BONA_PER_SPK,
        'selected_n': len(bona_pick),
        'availability_n': len(bona_pool),
    })

    # Spoof draws, one per system; the seed depends on speaker and system.
    for j, sysid in enumerate(systems):
        pool = pool_spk[(pool_spk['label'].eq('spoof')) & (pool_spk['system_id'].eq(sysid))].copy()
        pick = pool.sample(n=spoof_q, random_state=SEED + 7000 + idx * 100 + j)
        pick = pick.copy()
        pick['group'] = split_group
        pick['selected_reason'] = 'spoof_system_quota'
        selected_rows.append(pick)
        audit_rows.append({
            'group': split_group,
            'partition': part,
            'speaker_id': spk,
            'label': 'spoof',
            'system_id': sysid,
            'target_n': spoof_q,
            'selected_n': len(pick),
            'availability_n': len(pool),
        })

manifest = pd.concat(selected_rows, ignore_index=True)
manifest = manifest[['group','partition','speaker_id','utt_id','gender','label','system_id','codec_id','codec_q','source_utt_id','attack_codec_id','selected_reason']]
manifest = manifest.sort_values(['group','partition','speaker_id','label','system_id','utt_id']).reset_index(drop=True)

audit_df = pd.DataFrame(audit_rows)

# Sanity checks. The expected totals are derived from the configuration
# rather than hard-coded, so they stay valid when quotas or speaker counts
# are changed.
expected_speakers = sum(sum(per_part.values()) for per_part in GROUP_PARTITION_SPEAKERS.values())
spoof_per_spk = {
    part_name: len(SYSTEMS_BY_PARTITION[part_name]) * SPOOF_QUOTA_BY_PARTITION[part_name]
    for part_name in SYSTEMS_BY_PARTITION
}
expected_rows = sum(BONA_PER_SPK + spoof_per_spk[part_name] for part_name in sel['partition'])

assert manifest['speaker_id'].nunique() == expected_speakers
assert len(manifest) == expected_rows
assert manifest['utt_id'].nunique() == len(manifest)

per_spk = manifest.groupby(['group','partition','speaker_id','label']).size().unstack(fill_value=0)
# The spoof total per speaker depends on the speaker's source partition.
expected_spoof = per_spk.index.get_level_values('partition').map(spoof_per_spk)
assert (per_spk['bonafide'] == BONA_PER_SPK).all()
assert (per_spk['spoof'].to_numpy() == expected_spoof.to_numpy()).all()
assert (audit_df['selected_n'] == audit_df['target_n']).all()

manifest.to_csv(MANIFEST_OUT, index=False)
sel.to_csv(SPEAKER_PLAN_OUT, index=False)
audit_df.to_csv(AUDIT_OUT, index=False)

summary = {
    'seed': SEED,
    'design': {
        'n_speakers_total': int(expected_speakers),
        'group_partition_speakers': GROUP_PARTITION_SPEAKERS,
        'bonafide_per_speaker': BONA_PER_SPK,
        'spoof_quota_by_partition': SPOOF_QUOTA_BY_PARTITION,
        'systems_by_partition': SYSTEMS_BY_PARTITION,
    },
    'selected_speakers': int(sel['speaker_id'].nunique()),
    'selected_utterances_total': int(len(manifest)),
    'class_counts': {k:int(v) for k,v in manifest['label'].value_counts().sort_index().to_dict().items()},
    'group_counts': {k:int(v) for k,v in manifest['group'].value_counts().sort_index().to_dict().items()},
    'group_partition_counts': {
        g: {p: int(v) for p, v in sel[sel['group'].eq(g)]['partition'].value_counts().sort_index().to_dict().items()}
        for g in ['train','val','test']
    },
    'spoof_counts_by_system': {k:int(v) for k,v in manifest[manifest['label'].eq('spoof')]['system_id'].value_counts().sort_index().to_dict().items()},
}
SUMMARY_OUT.write_text(json.dumps(summary, indent=2), encoding='utf-8')

# Convenience copies in OUT_DIR
manifest.to_csv(OUT_DIR / 'asvspoof5_100_speakers_selected_utterances_plan.csv', index=False)
sel.to_csv(OUT_DIR / 'asvspoof5_100_speakers_selected_speakers_plan.csv', index=False)
audit_df.to_csv(OUT_DIR / 'asvspoof5_100_speakers_selection_audit.csv', index=False)
(OUT_DIR / 'asvspoof5_100_speakers_plan_summary.json').write_text(json.dumps(summary, indent=2), encoding='utf-8')

print('Saved:', MANIFEST_OUT)
print('Saved:', SPEAKER_PLAN_OUT)
print('Saved:', AUDIT_OUT)
print('Saved:', SUMMARY_OUT)
print('Rows:', len(manifest), '| Speakers:', manifest['speaker_id'].nunique())
print('By group:', manifest['group'].value_counts().sort_index().to_dict())
print('By class:', manifest['label'].value_counts().sort_index().to_dict())


In [None]:

# List the tar shards available for each partition.
# Fails when a configured directory is missing or holds no 'flac_*.tar' file.

partition_tars = {}
for part, tar_dir in TAR_DIRS.items():
    assert tar_dir.exists(), f'Missing TAR_DIR[{part}]: {tar_dir}'
    shards = sorted(tar_dir.glob('flac_*.tar'))
    partition_tars[part] = shards
    print(f'[{part}] tar files:', len(shards))
    # Show at most the first ten shard names.
    for shard in shards[:10]:
        print(' -', shard.name)
    assert shards, f'No tar files found for partition {part} in {tar_dir}'


In [None]:

# Build tar member index for selected utt_ids
manifest = pd.read_csv(MANIFEST_OUT)
needed_by_part = {
    part: set(manifest[manifest['partition'].eq(part)]['utt_id'].astype(str).tolist())
    for part in ['train','dev','eval']
}
print({k: len(v) for k, v in needed_by_part.items()})

index_columns = ['partition', 'utt_id', 'tar_file', 'member_name', 'member_size']
index_rows = []
found_by_part = {k: set() for k in ['train','dev','eval']}

# Scan every shard of every partition and record the members whose file stem
# is one of the selected utt_ids of that partition.
for part in ['train','dev','eval']:
    needed = needed_by_part[part]
    for tar_path in partition_tars[part]:
        print(f'Indexing [{part}]', tar_path.name)
        with tarfile.open(tar_path, 'r') as tf:
            for m in tf.getmembers():
                if not m.isfile():
                    continue
                stem = Path(Path(m.name).name).stem
                if stem in needed:
                    index_rows.append({
                        'partition': part,
                        'utt_id': stem,
                        'tar_file': str(tar_path),
                        'member_name': m.name,
                        'member_size': int(m.size),
                    })
                    found_by_part[part].add(stem)
        print(f'  found so far [{part}]:', len(found_by_part[part]), '/', len(needed))

# Explicit columns keep the frame well-formed even when nothing matched, so
# the drop()/merge() below still work and the "unmatched" report plus the
# final assertion explain the problem instead of an unrelated KeyError.
index_df = pd.DataFrame(index_rows, columns=index_columns)
idx_counts = index_df.groupby('utt_id').size().reset_index(name='n_matches') if not index_df.empty else pd.DataFrame(columns=['utt_id','n_matches'])
# utt_ids found more than once are reported; the first hit in sorted order
# is the one kept for extraction.
ambiguous = idx_counts[idx_counts['n_matches'] > 1].copy()
if not index_df.empty:
    index_df = index_df.sort_values(['partition','utt_id','tar_file','member_name']).drop_duplicates('utt_id', keep='first').reset_index(drop=True)

mapped = manifest.merge(index_df.drop(columns=['partition']), on='utt_id', how='left')
unmatched = mapped[mapped['tar_file'].isna()].copy()

index_csv = OUT_DIR / 'asvspoof5_100_speakers_tar_member_index.csv'
mapped_csv = OUT_DIR / 'asvspoof5_100_speakers_manifest_with_tar_paths.csv'
unmatched_csv = OUT_DIR / 'asvspoof5_100_speakers_unmatched_utts.csv'
ambiguous_csv = OUT_DIR / 'asvspoof5_100_speakers_ambiguous_utts.csv'

index_df.to_csv(index_csv, index=False)
mapped.to_csv(mapped_csv, index=False)
unmatched.to_csv(unmatched_csv, index=False)
ambiguous.to_csv(ambiguous_csv, index=False)

print('Mapped rows:', mapped['tar_file'].notna().sum(), '/', len(mapped))
print('Unmatched rows:', len(unmatched))
print('Ambiguous utt_ids:', len(ambiguous))
assert len(unmatched) == 0, 'Some selected utt_ids were not found in the provided tar files.'


In [None]:

# Selective extraction to EXTRACT_DIR/{group}/{label}/{utt_id}.flac
mapped = pd.read_csv(OUT_DIR / 'asvspoof5_100_speakers_manifest_with_tar_paths.csv')
assert mapped['tar_file'].notna().all(), 'Run tar indexing cell first and resolve unmatched rows.'

for tar_file, g in mapped.groupby('tar_file'):
    tar_path = Path(tar_file)
    print('Extracting from', tar_path.name, '| rows =', len(g))
    lookup = {row.member_name: row for row in g.itertuples(index=False)}
    # Member names of this shard that have not been encountered yet.
    pending = set(lookup)
    with tarfile.open(tar_path, 'r') as tf:
        # Iterate lazily so the scan can stop once every needed member has
        # been seen, instead of always reading the whole archive.
        for m in tf:
            if m.name not in pending:
                continue
            pending.discard(m.name)
            row = lookup[m.name]
            ext = Path(m.name).suffix or '.flac'
            out_dir = EXTRACT_DIR / row.group / row.label
            out_dir.mkdir(parents=True, exist_ok=True)
            out_path = out_dir / f'{row.utt_id}{ext}'
            if not out_path.exists():
                fobj = tf.extractfile(m)
                assert fobj is not None
                # Write to a temporary name and rename afterwards: an
                # interrupted run then never leaves a truncated file that a
                # later run would skip because it "already exists".
                tmp_path = out_path.with_name(out_path.name + '.part')
                with fobj, open(tmp_path, 'wb') as w:
                    w.write(fobj.read())
                tmp_path.replace(out_path)
            if not pending:
                break
    if pending:
        # The index lists members this shard no longer provides.
        print('  WARNING: members listed in the index but not found in', tar_path.name, ':', len(pending))

print('Extraction complete.')
print('Extracted flac count:', len(list(EXTRACT_DIR.rglob('*.flac'))))
print('Extracted wav count:', len(list(EXTRACT_DIR.rglob('*.wav'))))


In [None]:

# Optional: write manifest with resolved local paths
manifest = pd.read_csv(OUT_DIR / 'asvspoof5_100_speakers_manifest_with_tar_paths.csv')
# Use the same extension rule as the extraction cell: the member's own
# suffix, or '.flac' when the member name is missing or has no suffix.
# Otherwise the recorded path would not match the file written to disk.
ext_guess = manifest['member_name'].apply(lambda s: (Path(str(s)).suffix if pd.notna(s) else '') or '.flac')
manifest['local_audio_path'] = [str(EXTRACT_DIR / g / lbl / f'{u}{e}') for g,lbl,u,e in zip(manifest['group'], manifest['label'], manifest['utt_id'], ext_guess)]
# Binary target: 1 for spoof, 0 for bonafide.
manifest['label_binary_spoof_1'] = (manifest['label'] == 'spoof').astype(int)

out_csv = OUT_DIR / 'asvspoof5_100_speakers_subset_manifest_with_local_paths.csv'
manifest.to_csv(out_csv, index=False)
print('Saved:', out_csv)
display(manifest.head(10))
