
# 1. Prepare TCAV CSV (Long -> Wide)

This notebook reshapes the TCAV results CSV from long to wide format so it can be used as a feature table for machine learning.

Input (long format): one row per `(sample, concept)`.
Output (wide format): one row per `sample`, with concept columns.

This notebook is self-contained (does not import your `.py` script).


In [2]:
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd

# --- Configuration -----------------------------------------------------------
# Project root. Defaults to the path this notebook was written against; set the
# BIOVOICE_ROOT environment variable to run it from another checkout without
# editing this cell. An empty variable falls back to the default.
PROJECT_ROOT = Path(os.environ.get('BIOVOICE_ROOT') or '/home/SpeakerRec/BioVoice')
INPUT_CSV = PROJECT_ROOT / 'data' / 'tcav' / 'tcav_ASVspoof_stage4_spoofwrapper.csv'
OUT_DIR = PROJECT_ROOT / 'data' / 'tcav' / 'prepared_csvs'
# Created eagerly so later cells can write into it.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Metric column(s) of the long CSV that are pivoted into per-concept feature columns.
# Alternatives: ['positive percentage'] or ['positive percentage', 'magnitude'].
FEATURE_METRICS = [
    "magnitude"
]
# Minimum mean CAV accuracy a concept needs in order to be kept.
# None disables the filter (example threshold: 0.5).
MIN_CAV_ACC = None

print('PROJECT_ROOT =', PROJECT_ROOT)
print('INPUT_CSV =', INPUT_CSV)
print('OUT_DIR =', OUT_DIR)

PROJECT_ROOT = /home/SpeakerRec/BioVoice
INPUT_CSV = /home/SpeakerRec/BioVoice/data/tcav/tcav_ASVspoof_stage4_spoofwrapper.csv
OUT_DIR = /home/SpeakerRec/BioVoice/data/tcav/prepared_csvs


In [3]:

# Read the long-format TCAV table and verify its schema before any reshaping.
assert INPUT_CSV.exists(), f'Missing CSV: {INPUT_CSV}'
df = pd.read_csv(INPUT_CSV)

# Columns that must be present ('cav acc' is optional and handled separately below).
required_cols = {
    'idx',
    'speaker_id',
    'system_id',
    'key',
    'true label',
    'layer_key',
    'concept name',
    'positive percentage',
    'magnitude',
}
missing = sorted(required_cols.difference(df.columns))
assert not missing, f'Missing columns: {missing}'

# Normalise dtypes. Entries that cannot be parsed as numbers become NaN / <NA>
# (errors='coerce') instead of raising.
for col in ('positive percentage', 'magnitude', 'cav acc'):
    if col not in df.columns:
        continue
    df[col] = pd.to_numeric(df[col], errors='coerce')
for col in ('idx', 'speaker_id'):
    df[col] = df[col].astype(str)
for col in ('true label', 'key'):
    # Nullable integer dtype so coerced-away values can be represented as <NA>.
    df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')

# Quick sanity summary of what was loaded.
rows_per_sample = df.groupby('idx').size()
for label, value in [
    ('Rows:', len(df)),
    ('Unique samples (idx):', df['idx'].nunique()),
    ('Unique speakers:', df['speaker_id'].nunique()),
    ('Unique concepts:', df['concept name'].nunique()),
]:
    print(label, value)
print('Rows per sample (should be concepts count):')
print(rows_per_sample.value_counts().sort_index())
print('Label mapping reminder: key=1 real, key=0 fake; true label=0 real, true label=1 fake')


Rows: 16100
Unique samples (idx): 1150
Unique speakers: 67
Unique concepts: 14
Rows per sample (should be concepts count):
14    1150
dtype: int64
Label mapping reminder: key=1 real, key=0 fake; true label=0 real, true label=1 fake


In [4]:

# Optional concept-level CAV accuracy summary and filtering.
#
# When the CSV has a 'cav acc' column, build a table of mean CAV accuracy per
# (layer, concept) and display it. When MIN_CAV_ACC is set, drop the rows of every
# (layer_key, concept name) pair whose mean accuracy is below the threshold.
#
# NOTE: the filter rebinds `df` to the filtered frame. To relax MIN_CAV_ACC again,
# re-run the load cell first so the removed rows are restored.
concept_acc_df = None
if 'cav acc' in df.columns:
    # Group only by identifying columns that are actually present: 'layer name' is
    # not among the validated required columns, so it may be absent from the CSV.
    acc_group_cols = [c for c in ['layer_key', 'concept name', 'layer name'] if c in df.columns]
    concept_acc_df = (
        df.groupby(acc_group_cols, dropna=False)['cav acc']
        .mean()
        .reset_index()
        .rename(columns={'cav acc': 'mean_cav_acc'})
    )
    display(concept_acc_df.sort_values('mean_cav_acc', ascending=False))

if MIN_CAV_ACC is not None:
    if concept_acc_df is None:
        # Make the no-op visible instead of silently ignoring the requested threshold.
        print("WARNING: MIN_CAV_ACC is set but the CSV has no 'cav acc' column; no concepts were filtered.")
    else:
        passing = concept_acc_df.loc[
            concept_acc_df['mean_cav_acc'] >= MIN_CAV_ACC, ['layer_key', 'concept name']
        ].astype(str)
        # Accuracy is computed per (layer, concept), so filter on that pair: a concept
        # that passes in one layer must not keep its rows from a layer where it fails.
        keep_pairs = set(zip(passing['layer_key'], passing['concept name']))
        keep_concepts = {concept for _, concept in keep_pairs}
        row_pairs = zip(df['layer_key'].astype(str), df['concept name'].astype(str))
        keep_mask = pd.Series(
            [pair in keep_pairs for pair in row_pairs], index=df.index, dtype=bool
        )
        df = df[keep_mask].copy()
        print('Rows after MIN_CAV_ACC filter:', len(df))
        print('Remaining concepts:', df['concept name'].nunique())


Unnamed: 0,layer_key,concept name,layer name,mean_cav_acc
10,stage4,short_dropping_steep_thick,redim.backbone.stage4.2,0.480769
1,stage4,long_constant_thick_Vibrato,redim.backbone.stage4.2,0.442308
8,stage4,long_rising_steep_thin,redim.backbone.stage4.2,0.423077
13,stage4,short_rising_steep_thin,redim.backbone.stage4.2,0.423077
3,stage4,long_dropping_flat_thick_Vibrato,redim.backbone.stage4.2,0.403846
4,stage4,long_dropping_steep_thick,redim.backbone.stage4.2,0.403846
0,stage4,long_constant_thick,redim.backbone.stage4.2,0.384615
6,stage4,long_rising_flat_thick,redim.backbone.stage4.2,0.384615
12,stage4,short_rising_steep_thick,redim.backbone.stage4.2,0.384615
11,stage4,short_dropping_steep_thin,redim.backbone.stage4.2,0.365385


In [5]:

# Long -> wide (one row per sample)
#
# Builds `df_wide`: the per-sample metadata columns plus one column per
# (metric, concept) named '<metric>__<concept name>'. `feature_cols` lists the
# generated feature column names in the order they were added.
meta_cols = ['idx', 'speaker_id', 'system_id', 'key', 'true label']

# The pivot below uses aggfunc='first', which would silently keep only the first
# value if a sample had several rows for the same concept (for example when the
# CSV contains more than one layer). Fail loudly instead of producing features
# that mix sources.
dup_rows = df.duplicated(subset=['idx', 'concept name'], keep=False)
assert not dup_rows.any(), (
    f"{int(dup_rows.sum())} rows share an (idx, concept name) pair "
    f"(layer_key values: {sorted(df['layer_key'].astype(str).unique())}); "
    "pivoting would keep only the first value per pair. "
    "Deduplicate or select a single layer first."
)

# Likewise, drop_duplicates(subset=['idx']) keeps the first metadata row per sample,
# which would hide samples whose metadata differs between rows.
meta_per_idx = df[meta_cols].drop_duplicates()
conflicting_idx = sorted(
    meta_per_idx.loc[meta_per_idx['idx'].duplicated(keep=False), 'idx'].unique()
)
assert not conflicting_idx, (
    f'Conflicting metadata for {len(conflicting_idx)} idx values (first 5): {conflicting_idx[:5]}'
)

# 'idx' is cast to str when the CSV is loaded, so this sort is lexicographic
# ('10' sorts before '2'); the row order is kept as-is for downstream consumers.
df_meta = df[meta_cols].drop_duplicates(subset=['idx']).sort_values('idx').reset_index(drop=True)

df_wide = df_meta.copy()
feature_cols = []
for metric in FEATURE_METRICS:
    assert metric in df.columns, f'FEATURE_METRICS entry not found in CSV columns: {metric!r}'
    pivot = df.pivot_table(index='idx', columns='concept name', values=metric, aggfunc='first')
    pivot = pivot.sort_index(axis=1)
    pivot.columns = [f'{metric}__{c}' for c in pivot.columns]
    pivot = pivot.reset_index()
    feature_cols.extend([c for c in pivot.columns if c != 'idx'])
    df_wide = df_wide.merge(pivot, on='idx', how='inner')

# The inner merge drops any sample that has no row in a metric's pivot
# (e.g. every value of that metric is missing for the sample); report it.
n_dropped = len(df_meta) - len(df_wide)
if n_dropped:
    print(f'WARNING: {n_dropped} samples were dropped because a metric pivot had no row for them.')

# Missing feature values end up in the saved CSV; make them visible here.
n_missing = int(df_wide[feature_cols].isna().sum().sum())
if n_missing:
    print(f'WARNING: {n_missing} feature values are missing (NaN) in the wide table.')

print('Prepared sample-level rows:', len(df_wide))
print('Feature count:', len(feature_cols))
print('Feature examples:', feature_cols[:5])
display(df_wide.head())


Prepared sample-level rows: 1150
Feature count: 14
Feature examples: ['magnitude__long_constant_thick', 'magnitude__long_constant_thick_Vibrato', 'magnitude__long_dropping_flat_thick', 'magnitude__long_dropping_flat_thick_Vibrato', 'magnitude__long_dropping_steep_thick']


Unnamed: 0,idx,speaker_id,system_id,key,true label,magnitude__long_constant_thick,magnitude__long_constant_thick_Vibrato,magnitude__long_dropping_flat_thick,magnitude__long_dropping_flat_thick_Vibrato,magnitude__long_dropping_steep_thick,magnitude__long_dropping_steep_thin,magnitude__long_rising_flat_thick,magnitude__long_rising_steep_thick,magnitude__long_rising_steep_thin,magnitude__short_constant_thick,magnitude__short_dropping_steep_thick,magnitude__short_dropping_steep_thin,magnitude__short_rising_steep_thick,magnitude__short_rising_steep_thin
0,0,LA_0039,A18,1,0,1.386472,1.66574,1.12434,1.578779,0.497841,0.738488,0.649573,1.11676,0.531362,1.208903,1.145937,0.800696,1.151875,0.858018
1,1,LA_0039,A09,1,0,0.6198,0.64914,0.186097,-0.115557,0.1768,-0.184231,-0.301169,0.46713,0.423431,0.011932,0.429088,0.092542,-0.301437,-0.049834
2,10,LA_0039,-,0,1,-0.7956,-0.350272,-0.490347,-0.712432,-1.208729,-0.753362,-0.99634,-1.800365,-1.4701,-0.587676,-0.458562,-0.832447,-0.315891,-0.639084
3,100,LA_0030,A18,1,0,1.285514,1.169712,1.585133,1.197529,0.964059,0.49272,1.332907,0.457686,0.750959,1.465446,1.325524,1.318126,1.586114,1.37558
4,1000,LA_0060,-,0,1,0.287473,0.25954,-0.042414,-0.215594,-0.432068,-0.553561,-0.713139,0.672127,0.185313,-0.203986,-0.094293,-0.294625,-1.035261,-0.464262


In [6]:

# Write the wide feature table, the optional CAV-accuracy summary, and a JSON
# record describing this run. Outputs go to OUT_DIR/<run_name>/.
metric_slug = '_'.join(name.replace(' ', '_') for name in FEATURE_METRICS)
run_name = 'stage4_spoofwrapper_' + metric_slug
out_subdir = OUT_DIR / run_name
out_subdir.mkdir(parents=True, exist_ok=True)

sample_csv = out_subdir / 'sample_level_features_all.csv'
meta_json = out_subdir / 'prepare_metadata.json'
concept_acc_csv = out_subdir / 'concept_cav_accuracy_summary.csv'

# The accuracy summary exists only when the input CSV had a 'cav acc' column.
has_concept_acc = concept_acc_df is not None

df_wide.to_csv(sample_csv, index=False)
if has_concept_acc:
    concept_acc_df.to_csv(concept_acc_csv, index=False)

# Both label encodings present in the data, recorded for whoever reads the CSV later.
label_mapping = {
    'key_1': 'real/bonafide',
    'key_0': 'fake/spoof',
    'true_label_0': 'real',
    'true_label_1': 'fake',
}
metadata = {
    'input_csv': str(INPUT_CSV),
    'output_sample_csv': str(sample_csv),
    'feature_metrics': FEATURE_METRICS,
    'min_cav_acc': MIN_CAV_ACC,
    # Row count of the long frame as it is now (after any MIN_CAV_ACC filtering).
    'rows_long': int(len(df)),
    'rows_wide': int(len(df_wide)),
    'unique_speakers': int(df_wide['speaker_id'].nunique()),
    'feature_count': int(len(feature_cols)),
    'label_mapping': label_mapping,
}
metadata_text = json.dumps(metadata, indent=2)
meta_json.write_text(metadata_text, encoding='utf-8')

saved_paths = [sample_csv, meta_json]
if has_concept_acc:
    saved_paths.append(concept_acc_csv)
for path in saved_paths:
    print('Saved:', path)


Saved: /home/SpeakerRec/BioVoice/data/tcav/prepared_csvs/stage4_spoofwrapper_magnitude/sample_level_features_all.csv
Saved: /home/SpeakerRec/BioVoice/data/tcav/prepared_csvs/stage4_spoofwrapper_magnitude/prepare_metadata.json
Saved: /home/SpeakerRec/BioVoice/data/tcav/prepared_csvs/stage4_spoofwrapper_magnitude/concept_cav_accuracy_summary.csv
