
# 1. Prepare TCAV CSV (Long -> Wide)

This notebook reshapes the TCAV results CSV from long format to wide format so it can be used as a feature table for machine learning.

Input (long format): one row per `(sample, concept)`.
Output (wide format): one row per `sample`, with concept columns.

This notebook is self-contained (does not import your `.py` script).


In [None]:

from pathlib import Path
import json
import numpy as np
import pandas as pd

# --- Locations -------------------------------------------------------------
# All paths are derived from PROJECT_ROOT; the output directory is created
# up front so later cells can write into it without further checks.
PROJECT_ROOT = Path('/home/SpeakerRec/BioVoice')
INPUT_CSV = PROJECT_ROOT.joinpath('data', 'tcav', 'tcav_ASVspoof_stage4_spoofwrapper.csv')
OUT_DIR = PROJECT_ROOT.joinpath('data', 'tcav', 'prepared_csvs')
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Feature selection -----------------------------------------------------
# Metrics from the long CSV that get pivoted into one column per concept.
# Alternatives: ['magnitude'] or ['positive percentage', 'magnitude'].
FEATURE_METRICS = ['positive percentage']
# Threshold on the mean CAV accuracy of a concept; None disables the filter
# (example value: 0.5).
MIN_CAV_ACC = None

# Echo the resolved paths so the run is easy to audit from the cell output.
print('PROJECT_ROOT =', PROJECT_ROOT)
print('INPUT_CSV =', INPUT_CSV)
print('OUT_DIR =', OUT_DIR)


In [None]:

# Load the long-format TCAV CSV and check that it has the expected schema.
assert INPUT_CSV.exists(), f'Missing CSV: {INPUT_CSV}'
df = pd.read_csv(INPUT_CSV)

# Columns the rest of the notebook relies on ('cav acc' is optional).
required_cols = {
    'idx', 'speaker_id', 'system_id', 'key', 'true label', 'layer_key', 'concept name',
    'positive percentage', 'magnitude'
}
missing = sorted(required_cols.difference(df.columns))
assert not missing, f'Missing columns: {missing}'

# Normalise dtypes.
# Float-valued metrics: unparseable entries become NaN.
for col in ('positive percentage', 'magnitude', 'cav acc'):
    if col not in df.columns:
        continue
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Identifiers are handled as strings.
for col in ('idx', 'speaker_id'):
    df[col] = df[col].astype(str)

# Labels use the nullable integer dtype so unparseable entries become <NA>.
for col in ('true label', 'key'):
    df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')

# Quick sanity report on the loaded table.
rows_per_sample = df.groupby('idx').size()
print('Rows:', len(df))
print('Unique samples (idx):', df['idx'].nunique())
print('Unique speakers:', df['speaker_id'].nunique())
print('Unique concepts:', df['concept name'].nunique())
print('Rows per sample (should be concepts count):')
print(rows_per_sample.value_counts().sort_index())
print('Label mapping reminder: key=1 real, key=0 fake; true label=0 real, true label=1 fake')


In [None]:

# Optional concept-level CAV accuracy summary and filtering.
#
# concept_acc_df stays None when the CSV has no 'cav acc' column; otherwise it
# holds the mean CAV accuracy per (layer_key, concept name[, layer name]) group.
concept_acc_df = None
if 'cav acc' in df.columns:
    # 'layer name' is not one of the validated required columns, so only group
    # by it when it is actually present (avoids a KeyError on CSVs without it).
    acc_group_cols = [c for c in ['layer_key', 'concept name', 'layer name'] if c in df.columns]
    concept_acc_df = (
        df.groupby(acc_group_cols, dropna=False)['cav acc']
        .mean()
        .reset_index()
        .rename(columns={'cav acc': 'mean_cav_acc'})
    )
    display(concept_acc_df.sort_values('mean_cav_acc', ascending=False))

if MIN_CAV_ACC is not None:
    if concept_acc_df is None:
        # Without accuracies nothing can be filtered; say so instead of
        # silently skipping, because MIN_CAV_ACC is still written to the metadata.
        print("WARNING: MIN_CAV_ACC is set but the CSV has no 'cav acc' column; no filtering applied.")
    else:
        # The filter works on concept names: a concept is kept when at least one
        # of its summary groups reaches the threshold.
        keep_concepts = set(
            concept_acc_df.loc[concept_acc_df['mean_cav_acc'] >= MIN_CAV_ACC, 'concept name'].astype(str)
        )
        df = df[df['concept name'].astype(str).isin(keep_concepts)].copy()
        print('Rows after MIN_CAV_ACC filter:', len(df))
        print('Remaining concepts:', df['concept name'].nunique())


In [None]:

# Long -> wide (one row per sample)
#
# Builds df_wide: the sample-level metadata columns followed by one column per
# (metric, concept) pair, named '<metric>__<concept name>'. feature_cols lists
# the generated feature column names in the order they were added.
meta_cols = ['idx', 'speaker_id', 'system_id', 'key', 'true label']

# One metadata row per sample. If a sample carries conflicting metadata the
# first row seen is kept; report that instead of hiding it.
meta_unique = df[meta_cols].drop_duplicates()
n_conflicting_meta = int(meta_unique['idx'].duplicated().sum())
if n_conflicting_meta:
    print(f'WARNING: {n_conflicting_meta} additional distinct metadata rows found for already-seen idx values; keeping the first per idx.')
df_meta = meta_unique.drop_duplicates(subset=['idx']).sort_values('idx').reset_index(drop=True)

# aggfunc='first' below keeps a single value per (idx, concept name); make any
# collapsing of repeated pairs visible.
n_dup_rows = int(df.duplicated(subset=['idx', 'concept name'], keep=False).sum())
if n_dup_rows:
    print(f'WARNING: {n_dup_rows} rows share an (idx, concept name) pair; only the first non-null value per pair is used.')

df_wide = df_meta.copy()
feature_cols = []
for metric in FEATURE_METRICS:
    pivot = df.pivot_table(index='idx', columns='concept name', values=metric, aggfunc='first')
    pivot = pivot.sort_index(axis=1)
    pivot.columns = [f'{metric}__{c}' for c in pivot.columns]
    pivot = pivot.reset_index()
    feature_cols.extend([c for c in pivot.columns if c != 'idx'])
    # Left merge: pivot_table omits samples whose metric is NaN for every
    # concept, and an inner merge would silently drop those samples from the
    # output. validate guards the one-row-per-sample invariant.
    df_wide = df_wide.merge(pivot, on='idx', how='left', validate='one_to_one')

if feature_cols:
    n_no_features = int(df_wide[feature_cols].isna().all(axis=1).sum())
    if n_no_features:
        print(f'WARNING: {n_no_features} samples have no non-null feature values.')

print('Prepared sample-level rows:', len(df_wide))
print('Feature count:', len(feature_cols))
print('Feature examples:', feature_cols[:5])
display(df_wide.head())


In [None]:

# Save prepared CSVs
#
# Everything for this run goes into one sub-directory of OUT_DIR whose name
# encodes the selected feature metrics.
metric_slug = '_'.join(m.replace(' ', '_') for m in FEATURE_METRICS)
run_name = 'stage4_spoofwrapper_' + metric_slug
out_subdir = OUT_DIR / run_name
out_subdir.mkdir(parents=True, exist_ok=True)

sample_csv = out_subdir / 'sample_level_features_all.csv'
meta_json = out_subdir / 'prepare_metadata.json'
concept_acc_csv = out_subdir / 'concept_cav_accuracy_summary.csv'

# The accuracy summary only exists when the input had a 'cav acc' column.
has_concept_acc = concept_acc_df is not None

# 1) Wide sample-level table, 2) optional per-concept accuracy summary.
df_wide.to_csv(sample_csv, index=False)
if has_concept_acc:
    concept_acc_df.to_csv(concept_acc_csv, index=False)

# 3) Small JSON record describing how the table was produced.
label_mapping = {
    'key_1': 'real/bonafide',
    'key_0': 'fake/spoof',
    'true_label_0': 'real',
    'true_label_1': 'fake'
}
metadata = {
    'input_csv': str(INPUT_CSV),
    'output_sample_csv': str(sample_csv),
    'feature_metrics': FEATURE_METRICS,
    'min_cav_acc': MIN_CAV_ACC,
    'rows_long': len(df),
    'rows_wide': len(df_wide),
    'unique_speakers': int(df_wide['speaker_id'].nunique()),
    'feature_count': len(feature_cols),
    'label_mapping': label_mapping
}
with meta_json.open('w', encoding='utf-8') as fh:
    json.dump(metadata, fh, indent=2)

# Report what was written, in a fixed order.
saved_paths = [sample_csv, meta_json]
if has_concept_acc:
    saved_paths.append(concept_acc_csv)
for saved_path in saved_paths:
    print('Saved:', saved_path)
