
# ASVspoof5 Train-Only LogReg on TCAV CSV: Visualization

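This notebook is visualization-only: it does not train or evaluate a model. It only reads the output files written by the training notebook below, so that notebook must be run first: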
- `tcav_asvspoof5_train_only_logreg_on_tcav_csv.ipynb`

It generates the requested plots/exports:
- global concept importance (coefficients)
- class-wise concept differences (spoof vs bonafide)
- per-user concept heatmap (test group C, spoof samples)
- per-system (A01-A08) accuracy bar chart
- top concepts per spoof system (A01-A08) CSV export


In [None]:

import json
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Project location. Defaults to the original workstation path; set the
# BIOVOICE_PROJECT_ROOT environment variable to run this notebook on another
# machine without editing the cell.
PROJECT_ROOT = Path(os.environ.get('BIOVOICE_PROJECT_ROOT', '/home/SpeakerRec/BioVoice'))

# Directory containing the CSV/JSON outputs that the later cells load.
ANALYSIS_DIR = PROJECT_ROOT / 'data' / 'tcav' / 'ASVspoof5_train_only_logreg_on_tcav'

# Figures are saved here; create it up front so savefig never fails on a
# missing directory (no-op when it already exists).
PLOTS_DIR = ANALYSIS_DIR / 'plots'
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

print('ANALYSIS_DIR =', ANALYSIS_DIR)
print('PLOTS_DIR =', PLOTS_DIR)


In [None]:

# Load outputs from the training notebook
paths = {
    'meta': ANALYSIS_DIR / 'run_metadata.json',
    'coef': ANALYSIS_DIR / 'global_concept_coefficients.csv',
    'class_summary': ANALYSIS_DIR / 'classwise_concept_summary_BplusC.csv',
    'pred': ANALYSIS_DIR / 'test_group_C_predictions.csv',
    'user_contrib': ANALYSIS_DIR / 'test_group_C_user_mean_contributions.csv',
    'system_contrib': ANALYSIS_DIR / 'test_group_C_system_mean_contributions.csv',
    'system_perf': ANALYSIS_DIR / 'test_group_C_system_performance.csv',
}

# Check all inputs before reading any of them and report every missing file at
# once, so a partially produced output directory is diagnosed in a single run.
# A real exception is used instead of `assert` because assertions are removed
# when Python runs with -O.
missing_inputs = {key: path for key, path in paths.items() if not path.is_file()}
if missing_inputs:
    missing_listing = '\n'.join(f'  {key}: {path}' for key, path in missing_inputs.items())
    raise FileNotFoundError(
        f'Missing {len(missing_inputs)} required input file(s):\n{missing_listing}'
    )

run_meta = json.loads(paths['meta'].read_text(encoding='utf-8'))
coef_df = pd.read_csv(paths['coef'])
class_summary_df = pd.read_csv(paths['class_summary'])
pred_df = pd.read_csv(paths['pred'])
user_contrib_df = pd.read_csv(paths['user_contrib'])
system_contrib_df = pd.read_csv(paths['system_contrib'])
system_perf_df = pd.read_csv(paths['system_perf'])

print('Loaded outputs successfully')

# The metrics are informational only; a metadata file without them should not
# prevent the plots below from being generated.
run_metrics = run_meta.get('metrics')
if not isinstance(run_metrics, dict):
    print("No 'metrics' section found in", paths['meta'])
else:
    print('Metrics:')
    # The classification report is a multi-line text block, so it is printed
    # separately instead of being embedded in the JSON dump.
    scalar_metrics = {k: v for k, v in run_metrics.items() if k != 'classification_report'}
    print(json.dumps(scalar_metrics, indent=2))
    if 'classification_report' in run_metrics:
        print(run_metrics['classification_report'])


In [None]:

# 1) Global concept importance (coefficients)
display(coef_df.head(30))

# Keep the first 25 rows and reverse them: barh draws its first bar at the
# bottom, so reversing puts the first CSV row at the top of the chart.
# NOTE(review): this assumes the CSV is already ordered by importance — confirm
# against the training notebook.
coef_plot = coef_df.head(25).copy().iloc[::-1]

# Red for spoof-leaning, blue for bonafide-leaning, grey for anything else.
direction_palette = {'spoof': '#d62728', 'bonafide': '#1f77b4'}
colors = [direction_palette.get(direction, '#7f7f7f') for direction in coef_plot['direction']]

coef_png = PLOTS_DIR / 'global_concept_coefficients_top25.png'
fig, ax = plt.subplots(figsize=(10, max(6, 0.32 * len(coef_plot))))
ax.barh(coef_plot['feature'], coef_plot['coefficient'], color=colors)
ax.axvline(0, color='black', linewidth=1)
ax.set_title('Global Concept Coefficients (positive=spoof, negative=bonafide)')
fig.tight_layout()
fig.savefig(coef_png, dpi=150)
plt.show()
print('Saved:', coef_png)


In [None]:

# 2) Class-wise concept differences (spoof vs bonafide)
diff_column = 'mean_diff_spoof_minus_bonafide'


def _plot_mean_diff_bars(ranked_df, bar_color, title, filename):
    """Draw a horizontal bar chart of per-feature mean differences and save it.

    ranked_df: frame with 'feature' and mean-difference columns, in the order
        the bars should be drawn (bottom to top).
    bar_color: single colour applied to every bar.
    title: figure title.
    filename: PNG file name, written inside PLOTS_DIR.
    """
    fig, ax = plt.subplots(figsize=(9, 6))
    ax.barh(ranked_df['feature'], ranked_df[diff_column], color=bar_color)
    ax.axvline(0, color='black', linewidth=1)
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(PLOTS_DIR / filename, dpi=150)
    plt.show()


print('Top spoof-leaning concepts (mean_diff_spoof_minus_bonafide):')
spoof_leaning = class_summary_df.sort_values(diff_column, ascending=False).head(15)
display(spoof_leaning)

print('Top bonafide-leaning concepts (mean_diff_spoof_minus_bonafide):')
bonafide_leaning = class_summary_df.sort_values(diff_column, ascending=True).head(15)
display(bonafide_leaning)

# Rows are reversed before plotting so the strongest entry of each ranking is
# drawn as the top bar.
top_spoof = spoof_leaning.copy().iloc[::-1]
_plot_mean_diff_bars(
    top_spoof,
    '#d62728',
    'Top Spoof-Leaning Concepts by Mean Difference (B+C)',
    'class_diff_top_spoof_15.png',
)

top_bona = bonafide_leaning.copy().iloc[::-1]
_plot_mean_diff_bars(
    top_bona,
    '#1f77b4',
    'Top Bonafide-Leaning Concepts by Mean Difference (B+C)',
    'class_diff_top_bonafide_15.png',
)


In [None]:

# 3) Per-user concept heatmap (test group C, spoof samples)
# Rows with label_id == 1 are the ones treated as spoof in this notebook.
spoof_user = user_contrib_df[user_contrib_df['label_id'] == 1].copy()
if spoof_user.empty:
    print('No spoof rows in user_contrib_df for test group C')
else:
    # One row per speaker, one column per feature; speaker/feature pairs with
    # no data are shown as a zero contribution.
    heat = spoof_user.pivot_table(
        index='speaker_id',
        columns='feature',
        values='mean_contribution',
        aggfunc='mean',
    ).fillna(0)
    display(heat)

    if heat.empty:
        # pivot_table can return an empty frame (e.g. every contribution is NaN);
        # there is nothing meaningful to draw in that case.
        print('Per-user heatmap skipped: no non-missing mean_contribution values')
    else:
        heat_values = heat.to_numpy(dtype=float)

        # 'coolwarm' is a diverging colormap: its neutral midpoint must sit at 0
        # so that red/blue encode the sign of the contribution. Use symmetric
        # limits around zero instead of the data min/max.
        max_abs = float(np.abs(heat_values).max())
        color_limit = max_abs if np.isfinite(max_abs) and max_abs > 0 else 1.0

        heatmap_png = PLOTS_DIR / 'test_group_C_user_heatmap_spoof.png'
        fig, ax = plt.subplots(
            figsize=(max(10, 0.35 * heat.shape[1]), max(4, 0.6 * heat.shape[0]))
        )
        im = ax.imshow(
            heat_values,
            aspect='auto',
            cmap='coolwarm',
            vmin=-color_limit,
            vmax=color_limit,
        )
        fig.colorbar(im, ax=ax, label='Mean contribution')
        ax.set_xticks(range(heat.shape[1]))
        ax.set_xticklabels(heat.columns, rotation=90, fontsize=7)
        ax.set_yticks(range(heat.shape[0]))
        ax.set_yticklabels(heat.index, fontsize=9)
        ax.set_xlabel('Concept feature')
        ax.set_ylabel('Speaker ID')
        ax.set_title('Test Group C: Per-user Mean Concept Contributions (Spoof samples)')
        fig.tight_layout()
        fig.savefig(heatmap_png, dpi=150)
        plt.show()
        print('Saved:', heatmap_png)


In [None]:

# 4) Per-system (A01-A08) accuracy bar chart (test group C)
display(system_perf_df)

# Order bars by label first, then by system id, so bars with the same label
# sit next to each other.
sys_perf_plot = system_perf_df.sort_values(['label_str_mode', 'system_id']).copy()

# Blue for bonafide rows, red for every other label.
is_bonafide = (sys_perf_plot['label_str_mode'] == 'bonafide').to_numpy()
colors = np.where(is_bonafide, '#1f77b4', '#d62728').tolist()

accuracy_png = PLOTS_DIR / 'test_group_C_system_accuracy.png'
fig, ax = plt.subplots(figsize=(9, 4.5))
ax.bar(sys_perf_plot['system_id'], sys_perf_plot['accuracy'], color=colors)
ax.set_ylim(0, 1)
ax.set_ylabel('Accuracy on test C')
ax.set_title('Per-system Accuracy (Test Group C)')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig(accuracy_png, dpi=150)
plt.show()
print('Saved:', accuracy_png)


In [None]:

# 5) Top concepts per spoof system (A01-A08) CSV export + display
# Use mean contribution on test group C, spoof rows only.
TOP_CONCEPTS_PER_SYSTEM = 5  # number of highest-contribution features kept per system
TOP_CONCEPT_COLUMNS = ['system_id', 'rank', 'feature', 'metric', 'concept', 'mean_contribution']

spoof_sys = system_contrib_df[system_contrib_df['label_id'] == 1].copy()
if spoof_sys.empty:
    print('No spoof rows in system_contrib_df for test group C; exporting an empty table')

top_system_rows = []
for sysid, system_rows in spoof_sys.groupby('system_id'):
    # Highest mean contribution first; rank 1 is the largest value for that system.
    strongest = system_rows.sort_values('mean_contribution', ascending=False).head(
        TOP_CONCEPTS_PER_SYSTEM
    )
    for rank, (_, row) in enumerate(strongest.iterrows(), start=1):
        top_system_rows.append({
            'system_id': sysid,
            'rank': rank,
            'feature': row['feature'],
            'metric': row['metric'],
            'concept': row['concept'],
            'mean_contribution': float(row['mean_contribution']),
        })

# Passing the column list explicitly keeps the schema stable even when no rows
# were collected; without it an empty frame has no columns and the sort below
# raises KeyError.
top_system_df = (
    pd.DataFrame(top_system_rows, columns=TOP_CONCEPT_COLUMNS)
    .sort_values(['system_id', 'rank'])
    .reset_index(drop=True)
)
out_csv = ANALYSIS_DIR / 'test_group_C_top_concepts_per_system.csv'
top_system_df.to_csv(out_csv, index=False)
print('Saved:', out_csv)
display(top_system_df)


In [None]:

# Optional summary table: concept frequency across systems' top-5 lists
top_concepts_csv = ANALYSIS_DIR / 'test_group_C_top_concepts_per_system.csv'
frequency_csv = ANALYSIS_DIR / 'test_group_C_top_concepts_per_system_frequency.csv'

# Read the per-system table back from disk, count how many rows each
# (concept, metric) pair has, then list the most frequent pairs first
# (ties broken alphabetically by concept).
top_concepts_table = pd.read_csv(top_concepts_csv)
pair_counts = top_concepts_table.groupby(['concept', 'metric']).size().reset_index(name='count')
freq_df = pair_counts.sort_values(by=['count', 'concept'], ascending=[False, True])

display(freq_df)
freq_df.to_csv(frequency_csv, index=False)
print('Saved:', frequency_csv)
