
# TCAV Concept Logistic Regression Analysis (Speaker Split 15/5)

This notebook analyzes a TCAV CSV (like `tcav_ASVspoof_stage4_spoofwrapper.csv`) and trains a **speaker-independent logistic regression** on concept features.

It is organized into logical stages:
- paths + module loading
- CSV job configuration
- load / filter / long->wide
- speaker split (15 train / 5 test)
- train + evaluate logistic regression
- interpret concepts (global + per-user)
- export outputs

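Label mapping from your original notebook:
- `key=1` -> real (bonafide)
- `key=0` -> fake (spoof)
- `true label=0` -> real
- `true label=1` -> fake

Note that `key` and `true label` use opposite encodings: `key` is 1 for real, whereas `true label` is 1 for fake.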


In [None]:
# Imports + load the analysis functions from the Python script
from pathlib import Path
import sys
import json
import pandas as pd

# Location of the project checkout and of the analysis script inside it.
PROJECT_ROOT = Path('/home/SpeakerRec/BioVoice')
SCRIPT_DIR = PROJECT_ROOT / 'redimnet' / 'tcav' / 'deepfakes'
SCRIPT_PATH = SCRIPT_DIR / 'tcav_concept_logreg_analysis.py'

# Fail with an explicit exception instead of `assert`: assertions are removed
# when Python runs with -O, which would let a missing script slip through to a
# less helpful ImportError below.
if not SCRIPT_PATH.is_file():
    raise FileNotFoundError(f'Missing analysis script: {SCRIPT_PATH}')

# Make the script importable as a module without adding duplicate path entries
# when this cell is re-run.
if str(SCRIPT_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPT_DIR))

import tcav_concept_logreg_analysis as mod

print('PROJECT_ROOT =', PROJECT_ROOT)
print('SCRIPT_PATH =', SCRIPT_PATH)
print('Loaded module from =', mod.__file__)


In [None]:

# Per-CSV job configuration: each entry names one TCAV CSV and its output folder.
# Append further dicts to CSV_JOBS to analyze additional TCAV CSV files.

CSV_JOBS = [
    {
        'name': 'stage4_spoofwrapper_pospct',
        'csv_path': PROJECT_ROOT.joinpath('data', 'tcav', 'tcav_ASVspoof_stage4_spoofwrapper.csv'),
        'out_dir': PROJECT_ROOT.joinpath(
            'data', 'tcav', 'logreg_concept_analysis', 'stage4_spoofwrapper_pospct'
        ),
        'feature_metrics': ['positive percentage'],  # or ['magnitude'] or ['both']
        'min_cav_acc': None,  # example: 0.5
    },
]

# Speaker-split settings (read by run_one_csv_job)
NUM_SPEAKERS = 20
TRAIN_SPEAKERS = 15
TEST_SPEAKERS = 5
RANDOM_SEED = 42
PREFER_BALANCED_SPEAKERS = True

# Echo the configured jobs so the paths can be checked before anything runs.
for job in CSV_JOBS:
    print(
        job['name'],
        f"  csv_path = {job['csv_path']}",
        f"  out_dir  = {job['out_dir']}",
        sep='\n',
    )


In [None]:

# Helper to run one CSV job end-to-end using the functions from the script

def run_one_csv_job(job: dict) -> dict:
    """Run the full TCAV concept logistic-regression pipeline for one CSV job.

    Stages, in order: load the CSV, optionally filter concepts by CAV accuracy,
    pivot to one row per sample, choose speakers and split them into train and
    test, fit and evaluate the logistic regression, build the interpretation
    tables, and write every artifact into the job's output folder.

    All data processing is delegated to the analysis module bound to ``mod``.
    The split sizes and the seed come from the notebook-level settings
    ``NUM_SPEAKERS``, ``TRAIN_SPEAKERS``, ``TEST_SPEAKERS``, ``RANDOM_SEED`` and
    ``PREFER_BALANCED_SPEAKERS``.

    Args:
        job: Job configuration with the keys ``name``, ``csv_path``, ``out_dir``
            and ``feature_metrics``. ``min_cav_acc`` is optional and is read as
            ``None`` when absent.

    Returns:
        A dict holding the job configuration, the intermediate data frames, the
        selected speakers and split result, the fitted classifier and scaler,
        the metrics, the interpretation tables and the output directory.
    """
    from types import SimpleNamespace

    feature_metrics = mod._normalize_feature_metrics(job['feature_metrics'])
    csv_path = Path(job['csv_path'])
    out_dir = Path(job['out_dir'])
    min_cav_acc = job.get('min_cav_acc')
    out_dir.mkdir(parents=True, exist_ok=True)

    print(f'\n[JOB] {job["name"]}')
    print('[1] Load CSV')
    df_long = mod.load_tcav_csv(csv_path)
    print('    rows =', len(df_long), '| unique speakers =', df_long['speaker_id'].nunique(), '| unique idx =', df_long['idx'].nunique())

    print('[2] Optional CAV accuracy concept filter')
    df_long_filtered, concept_acc_df = mod.apply_cav_acc_filter(df_long, min_cav_acc)
    print('    rows after filter =', len(df_long_filtered))

    print('[3] Long -> wide (one row per sample)')
    df_wide, feature_cols = mod.build_sample_level_dataset(df_long_filtered, feature_metrics)
    print('    sample rows =', len(df_wide), '| feature count =', len(feature_cols))

    # The split sizes in the message follow the configured settings, so the log
    # stays truthful if TRAIN_SPEAKERS / TEST_SPEAKERS are changed.
    print(f'[4] Speaker selection + {TRAIN_SPEAKERS}/{TEST_SPEAKERS} split')
    selected_speakers = mod.choose_speakers(
        df_wide=df_wide,
        num_speakers=NUM_SPEAKERS,
        seed=RANDOM_SEED,
        prefer_balanced=PREFER_BALANCED_SPEAKERS,
    )
    split_result = mod.split_by_speaker(
        df_wide=df_wide,
        selected_speakers=selected_speakers,
        train_speakers_n=TRAIN_SPEAKERS,
        test_speakers_n=TEST_SPEAKERS,
        seed=RANDOM_SEED,
    )
    print('    train speakers:', split_result.train_speakers)
    print('    test speakers :', split_result.test_speakers)
    print('    train samples =', len(split_result.train_df), '| test samples =', len(split_result.test_df))

    print('[5] Train logistic regression + evaluate')
    # fit_logreg returns an 11-tuple; only some of its members are needed here.
    (
        clf,
        scaler,
        _X_train,
        _X_test,
        _X_train_scaled,
        X_test_scaled,
        _y_train,
        _y_test,
        y_pred,
        y_prob_fake,
        metrics,
    ) = mod.fit_logreg(split_result.train_df, split_result.test_df, feature_cols)
    # The classification report is printed separately rather than inside the JSON.
    print(json.dumps({k: v for k, v in metrics.items() if k != 'classification_report'}, indent=2))
    print(metrics['classification_report'])

    print('[6] Build interpretation tables')
    coef_df = mod.build_global_importance_table(clf, feature_cols)
    # Rows of the selected speakers; computed once and reused for the export.
    selected_df = df_wide[df_wide['speaker_id'].isin(selected_speakers)]
    class_summary_df = mod.build_class_summary_table(selected_df.copy(), feature_cols)
    pred_df, sample_contrib_long, user_contrib = mod.build_test_predictions_table(
        split_result.test_df, feature_cols, X_test_scaled, y_pred, y_prob_fake, clf
    )
    top_user_df = mod.build_top_concepts_per_user(user_contrib, top_k=3)

    print('[7] Export outputs')
    # NOTE: the file names keep their fixed 20/15/5 suffixes even if the split
    # settings are changed.
    csv_exports = {
        'sample_level_features_all.csv': df_wide,
        'sample_level_features_selected_20speakers.csv': selected_df,
        'train_samples_15speakers.csv': split_result.train_df,
        'test_samples_5speakers.csv': split_result.test_df,
        'global_concept_coefficients.csv': coef_df,
        'classwise_concept_summary.csv': class_summary_df,
        'test_predictions.csv': pred_df,
        'test_sample_contributions_long.csv': sample_contrib_long,
        'test_user_mean_contributions.csv': user_contrib,
        'test_user_top_concepts.csv': top_user_df,
    }
    for file_name, frame in csv_exports.items():
        frame.to_csv(out_dir / file_name, index=False)

    # Persisting the fitted objects is best-effort: a missing joblib install or
    # a failed dump only produces a warning and the job continues.
    try:
        import joblib
        joblib.dump(scaler, out_dir / 'scaler.joblib')
        joblib.dump(clf, out_dir / 'logreg.joblib')
    except Exception as e:
        print('[WARN] joblib save failed:', e)

    mod.maybe_save_plots(out_dir, coef_df, user_contrib, split_result.test_speakers)

    # save_run_metadata takes an attribute-style arguments object; build one
    # from the notebook settings.
    run_args = SimpleNamespace(
        csv_path=csv_path,
        out_dir=out_dir,
        random_seed=RANDOM_SEED,
        feature_metrics=job['feature_metrics'],
        num_speakers=NUM_SPEAKERS,
        train_speakers=TRAIN_SPEAKERS,
        test_speakers=TEST_SPEAKERS,
        min_cav_acc=min_cav_acc,
    )
    mod.save_run_metadata(out_dir, run_args, df_long_filtered, df_wide, split_result, feature_cols, concept_acc_df, metrics)

    return {
        'job': job,
        'df_long': df_long,
        'df_long_filtered': df_long_filtered,
        'df_wide': df_wide,
        'feature_cols': feature_cols,
        'selected_speakers': selected_speakers,
        'split_result': split_result,
        'clf': clf,
        'scaler': scaler,
        'metrics': metrics,
        'coef_df': coef_df,
        'class_summary_df': class_summary_df,
        'pred_df': pred_df,
        'sample_contrib_long': sample_contrib_long,
        'user_contrib': user_contrib,
        'top_user_df': top_user_df,
        'out_dir': out_dir,
    }


In [None]:

# Execute every configured CSV job; results are keyed by job name and each job
# writes into its own output folder.
results = {}
for job in CSV_JOBS:
    # Stored one at a time so finished jobs remain available if a later one fails.
    results[job['name']] = run_one_csv_job(job)

print(f'\nCompleted jobs: {list(results)}')


In [None]:

# Quick look at a single job's results (point JOB_NAME at another key to switch jobs)
JOB_NAME = CSV_JOBS[0]['name']
r = results[JOB_NAME]

print(f'Job: {JOB_NAME}')
print(f"Output folder: {r['out_dir']}")
print('Test metrics summary:')
# The classification report is left out of the JSON summary.
print(
    json.dumps(
        {name: value for name, value in r['metrics'].items() if name != 'classification_report'},
        indent=2,
    )
)

print('\nTop global concept coefficients (absolute):')
display(r['coef_df'].head(n=10))


In [None]:

# Most important concepts per user, computed on the test speakers
JOB_NAME = CSV_JOBS[0]['name']
r = results[JOB_NAME]

# Show the first 30 rows of the per-user table.
display(r['top_user_df'].head(n=30))


In [None]:

# Class-wise concept summary (fake vs real means)
JOB_NAME = CSV_JOBS[0]['name']
r = results[JOB_NAME]

# First the 10 largest fake-minus-real mean differences, then the 10 smallest.
for ascending in (False, True):
    display(
        r['class_summary_df']
        .sort_values('mean_diff_fake_minus_real', ascending=ascending)
        .head(10)
    )



## Notes

- This notebook reuses the functions in `redimnet/tcav/deepfakes/tcav_concept_logreg_analysis.py`.
- To analyze another TCAV CSV, add another item to `CSV_JOBS` with a different `csv_path` and `out_dir`.
- Keep each CSV in a separate `out_dir` so the outputs do not overwrite each other.
