
# ASVspoof5 Train-Only Logistic Regression on TCAV CSV (Best-Practice Split)

This notebook trains a logistic regression **on the TCAV CSV outputs** (concept features), using the fixed train-only subset design:

- `A/B/C = 30/15/5` speakers (already selected)
- Train logistic regression on `group == 'B'`
- Test on `group == 'C'`
- Keep `A` for optional inspection / later analysis

Main goals — identify:
- which concepts separate `bonafide` from `spoof` overall
- which concepts are important per user (`speaker_id`)
- which concepts are important for each spoof system (`system_id`, `A01`–`A08`)


In [None]:

from pathlib import Path
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# --- Paths ---------------------------------------------------------------
# Every other location is derived from this root.
PROJECT_ROOT = Path('/home/SpeakerRec/BioVoice')

# Input: TCAV results CSV. Output: directory for everything this notebook writes.
TCAV_CSV = PROJECT_ROOT.joinpath(
    'data', 'tcav',
    'ASVspoof5_train_only_stage4_spoofwrapper',
    'ASVspoof5_train_only_stage4_spoofwrapper.csv',
)
OUT_DIR = PROJECT_ROOT.joinpath('data', 'tcav', 'ASVspoof5_train_only_logreg_on_tcav')
OUT_DIR.mkdir(parents=True, exist_ok=True)  # created eagerly so later cells can write into it

# --- Features ------------------------------------------------------------
# Which TCAV CSV metric column(s) become model features.
# options: ['positive_percentage'], ['magnitude'], ['positive_percentage','magnitude']
USE_METRICS = ['positive_percentage']

# --- Split ---------------------------------------------------------------
# Rows are routed by their `group` value: fit on B, evaluate on C, keep A aside.
TRAIN_GROUPS = ['B']
TEST_GROUPS = ['C']
OPTIONAL_HOLDOUT_GROUPS = ['A']

# Echo the effective configuration so it is recorded in the notebook output.
print('TCAV_CSV =', TCAV_CSV)
print('OUT_DIR =', OUT_DIR)
print('USE_METRICS =', USE_METRICS)
print('TRAIN_GROUPS =', TRAIN_GROUPS, '| TEST_GROUPS =', TEST_GROUPS)


In [None]:

# Load TCAV CSV and validate schema
assert TCAV_CSV.exists(), f'Missing TCAV CSV: {TCAV_CSV}'
df_long = pd.read_csv(TCAV_CSV)

required_cols = {
    'utt_id','group','partition','speaker_id','label_str','label_id','system_id',
    'concept_name','positive_percentage','magnitude','layer_key'
}
missing = sorted(required_cols - set(df_long.columns))
assert not missing, f'Missing columns in TCAV CSV: {missing}'

# Normalize types
# Score columns: unparseable entries become NaN ('cav_acc' is optional in the CSV).
for c in ['positive_percentage','magnitude','cav_acc']:
    if c in df_long.columns:
        df_long[c] = pd.to_numeric(df_long[c], errors='coerce')
# Identifier / categorical columns are handled as strings.
# Gotcha: astype(str) turns a missing value into the literal string 'nan'.
for c in ['utt_id','speaker_id','group','partition','label_str','system_id']:
    df_long[c] = df_long[c].astype(str)

# label_id is the training target. Fail with a readable message instead of the
# opaque "cannot convert non-finite values" error that astype(int) raises on NaN.
label_id_numeric = pd.to_numeric(df_long['label_id'], errors='coerce')
n_bad_label_id = int(label_id_numeric.isna().sum())
assert n_bad_label_id == 0, f'{n_bad_label_id} rows have a missing or non-numeric label_id'
df_long['label_id'] = label_id_numeric.astype(int)

# Basic checks (utterance-level counts are taken after de-duplicating on utt_id pairs)
print('Rows:', len(df_long))
print('Unique utt_id:', df_long['utt_id'].nunique())
print('Groups:', df_long[['utt_id','group']].drop_duplicates()['group'].value_counts().sort_index().to_dict())
print('Labels:', df_long[['utt_id','label_str']].drop_duplicates()['label_str'].value_counts().sort_index().to_dict())
print('Label IDs:', df_long[['utt_id','label_id']].drop_duplicates()['label_id'].value_counts().sort_index().to_dict())
print('Layer keys:', sorted(df_long['layer_key'].astype(str).unique().tolist()))
print('Concepts:', df_long['concept_name'].nunique())

# Expect one row per (utt_id, concept_name, layer_key): actually verify it and report.
n_dup_keys = int(df_long.duplicated(subset=['utt_id','concept_name','layer_key']).sum())
print('Duplicate (utt_id, concept_name, layer_key) rows:', n_dup_keys)
if n_dup_keys:
    print('[WARN] The TCAV CSV repeats some (utt_id, concept_name, layer_key) keys; inspect before trusting the features.')

per_utt = df_long.groupby('utt_id').size()
print('Rows per utt_id (value counts):')
print(per_utt.value_counts().sort_index())

display(df_long.head())


In [None]:

# Long -> wide (one row per utterance, concept metrics as features)
meta_cols = ['utt_id','group','partition','speaker_id','gender','label_str','label_id','system_id','target_class_name','target_class_id']
meta_cols = [c for c in meta_cols if c in df_long.columns]

# The split and label columns must be constant within an utterance; otherwise
# "keep the first metadata row" below would silently pick an arbitrary group/label.
identity_cols = [c for c in ['group','speaker_id','label_id','system_id'] if c in meta_cols]
identity_variants = df_long[['utt_id'] + identity_cols].drop_duplicates().groupby('utt_id').size()
conflicting_utts = identity_variants[identity_variants > 1].index.tolist()
assert not conflicting_utts, (
    f'{len(conflicting_utts)} utt_id(s) have conflicting {identity_cols} values, e.g. {conflicting_utts[:5]}'
)

# Keep one metadata row per utterance
meta_df = df_long[meta_cols].drop_duplicates(subset=['utt_id']).copy()

# The pivot is keyed on (utt_id, concept_name) only. If the CSV holds several rows
# per pair (for example one per layer_key), aggfunc='first' keeps a single value.
n_collapsed = int(df_long.duplicated(subset=['utt_id','concept_name']).sum())
if n_collapsed:
    print(f'[WARN] {n_collapsed} rows repeat an earlier (utt_id, concept_name) pair; '
          'the pivot keeps only the first non-null value per pair.')

feature_frames = []
feature_cols = []
for metric in USE_METRICS:
    piv = df_long.pivot_table(index='utt_id', columns='concept_name', values=metric, aggfunc='first')
    piv = piv.sort_index(axis=1)
    # Feature naming convention used by later cells: '<metric>__<concept>'.
    piv.columns = [f'{metric}__{c}' for c in piv.columns]
    piv = piv.reset_index()
    feature_frames.append(piv)
    feature_cols.extend([c for c in piv.columns if c != 'utt_id'])

df_wide = meta_df.copy()
for fdf in feature_frames:
    df_wide = df_wide.merge(fdf, on='utt_id', how='inner')

# The inner merge drops utterances that are absent from any pivot; make that visible.
n_lost = len(meta_df) - len(df_wide)
if n_lost:
    print(f'[WARN] {n_lost} utterance(s) are missing from at least one feature pivot and were dropped by the inner merge.')

print('Sample-level rows:', len(df_wide))
print('Feature count:', len(feature_cols))
print('Groups:', df_wide['group'].value_counts().sort_index().to_dict())
print('Labels:', df_wide['label_str'].value_counts().sort_index().to_dict())
display(df_wide.head())


In [None]:

# Route utterances to train / test / holdout purely by their `group` value.
train_df = df_wide.loc[df_wide['group'].isin(TRAIN_GROUPS)].copy()
test_df = df_wide.loc[df_wide['group'].isin(TEST_GROUPS)].copy()
holdout_A_df = df_wide.loc[df_wide['group'].isin(OPTIONAL_HOLDOUT_GROUPS)].copy()

assert min(len(train_df), len(test_df)) > 0, 'Train/Test groups are empty. Check group labels in TCAV CSV.'

# Guard against leakage: no speaker may appear on both sides of the split.
train_speakers = set(train_df['speaker_id'].astype(str))
test_speakers = set(test_df['speaker_id'].astype(str))
assert train_speakers.isdisjoint(test_speakers), 'Speaker leakage between train and test groups!'

# Per-split size, speaker count and label balance.
train_label_counts = train_df['label_id'].value_counts().sort_index().to_dict()
test_label_counts = test_df['label_id'].value_counts().sort_index().to_dict()
print('Train rows:', len(train_df), '| speakers:', len(train_speakers), '| labels:', train_label_counts)
print('Test rows :', len(test_df),  '| speakers:', len(test_speakers),  '| labels:', test_label_counts)
print('Holdout A rows:', len(holdout_A_df), '| speakers:', holdout_A_df['speaker_id'].nunique())


In [None]:

# Train logistic regression on TCAV features (B -> C)
X_train = train_df[feature_cols].copy()
X_test = test_df[feature_cols].copy()

# Impute with training medians only, so no test-set statistic influences the model.
medians = X_train.median(numeric_only=True)
# A feature with no observed value in the training groups has a NaN median; fillna
# would then leave NaN in place and the classifier would reject the input. Fall back
# to a constant 0.0 for such features (StandardScaler maps a constant column to 0).
unobserved_features = medians.index[medians.isna()].tolist()
if unobserved_features:
    print(f'[WARN] {len(unobserved_features)} feature(s) have no values in the training groups; '
          f'imputing 0.0 for them: {unobserved_features[:10]}')
    medians = medians.fillna(0.0)
X_train = X_train.fillna(medians)
X_test = X_test.fillna(medians)

y_train = train_df['label_id'].astype(int).to_numpy()  # 0=bonafide, 1=spoof
y_test = test_df['label_id'].astype(int).to_numpy()

# Standardise with statistics fitted on the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# class_weight='balanced' reweights classes inversely to their training frequency.
clf = LogisticRegression(max_iter=3000, class_weight='balanced', solver='liblinear', random_state=0)
clf.fit(X_train_scaled, y_train)

y_pred = clf.predict(X_test_scaled)
# Column 1 of predict_proba is read as P(spoof); this relies on labels being 0/1.
y_prob_spoof = clf.predict_proba(X_test_scaled)[:, 1]

# Test-set metrics with spoof (label 1) as the positive class.
metrics = {
    'accuracy': float(accuracy_score(y_test, y_pred)),
    'precision_spoof_1': float(precision_score(y_test, y_pred, pos_label=1, zero_division=0)),
    'recall_spoof_1': float(recall_score(y_test, y_pred, pos_label=1, zero_division=0)),
    'f1_spoof_1': float(f1_score(y_test, y_pred, pos_label=1, zero_division=0)),
    'confusion_matrix_labels_[0_bonafide,1_spoof]': confusion_matrix(y_test, y_pred, labels=[0,1]).tolist(),
    'classification_report': classification_report(y_test, y_pred, labels=[0,1], zero_division=0),
}
# The report is multi-line text, so it is printed separately from the JSON summary.
print(json.dumps({k:v for k,v in metrics.items() if k != 'classification_report'}, indent=2))
print(metrics['classification_report'])


In [None]:

# Global concept importance: one row per feature, ranked by absolute coefficient.
coef = clf.coef_.ravel()
coef_rows = [
    {
        'feature': name,
        # Feature names follow '<metric>__<concept>'.
        'metric': name.split('__', 1)[0],
        'concept': name.split('__', 1)[1],
        'coefficient': float(weight),
        'abs_coefficient': float(abs(weight)),
        # Sign convention: a positive weight pushes towards label 1 (spoof).
        'direction': 'spoof' if weight > 0 else ('bonafide' if weight < 0 else 'neutral'),
    }
    for name, weight in zip(feature_cols, coef)
]
coef_df = pd.DataFrame(coef_rows)
coef_df = coef_df.sort_values('abs_coefficient', ascending=False)
coef_df = coef_df.reset_index(drop=True)
display(coef_df.head(30))


In [None]:

# Descriptive per-class statistics of every feature over the train + test groups (B+C).
analysis_df = df_wide[df_wide['group'].isin(TRAIN_GROUPS + TEST_GROUPS)].copy()
summary_rows = []
for feature in feature_cols:
    metric, concept = feature.split('__', 1)
    # reindex guarantees a row for label 0 and label 1; an absent class reads as NaN.
    grp = analysis_df.groupby('label_id')[feature].agg(['mean', 'median']).reindex([0, 1])
    bona_mean = float(grp.at[0, 'mean'])
    spoof_mean = float(grp.at[1, 'mean'])
    bona_median = float(grp.at[0, 'median'])
    spoof_median = float(grp.at[1, 'median'])
    both_means_known = pd.notna(spoof_mean) and pd.notna(bona_mean)
    summary_rows.append({
        'feature': feature,
        'metric': metric,
        'concept': concept,
        'bonafide_mean_label0': bona_mean,
        'spoof_mean_label1': spoof_mean,
        'bonafide_median_label0': bona_median,
        'spoof_median_label1': spoof_median,
        'mean_diff_spoof_minus_bonafide': (spoof_mean - bona_mean) if both_means_known else np.nan,
    })
class_summary_df = pd.DataFrame(summary_rows)

# Show both ends of the spoof-minus-bonafide ranking.
print('Top spoof-leaning by mean difference:')
spoof_leaning = class_summary_df.sort_values('mean_diff_spoof_minus_bonafide', ascending=False)
display(spoof_leaning.head(15))
print('Top bonafide-leaning by mean difference:')
bonafide_leaning = class_summary_df.sort_values('mean_diff_spoof_minus_bonafide', ascending=True)
display(bonafide_leaning.head(15))


In [None]:

# Per-sample contributions on test group C: scaled feature value times its coefficient.
contrib_matrix = np.multiply(X_test_scaled, clf.coef_.ravel().reshape(1, -1))
contrib_df = pd.DataFrame(contrib_matrix, columns=feature_cols)

# Identifying columns carried alongside every prediction / contribution.
id_cols = ['utt_id','group','speaker_id','system_id','label_str','label_id']

# Prediction table; the index is reset so it lines up row-by-row with contrib_df.
pred_df = (
    test_df[id_cols]
    .reset_index(drop=True)
    .assign(
        pred_label_id=y_pred,
        pred_label_str=lambda d: d['pred_label_id'].map({0:'bonafide',1:'spoof'}),
        pred_prob_spoof=y_prob_spoof,
        correct=lambda d: (d['pred_label_id'] == d['label_id']).astype(int),
    )
)

# Wide (one column per feature) -> long (one row per utterance and feature).
sample_contrib_wide = pd.concat([pred_df[id_cols], contrib_df], axis=1)
sample_contrib_long = sample_contrib_wide.melt(
    id_vars=id_cols, var_name='feature', value_name='contribution'
)
# Recover metric and concept from the '<metric>__<concept>' feature name.
name_parts = sample_contrib_long['feature'].str.split('__', n=1, expand=True)
sample_contrib_long['metric'] = name_parts[0]
sample_contrib_long['concept'] = name_parts[1]

# Per-user mean contributions (test speakers), kept separate per true label.
user_keys = ['speaker_id','label_id','label_str','feature','metric','concept']
user_contrib_df = sample_contrib_long.groupby(user_keys, as_index=False)['contribution'].mean()
user_contrib_df = user_contrib_df.rename(columns={'contribution':'mean_contribution'})

# Top concepts per user (test set)
def top_concepts_per_user(user_df, top_k=5):
    """Rank the highest- and lowest-contribution features for each speaker/label.

    Parameters
    ----------
    user_df : pandas.DataFrame
        Must contain the columns ``speaker_id``, ``label_id``, ``label_str``,
        ``feature``, ``metric``, ``concept`` and ``mean_contribution``.
    top_k : int, default 5
        Maximum number of features kept in each list.

    Returns
    -------
    pandas.DataFrame
        Columns ``speaker_id, label_id, label_str, list_type, rank, feature,
        metric, concept, mean_contribution``. For every (speaker, label) group
        there are up to ``top_k`` rows with ``list_type ==
        'top_spoof_supporting'`` (largest ``mean_contribution`` first) and up to
        ``top_k`` rows with ``list_type == 'top_bonafide_supporting'`` (smallest
        first). ``rank`` starts at 1. Rows are sorted by speaker, label, list
        type and rank. Empty input yields an empty frame with the same columns.
    """
    out_cols = ['speaker_id', 'label_id', 'label_str', 'list_type', 'rank',
                'feature', 'metric', 'concept', 'mean_contribution']
    # (list_type label, sort ascending?) for the two rankings produced per group.
    rankings = (('top_spoof_supporting', False), ('top_bonafide_supporting', True))

    rows = []
    for (spk, label_id, label_str), g in user_df.groupby(['speaker_id', 'label_id', 'label_str']):
        for list_type, ascending in rankings:
            top = g.sort_values('mean_contribution', ascending=ascending).head(top_k)
            for rank, rec in enumerate(top.to_dict('records'), start=1):
                rows.append({
                    'speaker_id': spk,
                    'label_id': int(label_id),
                    'label_str': label_str,
                    'list_type': list_type,
                    'rank': rank,
                    'feature': rec['feature'],
                    'metric': rec['metric'],
                    'concept': rec['concept'],
                    'mean_contribution': float(rec['mean_contribution']),
                })

    # Explicit columns keep the sort below valid even when no rows were produced
    # (a column-less DataFrame would raise KeyError in sort_values).
    result = pd.DataFrame(rows, columns=out_cols)
    return result.sort_values(['speaker_id', 'label_id', 'list_type', 'rank']).reset_index(drop=True)

top_user_df = top_concepts_per_user(user_contrib_df, top_k=5)

# Grouping keys shared by the per-system tables below.
system_keys = ['system_id','label_id','label_str']

# Mean raw feature value per system on the test set.
system_feature_summary_df = test_df.groupby(system_keys)[feature_cols].mean(numeric_only=True)
system_feature_summary_df = system_feature_summary_df.reset_index()

# Mean model contribution per system and feature on the test set.
system_contrib_df = sample_contrib_long.groupby(
    system_keys + ['feature','metric','concept'], as_index=False
)['contribution'].mean()
system_contrib_df = system_contrib_df.rename(columns={'contribution':'mean_contribution'})

# Accuracy and mean predicted P(spoof) per system_id value.
system_perf_rows = [
    {
        'system_id': sysid,
        'n_samples': int(len(g)),
        # The most frequent label within the system is stored as its label.
        'label_id_mode': int(g['label_id'].mode().iloc[0]),
        'label_str_mode': str(g['label_str'].mode().iloc[0]),
        'accuracy': float(np.mean(g['label_id'].to_numpy() == g['pred_label_id'].to_numpy())),
        'mean_pred_prob_spoof': float(g['pred_prob_spoof'].mean()),
    }
    for sysid, g in pred_df.groupby('system_id')
]
system_perf_df = pd.DataFrame(system_perf_rows)
system_perf_df = system_perf_df.sort_values(['label_str_mode','system_id']).reset_index(drop=True)

print('pred_df:')
display(pred_df.head(10))
print('top_user_df:')
display(top_user_df.head(20))
print('system_perf_df:')
display(system_perf_df)


In [None]:

# Persist every table produced above (ready for visualization / reporting).
# The prepared wide table is saved too, so the analysis can be re-run quickly.
csv_outputs = [
    ('tcav_sample_level_features_all.csv', df_wide),
    ('train_group_B_samples.csv', train_df),
    ('test_group_C_samples.csv', test_df),
    ('holdout_group_A_samples.csv', holdout_A_df),
    ('global_concept_coefficients.csv', coef_df),
    ('classwise_concept_summary_BplusC.csv', class_summary_df),
    ('test_group_C_predictions.csv', pred_df),
    ('test_group_C_sample_contributions_long.csv', sample_contrib_long),
    ('test_group_C_user_mean_contributions.csv', user_contrib_df),
    ('test_group_C_top_concepts_per_user.csv', top_user_df),
    ('test_group_C_system_feature_means.csv', system_feature_summary_df),
    ('test_group_C_system_mean_contributions.csv', system_contrib_df),
    ('test_group_C_system_performance.csv', system_perf_df),
]
for csv_name, frame in csv_outputs:
    frame.to_csv(OUT_DIR / csv_name, index=False)

# Model artifacts are best-effort: a missing joblib or a failed dump only warns.
try:
    import joblib
    for artifact_name, artifact in (
        ('scaler_tcav_features.joblib', scaler),
        ('logreg_tcav_features.joblib', clf),
    ):
        joblib.dump(artifact, OUT_DIR / artifact_name)
except Exception as e:
    print('[WARN] Could not save model artifacts:', e)

# Run metadata: configuration, sample counts and test metrics in one JSON file.
sample_counts = {
    'total_samples': int(len(df_wide)),
    'train_samples_B': int(len(train_df)),
    'test_samples_C': int(len(test_df)),
    'holdout_samples_A': int(len(holdout_A_df)),
    'n_features': int(len(feature_cols)),
    'n_concepts': int(df_long['concept_name'].nunique()),
}
split_policy = {
    'train_groups': TRAIN_GROUPS,
    'test_groups': TEST_GROUPS,
    'holdout_groups': OPTIONAL_HOLDOUT_GROUPS,
}
run_meta = {
    'tcav_csv': str(TCAV_CSV),
    'out_dir': str(OUT_DIR),
    'use_metrics': USE_METRICS,
    'split_policy': split_policy,
    'label_mapping': {'0':'bonafide', '1':'spoof'},
    'counts': sample_counts,
    'metrics': metrics,
}
run_meta_path = OUT_DIR / 'run_metadata.json'
run_meta_path.write_text(json.dumps(run_meta, indent=2), encoding='utf-8')

print('Saved outputs to', OUT_DIR)
headline_metrics = {k:v for k,v in metrics.items() if k != 'classification_report'}
print(json.dumps(headline_metrics, indent=2))


In [None]:
# Train and test diagnostics plots (saved to OUT_DIR/plots)
PLOTS_DIR = OUT_DIR / 'plots'
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

# Score the training rows with the fitted scaler/model, using the same imputation.
X_train_diag = train_df[feature_cols].fillna(medians)
X_train_diag_scaled = scaler.transform(X_train_diag)
y_pred_train = clf.predict(X_train_diag_scaled)
y_prob_train_spoof = clf.predict_proba(X_train_diag_scaled)[:, 1]

train_acc = float(np.mean(y_pred_train == y_train))
test_acc = float(np.mean(y_pred == y_test))
print('Train accuracy:', train_acc)
print('Test accuracy :', test_acc)

# 1) Train vs Test accuracy bar chart
split_names = ['Train (B)', 'Test (C)']
split_accs = [train_acc, test_acc]
fig, ax = plt.subplots(figsize=(5,4))
ax.bar(split_names, split_accs, color=['#1f77b4', '#ff7f0e'])
ax.set_ylim(0, 1)
ax.set_ylabel('Accuracy')
ax.set_title('TCAV-Feature Logistic Regression Accuracy')
for split_name, split_acc in zip(split_names, split_accs):
    # Value label placed slightly above the bar.
    ax.text(split_name, split_acc + 0.02, f'{split_acc:.3f}', ha='center', fontsize=10)
fig.tight_layout()
fig.savefig(PLOTS_DIR / 'train_vs_test_accuracy.png', dpi=150)
plt.show()

# 2) Confusion matrix (test); rows are true labels, columns are predictions
cm = confusion_matrix(y_test, y_pred, labels=[0,1])
fig, ax = plt.subplots(figsize=(4.8,4.2))
im = ax.imshow(cm, cmap='Blues')
ax.set_title('Test Confusion Matrix (C)')
ax.set_xticks([0,1])
ax.set_xticklabels(['pred bona', 'pred spoof'])
ax.set_yticks([0,1])
ax.set_yticklabels(['true bona', 'true spoof'])
for (i, j), cell_count in np.ndenumerate(cm):
    # imshow puts the column index on x and the row index on y.
    ax.text(j, i, str(cell_count), ha='center', va='center', color='black')
fig.colorbar(im, ax=ax)
fig.tight_layout()
fig.savefig(PLOTS_DIR / 'test_confusion_matrix.png', dpi=150)
plt.show()

# 3) Probability distributions (train vs test, by class)
fig, axes = plt.subplots(1, 2, figsize=(10,4), sharey=True)
prob_panels = [
    (axes[0], 'Train (B): Predicted P(spoof)', y_prob_train_spoof, y_train),
    (axes[1], 'Test (C): Predicted P(spoof)', y_prob_spoof, y_test),
]
for panel_ax, panel_title, panel_probs, panel_labels in prob_panels:
    panel_ax.hist(panel_probs[panel_labels==0], bins=20, alpha=0.7, label='bonafide (0)')
    panel_ax.hist(panel_probs[panel_labels==1], bins=20, alpha=0.7, label='spoof (1)')
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Predicted P(spoof)')
    panel_ax.legend(fontsize=8)
# The y axis is shared, so only the left panel carries the label.
axes[0].set_ylabel('Count')
fig.tight_layout()
fig.savefig(PLOTS_DIR / 'train_test_prob_spoof_hist.png', dpi=150)
plt.show()

print('Saved diagnostic plots in', PLOTS_DIR)


In [None]:
# Visualization: global concept importance and class differences
PLOTS_DIR = OUT_DIR / 'plots'
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

# Global coefficients top 25 (reversed so the largest bar is drawn at the top)
coef_plot = coef_df.iloc[:25].iloc[::-1].copy()
direction_palette = {'spoof': '#d62728', 'bonafide': '#1f77b4'}
colors = [direction_palette.get(d, '#7f7f7f') for d in coef_plot['direction']]
fig, ax = plt.subplots(figsize=(10, max(6, 0.32 * len(coef_plot))))
ax.barh(coef_plot['feature'], coef_plot['coefficient'], color=colors)
ax.axvline(0, color='black', linewidth=1)
ax.set_title('Global Concept Coefficients (positive=spoof, negative=bonafide)')
fig.tight_layout()
fig.savefig(PLOTS_DIR / 'global_concept_coefficients_top25.png', dpi=150)
plt.show()

# Mean difference (spoof - bonafide): the 15 most spoof-leaning and 15 most bonafide-leaning features
top_spoof = class_summary_df.sort_values('mean_diff_spoof_minus_bonafide', ascending=False).iloc[:15].iloc[::-1].copy()
top_bona = class_summary_df.sort_values('mean_diff_spoof_minus_bonafide', ascending=True).iloc[:15].iloc[::-1].copy()
diff_panels = [
    (top_spoof, '#d62728', 'Top Spoof-Leaning Concepts by Mean Difference (B+C)', 'class_diff_top_spoof_15.png'),
    (top_bona, '#1f77b4', 'Top Bonafide-Leaning Concepts by Mean Difference (B+C)', 'class_diff_top_bonafide_15.png'),
]
for panel_df, bar_color, panel_title, png_name in diff_panels:
    fig, ax = plt.subplots(figsize=(9, 6))
    ax.barh(panel_df['feature'], panel_df['mean_diff_spoof_minus_bonafide'], color=bar_color)
    ax.axvline(0, color='black', linewidth=1)
    ax.set_title(panel_title)
    fig.tight_layout()
    fig.savefig(PLOTS_DIR / png_name, dpi=150)
    plt.show()


In [None]:
# Visualization: per-user and per-system analyses (test group C)
PLOTS_DIR = OUT_DIR / 'plots'
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

# Per-user heatmap (fake/spoof samples only, test C)
spoof_user = user_contrib_df[user_contrib_df['label_id'] == 1].copy()
if spoof_user.empty:
    print('No spoof rows in user_contrib_df for test group C')
else:
    # Rows = speakers, columns = features; speaker/feature pairs without data show as 0.
    heat = spoof_user.pivot_table(index='speaker_id', columns='feature', values='mean_contribution', aggfunc='mean').fillna(0)
    # Figure size grows with the number of features (width) and speakers (height).
    plt.figure(figsize=(max(10, 0.35 * heat.shape[1]), max(4, 0.6 * heat.shape[0])))
    im = plt.imshow(heat.to_numpy(), aspect='auto', cmap='coolwarm')
    plt.colorbar(im, label='Mean contribution')
    plt.xticks(range(heat.shape[1]), heat.columns, rotation=90, fontsize=7)
    plt.yticks(range(heat.shape[0]), heat.index, fontsize=9)
    plt.title('Test Group C: Per-user Mean Concept Contributions (Spoof samples)')
    plt.tight_layout()
    plt.savefig(PLOTS_DIR / 'test_group_C_user_heatmap_spoof.png', dpi=150)
    plt.show()

# Per-system performance bar chart on test C (blue = bonafide, red = everything else)
sys_perf_plot = system_perf_df.copy().sort_values(['label_str_mode', 'system_id'])
plt.figure(figsize=(9, 4.5))
colors = ['#1f77b4' if x == 'bonafide' else '#d62728' for x in sys_perf_plot['label_str_mode']]
plt.bar(sys_perf_plot['system_id'], sys_perf_plot['accuracy'], color=colors)
plt.ylim(0, 1)
plt.ylabel('Accuracy on test C')
plt.title('Per-system Accuracy (Test Group C)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(PLOTS_DIR / 'test_group_C_system_accuracy.png', dpi=150)
plt.show()

# Top concepts per spoof system (mean contributions) table export:
# the 5 features with the largest mean contribution for each system_id with label 1.
top_system_rows = []
for sysid, g in system_contrib_df[system_contrib_df['label_id'] == 1].groupby('system_id'):
    gs = g.sort_values('mean_contribution', ascending=False).head(5)
    for rank, (_, r) in enumerate(gs.iterrows(), start=1):
        top_system_rows.append({
            'system_id': sysid,
            'rank': rank,
            'feature': r['feature'],
            'metric': r['metric'],
            'concept': r['concept'],
            'mean_contribution': float(r['mean_contribution']),
        })

# Explicit columns keep the sort and the CSV header valid when the test group has no
# spoof rows (the same situation the heatmap branch above guards against); a
# column-less DataFrame would raise KeyError in sort_values.
top_system_cols = ['system_id', 'rank', 'feature', 'metric', 'concept', 'mean_contribution']
top_system_df = (
    pd.DataFrame(top_system_rows, columns=top_system_cols)
    .sort_values(['system_id','rank'])
    .reset_index(drop=True)
)
if top_system_df.empty:
    print('No spoof rows in system_contrib_df for test group C')
top_system_df.to_csv(OUT_DIR / 'test_group_C_top_concepts_per_system.csv', index=False)
print('Saved:', OUT_DIR / 'test_group_C_top_concepts_per_system.csv')
display(top_system_df.head(40))
