
# 2. Train Logistic Regression (15 Users Train / 5 Users Test)

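This notebook trains a speaker-independent logistic regression (train and test speakers are disjoint) on the prepared sample-level CSV to separate real (`true label` = 0) from fake (`true label` = 1) samples.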

- **Input:** output from `tcav_concept_prepare_csv.ipynb`
- **Output:** train/test splits, metrics, coefficients, per-user contributions, predictions.

This notebook is self-contained (does not import your `.py` script).


In [None]:

import json
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Project root. Defaults to the original location, but can be redirected with the
# BIOVOICE_ROOT environment variable so the notebook also runs on other machines.
PROJECT_ROOT = Path(os.environ.get('BIOVOICE_ROOT', '/home/SpeakerRec/BioVoice'))
PREPARED_CSV = PROJECT_ROOT / 'data' / 'tcav' / 'prepared_csvs' / 'stage4_spoofwrapper_positive_percentage' / 'sample_level_features_all.csv'
OUT_DIR = PROJECT_ROOT / 'data' / 'tcav' / 'logreg_concept_analysis' / 'stage4_spoofwrapper_pospct'
OUT_DIR.mkdir(parents=True, exist_ok=True)

NUM_SPEAKERS = 20                # size of the speaker pool drawn from the CSV
TRAIN_SPEAKERS = 15              # speakers assigned to the training split
TEST_SPEAKERS = 5                # speakers assigned to the held-out split
RANDOM_SEED = 42                 # seeds both speaker selection and the train/test split
PREFER_BALANCED_SPEAKERS = True  # favour speakers that have both label-0 and label-1 rows

# Fail early, with a readable message, if the split sizes cannot fit in the pool.
assert TRAIN_SPEAKERS + TEST_SPEAKERS <= NUM_SPEAKERS, (
    f'TRAIN_SPEAKERS + TEST_SPEAKERS ({TRAIN_SPEAKERS + TEST_SPEAKERS}) exceeds NUM_SPEAKERS ({NUM_SPEAKERS})'
)

print('PREPARED_CSV =', PREPARED_CSV)
print('OUT_DIR =', OUT_DIR)


In [None]:

# Load prepared sample-level CSV
assert PREPARED_CSV.exists(), f'Missing prepared CSV: {PREPARED_CSV}'
df_wide = pd.read_csv(PREPARED_CSV)

# Metadata columns that the later cells select by name.
required_meta = ['idx', 'speaker_id', 'system_id', 'key', 'true label']
for c in required_meta:
    assert c in df_wide.columns, f'Missing column: {c}'

# Feature columns are recognised purely by the '__' separator in their name.
feature_cols = [c for c in df_wide.columns if '__' in c]
assert feature_cols, 'No feature columns found (expected names like metric__concept)'

df_wide['idx'] = df_wide['idx'].astype(str)
df_wide['speaker_id'] = df_wide['speaker_id'].astype(str)

# Validate labels before the integer cast: unparseable values become NaN under
# errors='coerce' and would otherwise crash astype(int) with an opaque message,
# while non-integer values would be silently truncated. The rest of the notebook
# assumes a binary 0 (real) / 1 (fake) target.
labels_numeric = pd.to_numeric(df_wide['true label'], errors='coerce')
bad_label_mask = ~labels_numeric.isin([0, 1])
if bad_label_mask.any():
    bad_examples = df_wide.loc[bad_label_mask, 'true label'].unique()[:5].tolist()
    raise ValueError(
        f"Column 'true label' must contain only 0 (real) or 1 (fake); "
        f"found {int(bad_label_mask.sum())} invalid row(s), e.g. {bad_examples}"
    )
df_wide['true label'] = labels_numeric.astype(int)

print('Rows:', len(df_wide))
print('Speakers:', df_wide['speaker_id'].nunique())
print('Features:', len(feature_cols))
print('Class counts (true label):', df_wide['true label'].value_counts().sort_index().to_dict())


In [None]:

# Speaker selection and 15/5 split (speaker-independent)
def choose_speakers(df, num_speakers=20, seed=42, prefer_balanced=True):
    """Pick ``num_speakers`` speaker ids from ``df`` reproducibly.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'speaker_id'`` and ``'true label'``.
    num_speakers : int
        Number of distinct speakers to return.
    seed : int
        Seed for ``numpy.random.default_rng``; the same seed gives the same pick.
    prefer_balanced : bool
        When True, speakers that have at least one row with label 0 and at
        least one row with label 1 are favoured. If there are enough of them,
        the pick is drawn only from those speakers. If there are too few, all
        of them are kept and the remainder is drawn at random from the other
        speakers.

    Returns
    -------
    list of str
        Sorted speaker ids (as strings), without duplicates.

    Raises
    ------
    ValueError
        If ``df`` holds fewer than ``num_speakers`` distinct speakers.
    """
    speakers = sorted(df['speaker_id'].astype(str).unique().tolist())
    if len(speakers) < num_speakers:
        raise ValueError(f'Requested {num_speakers} speakers, found {len(speakers)}')
    rng = np.random.default_rng(seed)

    if prefer_balanced:
        # Rows = speakers, columns = label values, cells = sample counts.
        counts = df.groupby(['speaker_id', 'true label']).size().unstack(fill_value=0)
        has_real = counts[0] > 0 if 0 in counts.columns else pd.Series(False, index=counts.index)
        has_fake = counts[1] > 0 if 1 in counts.columns else pd.Series(False, index=counts.index)
        balanced = sorted(counts.index[(has_real) & (has_fake)].astype(str).tolist())
        if len(balanced) >= num_speakers:
            return sorted(rng.choice(balanced, size=num_speakers, replace=False).tolist())

        # Too few balanced speakers: keep every one of them and top up from the
        # rest, instead of discarding the preference and sampling from everyone.
        balanced_set = set(balanced)
        others = [s for s in speakers if s not in balanced_set]
        n_extra = num_speakers - len(balanced_set)
        extra = rng.choice(others, size=n_extra, replace=False).tolist()
        return sorted(balanced_set.union(extra))

    return sorted(rng.choice(speakers, size=num_speakers, replace=False).tolist())

# Draw the speaker pool, then partition the speakers themselves (not the rows),
# so that every speaker lands in at most one of the two splits.
selected_speakers = choose_speakers(
    df_wide,
    num_speakers=NUM_SPEAKERS,
    seed=RANDOM_SEED,
    prefer_balanced=PREFER_BALANCED_SPEAKERS,
)
train_speakers, test_speakers = (
    sorted(part)
    for part in train_test_split(
        selected_speakers,
        train_size=TRAIN_SPEAKERS,
        test_size=TEST_SPEAKERS,
        random_state=RANDOM_SEED,
        shuffle=True,
    )
)

# Row-level frames derived from the speaker lists (independent copies).
in_selected = df_wide['speaker_id'].isin(selected_speakers)
selected_df = df_wide.loc[in_selected].copy()
in_train = selected_df['speaker_id'].isin(train_speakers)
in_test = selected_df['speaker_id'].isin(test_speakers)
train_df = selected_df.loc[in_train].copy()
test_df = selected_df.loc[in_test].copy()

print('Train speakers:', train_speakers)
print('Test speakers :', test_speakers)
for split_label, split_df in (('Train rows:', train_df), ('Test rows :', test_df)):
    class_counts = split_df['true label'].value_counts().sort_index().to_dict()
    print(split_label, len(split_df), '| class counts:', class_counts)


In [None]:

# Train logistic regression
# Feature matrices per split; copies keep the imputation away from train_df / test_df.
X_train = train_df.loc[:, feature_cols].copy()
X_test = test_df.loc[:, feature_cols].copy()

# Impute gaps with per-feature medians estimated on the training split only,
# and reuse those same values for the test split.
medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(medians)
X_test = X_test.fillna(medians)

# Targets: 1 = fake, 0 = real
y_train = train_df['true label'].astype(int).to_numpy()
y_test = test_df['true label'].astype(int).to_numpy()

# Standardize with statistics learned from the training split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = LogisticRegression(
    max_iter=2000,
    class_weight='balanced',
    solver='liblinear',
    random_state=0,
).fit(X_train_scaled, y_train)

y_pred = clf.predict(X_test_scaled)
y_prob_fake = clf.predict_proba(X_test_scaled)[:, 1]  # column 1 = probability of class 1 (fake)

# Shared arguments for the metrics that treat "fake" (label 1) as the positive class.
fake_kwargs = {'pos_label': 1, 'zero_division': 0}
class_ids = [0, 1]
metrics = {
    'accuracy': float(accuracy_score(y_test, y_pred)),
    'precision_fake_1': float(precision_score(y_test, y_pred, **fake_kwargs)),
    'recall_fake_1': float(recall_score(y_test, y_pred, **fake_kwargs)),
    'f1_fake_1': float(f1_score(y_test, y_pred, **fake_kwargs)),
    'confusion_matrix_labels_[0_real,1_fake]': confusion_matrix(y_test, y_pred, labels=class_ids).tolist(),
    'classification_report': classification_report(y_test, y_pred, labels=class_ids, zero_division=0),
}

# The text report is printed on its own so its line breaks stay readable.
compact_metrics = {name: value for name, value in metrics.items() if name != 'classification_report'}
print(json.dumps(compact_metrics, indent=2))
print(metrics['classification_report'])


In [None]:

# Global concept importance (coefficients)
def _coef_direction(value):
    """Name the class a coefficient pushes towards: positive -> 'fake', negative -> 'real'."""
    if value > 0:
        return 'fake'
    if value < 0:
        return 'real'
    return 'neutral'


coef = clf.coef_.ravel()

# One record per feature; feature names are '<metric>__<concept>'.
coef_rows = [
    {
        'feature': name,
        'metric': name.split('__', 1)[0],
        'concept': name.split('__', 1)[1],
        'coefficient': float(weight),
        'abs_coefficient': float(abs(weight)),
        'direction': _coef_direction(weight),
    }
    for name, weight in zip(feature_cols, coef)
]

# Largest-magnitude coefficients first.
coef_df = pd.DataFrame(coef_rows)
coef_df = coef_df.sort_values('abs_coefficient', ascending=False)
coef_df = coef_df.reset_index(drop=True)
display(coef_df.head(20))


In [None]:

# Class-wise concept summary (real vs fake means)
def _class_stat(table, label, feature):
    """Return ``table.loc[label, feature]`` as a float, or NaN if the label row is absent or the value is missing."""
    if label not in table.index:
        return np.nan
    value = table.at[label, feature]
    return float(value) if pd.notna(value) else np.nan


# Aggregate every feature in one pass instead of re-grouping the frame per feature.
_features_by_label = selected_df.groupby('true label')[feature_cols]
class_means = _features_by_label.mean()
class_medians = _features_by_label.median()

summary_rows = []
for feature in feature_cols:
    metric, concept = feature.split('__', 1)
    real_mean = _class_stat(class_means, 0, feature)
    fake_mean = _class_stat(class_means, 1, feature)
    real_median = _class_stat(class_medians, 0, feature)
    fake_median = _class_stat(class_medians, 1, feature)
    summary_rows.append({
        'feature': feature,
        'metric': metric,
        'concept': concept,
        'real_mean_true_label_0': real_mean,
        'fake_mean_true_label_1': fake_mean,
        'real_median_true_label_0': real_median,
        'fake_median_true_label_1': fake_median,
        # Only defined when both classes have a mean for this feature.
        'mean_diff_fake_minus_real': fake_mean - real_mean if pd.notna(fake_mean) and pd.notna(real_mean) else np.nan,
    })
class_summary_df = pd.DataFrame(summary_rows)
display(class_summary_df.sort_values('mean_diff_fake_minus_real', ascending=False).head(10))


In [None]:

# Per-sample and per-user concept contributions on test set
meta_cols = ['idx', 'speaker_id', 'system_id', 'key', 'true label']

# Standardized feature value times its coefficient: each feature's term in the
# model's linear score (the intercept is not included).
coef_row = clf.coef_.ravel().reshape(1, -1)
contrib_matrix = X_test_scaled * coef_row
contrib_df = pd.DataFrame(contrib_matrix, columns=feature_cols)

# Metadata plus predictions; the index is reset so rows line up positionally with contrib_df.
pred_df = test_df[meta_cols].reset_index(drop=True).copy()
pred_df['pred_label'] = y_pred
pred_df['pred_prob_fake'] = y_prob_fake

# Long format: one row per (sample, feature).
sample_contrib_wide = pd.concat([pred_df[meta_cols], contrib_df], axis=1)
sample_contrib_long = sample_contrib_wide.melt(
    id_vars=meta_cols,
    var_name='feature',
    value_name='contribution',
)
name_parts = sample_contrib_long['feature'].str.split('__', n=1, expand=True)
sample_contrib_long[['metric', 'concept']] = name_parts

# Average contribution of each feature per speaker and true label.
user_group_keys = ['speaker_id', 'true label', 'feature', 'metric', 'concept']
user_contrib = sample_contrib_long.groupby(user_group_keys, as_index=False)['contribution'].mean()
user_contrib = user_contrib.rename(columns={'contribution': 'mean_contribution'})

def top_concepts_per_user(user_df, top_k=3):
    """List the strongest concepts in each direction for every (speaker, true label) group.

    Parameters
    ----------
    user_df : pandas.DataFrame
        Needs the columns ``speaker_id``, ``true label``, ``feature``,
        ``metric``, ``concept`` and ``mean_contribution``.
    top_k : int
        Maximum number of rows kept per group and per list type.

    Returns
    -------
    pandas.DataFrame
        Columns ``speaker_id``, ``true label``, ``list_type``, ``rank``,
        ``feature``, ``metric``, ``concept``, ``mean_contribution``.
        ``list_type`` is ``'top_fake_supporting'`` for the largest
        contributions and ``'top_real_supporting'`` for the smallest; ``rank``
        starts at 1. Rows are sorted by speaker, label, list type and rank.
        An empty ``user_df`` gives an empty frame that still has these columns.
    """
    out_cols = ['speaker_id', 'true label', 'list_type', 'rank',
                'feature', 'metric', 'concept', 'mean_contribution']
    # (list_type, ascending): largest values first for "fake", smallest first for "real".
    orderings = (('top_fake_supporting', False), ('top_real_supporting', True))

    rows = []
    for (speaker_id, true_label), g in user_df.groupby(['speaker_id', 'true label']):
        for list_type, ascending in orderings:
            ranked = g.sort_values('mean_contribution', ascending=ascending).head(top_k)
            for rank, (_, r) in enumerate(ranked.iterrows(), start=1):
                rows.append({'speaker_id': speaker_id, 'true label': int(true_label), 'list_type': list_type, 'rank': rank,
                             'feature': r['feature'], 'metric': r['metric'], 'concept': r['concept'], 'mean_contribution': float(r['mean_contribution'])})

    # Passing the columns explicitly keeps the sort keys present even when no rows were collected.
    result = pd.DataFrame(rows, columns=out_cols)
    return result.sort_values(['speaker_id', 'true label', 'list_type', 'rank']).reset_index(drop=True)

# Three strongest concepts in each direction for every speaker/label group of the test set.
top_user_df = top_concepts_per_user(user_df=user_contrib, top_k=3)
display(top_user_df.iloc[:30])


In [None]:

# Save training outputs for later visualization notebook
# File name -> frame, written in this order, all without the index column.
csv_outputs = {
    'train_samples_15speakers.csv': train_df,
    'test_samples_5speakers.csv': test_df,
    'sample_level_features_selected_20speakers.csv': selected_df,
    'global_concept_coefficients.csv': coef_df,
    'classwise_concept_summary.csv': class_summary_df,
    'test_predictions.csv': pred_df,
    'test_sample_contributions_long.csv': sample_contrib_long,
    'test_user_mean_contributions.csv': user_contrib,
    'test_user_top_concepts.csv': top_user_df,
}
for csv_name, frame in csv_outputs.items():
    frame.to_csv(OUT_DIR / csv_name, index=False)

# Persisting the fitted objects is best-effort: a failure is reported, not raised.
try:
    import joblib
    for artifact_name, fitted in (('scaler.joblib', scaler), ('logreg.joblib', clf)):
        joblib.dump(fitted, OUT_DIR / artifact_name)
except Exception as e:
    print('[WARN] Could not save joblib files:', e)

# Everything needed to identify this run: inputs, split, sizes, label meaning, scores.
speaker_split = {
    'train_speakers': train_speakers,
    'test_speakers': test_speakers,
}
row_counts = {
    count_name: int(len(frame))
    for count_name, frame in (
        ('all_rows', df_wide),
        ('selected_rows', selected_df),
        ('train_rows', train_df),
        ('test_rows', test_df),
    )
}
label_mapping = {
    'true_label_0': 'real',
    'true_label_1': 'fake',
}
run_metadata = {
    'prepared_csv': str(PREPARED_CSV),
    'out_dir': str(OUT_DIR),
    'random_seed': RANDOM_SEED,
    'speaker_split': speaker_split,
    'counts': row_counts,
    'label_mapping': label_mapping,
    'metrics': metrics,
}
metadata_text = json.dumps(run_metadata, indent=2)
(OUT_DIR / 'run_metadata.json').write_text(metadata_text, encoding='utf-8')
print('Saved outputs to', OUT_DIR)
