# Aggregate metrics from `outputs`

Reads every prediction CSV under each model folder in `outputs/` and reports the mean +/- standard deviation for the classification report metrics, One-vs-Rest AUC, and a row-normalized confusion matrix.

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.preprocessing import label_binarize
from IPython.display import display, Markdown

# Project paths and label mapping
base_dir = Path("outputs")
label_map = {0: "negative", 1: "neutral", 2: "positive"}
class_order = list(label_map.keys())
class_names = [label_map[i] for i in class_order]


In [2]:

# Helpers for formatting and aggregation

def format_mean_std(mean: float, std: float) -> str:
    std = 0.0 if pd.isna(std) else std
    return f"{mean:.4f} +/- {std:.4f}"

def load_runs(model_path: Path):
    runs = []
    for fp in sorted(model_path.glob("*.csv")):
        df = pd.read_csv(fp)
        if not {"true_label", "pred_label"}.issubset(df.columns):
            print(f"Skipping {fp.name} because it lacks true_label/pred_label.")
            continue
        y_true = df["true_label"].astype(int)
        y_pred = df["pred_label"].astype(int)
        prob_cols = [f"prob_{i}" for i in class_order]
        probs = df[prob_cols].values if all(col in df.columns for col in prob_cols) else None
        runs.append({"file": fp.name, "y_true": y_true, "y_pred": y_pred, "probs": probs})
    return runs

def aggregate_classification_report(runs):
    reports = []
    for run in runs:
        report = classification_report(
            run["y_true"],
            run["y_pred"],
            target_names=class_names,
            output_dict=True,
            zero_division=0,
        )
        reports.append(pd.DataFrame(report).T)
    stacked = pd.concat(reports, keys=range(len(reports)), names=["run"])
    stats = stacked.groupby(level=1).agg(["mean", "std"])
    formatted = pd.DataFrame(index=stats.index)
    for col in stats.columns.levels[0]:
        formatted[col] = stats[col].apply(lambda r: format_mean_std(r["mean"], r["std"]), axis=1)
    return formatted

def aggregate_auc(runs):
    prob_runs = [run for run in runs if run["probs"] is not None]
    if not prob_runs:
        return None
    auc_rows = []
    for run in prob_runs:
        y_true_bin = label_binarize(run["y_true"], classes=class_order)
        auc_rows.append({
            class_names[i]: roc_auc_score(y_true_bin[:, i], run["probs"][:, i])
            for i in range(len(class_order))
        })
    df = pd.DataFrame(auc_rows)
    summary = pd.DataFrame(index=df.columns)
    summary["ovr_auc"] = [format_mean_std(df[col].mean(), df[col].std(ddof=0)) for col in df.columns]
    return summary

def aggregate_confusion(runs):
    matrices = []
    for run in runs:
        cm = confusion_matrix(run["y_true"], run["y_pred"], labels=class_order)
        row_sums = cm.sum(axis=1, keepdims=True)
        row_sums[row_sums == 0] = 1
        matrices.append(cm / row_sums)
    cube = np.stack(matrices)
    mean = cube.mean(axis=0)
    std = cube.std(axis=0)
    data = []
    for i, cname in enumerate(class_names):
        row = {}
        for j, pred in enumerate(class_names):
            row[pred] = format_mean_std(mean[i, j], std[i, j])
        data.append(row)
    return pd.DataFrame(data, index=class_names)


In [3]:

# Evaluate every model folder under outputs/
model_dirs = sorted([p for p in base_dir.iterdir() if p.is_dir()], reverse=True)
if not model_dirs:
    raise ValueError("No model directories found in outputs/.")

for model_dir in model_dirs:
    runs = load_runs(model_dir)
    if not runs:
        display(Markdown(f"### {model_dir.name} (no runs found)"))
        continue

    display(Markdown(f"## {model_dir.name} ({len(runs)} run(s))"))

    display(Markdown("**Classification report (mean +/- std)**"))
    display(aggregate_classification_report(runs))

    auc_table = aggregate_auc(runs)
    if auc_table is not None:
        display(Markdown("**One-vs-Rest AUC (mean +/- std)**"))
        display(auc_table)
    else:
        display(Markdown("**One-vs-Rest AUC:** skipped because no probability columns were found."))

    display(Markdown("**Row-normalized confusion matrix (mean +/- std)**"))
    display(aggregate_confusion(runs))


## naive-bayes (10 run(s))

**Classification report (mean +/- std)**

Unnamed: 0,precision,recall,f1-score,support
accuracy,0.7030 +/- 0.0023,0.7030 +/- 0.0023,0.7030 +/- 0.0023,0.7030 +/- 0.0023
macro avg,0.6240 +/- 0.0042,0.6540 +/- 0.0055,0.6182 +/- 0.0033,3673.0000 +/- 0.0000
negative,0.3953 +/- 0.0086,0.7101 +/- 0.0218,0.5076 +/- 0.0067,467.0000 +/- 0.0000
neutral,0.6345 +/- 0.0135,0.4461 +/- 0.0233,0.5233 +/- 0.0126,925.0000 +/- 0.0000
positive,0.8424 +/- 0.0101,0.8058 +/- 0.0095,0.8236 +/- 0.0022,2281.0000 +/- 0.0000
weighted avg,0.7332 +/- 0.0054,0.7030 +/- 0.0023,0.7078 +/- 0.0028,3673.0000 +/- 0.0000


**One-vs-Rest AUC (mean +/- std)**

Unnamed: 0,ovr_auc
negative,0.8654 +/- 0.0047
neutral,0.8214 +/- 0.0037
positive,0.8698 +/- 0.0041


**Row-normalized confusion matrix (mean +/- std)**

Unnamed: 0,negative,neutral,positive
negative,0.7101 +/- 0.0207,0.0949 +/- 0.0124,0.1951 +/- 0.0120
neutral,0.2801 +/- 0.0164,0.4461 +/- 0.0221,0.2738 +/- 0.0267
positive,0.1091 +/- 0.0054,0.0851 +/- 0.0079,0.8058 +/- 0.0091


## lstm (10 run(s))

**Classification report (mean +/- std)**

Unnamed: 0,precision,recall,f1-score,support
accuracy,0.7531 +/- 0.0049,0.7531 +/- 0.0049,0.7531 +/- 0.0049,0.7531 +/- 0.0049
macro avg,0.6715 +/- 0.0049,0.7223 +/- 0.0073,0.6885 +/- 0.0051,3673.0000 +/- 0.0000
negative,0.4908 +/- 0.0127,0.6925 +/- 0.0302,0.5740 +/- 0.0098,467.0000 +/- 0.0000
neutral,0.6014 +/- 0.0163,0.6789 +/- 0.0259,0.6373 +/- 0.0078,925.0000 +/- 0.0000
positive,0.9224 +/- 0.0110,0.7955 +/- 0.0130,0.8541 +/- 0.0037,2281.0000 +/- 0.0000
weighted avg,0.7867 +/- 0.0046,0.7531 +/- 0.0049,0.7639 +/- 0.0040,3673.0000 +/- 0.0000


**One-vs-Rest AUC (mean +/- std)**

Unnamed: 0,ovr_auc
negative,0.8994 +/- 0.0049
neutral,0.8678 +/- 0.0031
positive,0.9140 +/- 0.0017


**Row-normalized confusion matrix (mean +/- std)**

Unnamed: 0,negative,neutral,positive
negative,0.6925 +/- 0.0286,0.2028 +/- 0.0343,0.1047 +/- 0.0215
neutral,0.2083 +/- 0.0145,0.6789 +/- 0.0246,0.1128 +/- 0.0172
positive,0.0629 +/- 0.0061,0.1416 +/- 0.0103,0.7955 +/- 0.0123


## bert (10 run(s))

**Classification report (mean +/- std)**

Unnamed: 0,precision,recall,f1-score,support
accuracy,0.8134 +/- 0.0029,0.8134 +/- 0.0029,0.8134 +/- 0.0029,0.8134 +/- 0.0029
macro avg,0.7478 +/- 0.0055,0.7751 +/- 0.0031,0.7598 +/- 0.0038,3673.0000 +/- 0.0000
negative,0.6545 +/- 0.0178,0.7328 +/- 0.0160,0.6910 +/- 0.0037,467.0000 +/- 0.0000
neutral,0.6676 +/- 0.0046,0.7279 +/- 0.0179,0.6963 +/- 0.0075,925.0000 +/- 0.0000
positive,0.9213 +/- 0.0038,0.8647 +/- 0.0040,0.8921 +/- 0.0017,2281.0000 +/- 0.0000
weighted avg,0.8235 +/- 0.0029,0.8134 +/- 0.0029,0.8172 +/- 0.0028,3673.0000 +/- 0.0000


**One-vs-Rest AUC (mean +/- std)**

Unnamed: 0,ovr_auc
negative,0.9482 +/- 0.0014
neutral,0.9098 +/- 0.0020
positive,0.9445 +/- 0.0007


**Row-normalized confusion matrix (mean +/- std)**

Unnamed: 0,negative,neutral,positive
negative,0.7328 +/- 0.0152,0.1820 +/- 0.0119,0.0852 +/- 0.0056
neutral,0.1329 +/- 0.0145,0.7279 +/- 0.0170,0.1392 +/- 0.0101
positive,0.0256 +/- 0.0024,0.1097 +/- 0.0039,0.8647 +/- 0.0038
