In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

# === 1) Path to your original file ===
data_path = Path("dialoguesum_fixed.csv")   # ← your actual file name

# === 2) Load safely ===
# Stop with a readable message if the file is not where we expect it.
assert data_path.exists(), f"File not found: {data_path}"
df = pd.read_csv(data_path)

# === 3) Detect the human-score column automatically ===
# The first listed name that exists in the file wins. The order matches the
# BLEURT↔Human correlation cell ("mean_rating" ahead of "rating") so that both
# cells resolve to the same column when a file carries more than one of these names.
score_col = next((c for c in ["human_score","avg_human","mean_rating","rating"] if c in df.columns), None)
assert score_col is not None, f"No human score column found in {data_path.name}"

# === 4) Compute descriptive statistics ===
# Non-numeric entries are coerced to NaN and dropped, so n counts usable scores only.
# NOTE(review): statistics are taken over raw rows; if the file repeats a candidate
# on several rows, every repeat is counted — confirm that is intended.
scores = pd.to_numeric(df[score_col], errors="coerce").dropna()
n = len(scores)
mean = scores.mean()
# Sample standard deviation (ddof=1) is only computed when there are at least two scores.
std = scores.std(ddof=1) if n > 1 else np.nan

print("Dialoguesum Results:")
print(f"Mean Human Score: {mean:.4f}" if n else "Mean Human Score: NA")
print(f"Standard Deviation: {std:.4f}" if n > 1 else "Standard Deviation: NA")
print(f"N: {n}")


Dialoguesum Results:
Mean Human Score: 3.2881
Standard Deviation: 0.8617
N: 108


In [1]:
# ==================== DialogueSum — robust BLEURT↔Human correlation (key-agnostic) ====================
import pandas as pd, numpy as np, re
from pathlib import Path
from scipy.stats import pearsonr, spearmanr, kendalltau

# ---- input locations (adjust to your layout) ----
# Each list is searched front to back by first_existing(); the first path found on disk is used.
BLEURT_PATHS = [
    "dialoguesum_bleurt_scored_safe.csv",
    "dialoguesum_bleurt_scored.csv",
    "outputs/dialoguesum_bleurt_scored_safe.csv",
]
HUMAN_PATHS = ["dialoguesum_fixed.csv"]

def first_existing(paths):
    """Return the first entry of ``paths`` that exists on disk, as a ``Path``.

    Entries are tried in the order given; ``None`` is returned when none of
    them exists (or when ``paths`` is empty).
    """
    as_paths = (Path(entry) for entry in paths)
    return next((candidate for candidate in as_paths if candidate.exists()), None)

# Resolve both inputs first so a missing file is reported before anything is read.
bleurt_path, human_path = map(first_existing, (BLEURT_PATHS, HUMAN_PATHS))
assert bleurt_path is not None, "Couldn't find a BLEURT-scored CSV."
assert human_path  is not None, "Couldn't find a human-annotated CSV."

# db holds the BLEURT-scored rows, dh the human-annotated rows.
db, dh = pd.read_csv(bleurt_path), pd.read_csv(human_path)

# Show both schemas; the column auto-detection that follows keys off these names.
print("BLEURT cols:", db.columns.tolist())
print("HUMAN  cols:", dh.columns.tolist())

# ---------- helpers ----------
def oneline(s: str) -> str:
    """Flatten a value into one trimmed line of text.

    Missing values (as judged by ``pd.isna``) become ``""``. Anything else is
    converted with ``str()``, stripped of characters that cannot be encoded as
    UTF-8, and has every run of whitespace collapsed to a single space.
    """
    if pd.isna(s):
        return ""
    # Round-tripping through UTF-8 with "ignore" drops unencodable characters.
    text = str(s).encode("utf-8","ignore").decode("utf-8","ignore")
    return re.sub(r"\s+", " ", text).strip()

def make_key(s: pd.Series) -> pd.Series:
    """Build a text join key for every element of ``s``.

    Each value is flattened with ``oneline``, lowercased, and cut to its first
    160 characters. ``oneline`` maps missing values to ``""``, so they all end
    up with the empty-string key.
    """
    flattened = s.map(oneline)
    lowered = flattened.str.lower()
    return lowered.str[:160]

def choose(colnames, *candidates):
    """Return the first candidate name that is contained in ``colnames``.

    Each positional candidate is either a single name or a list/tuple of
    names; groups are expanded in place, so the overall left-to-right order is
    the preference order. Returns ``None`` when nothing matches.
    """
    flattened = (
        name
        for cand in candidates
        for name in (cand if isinstance(cand, (list, tuple)) else (cand,))
    )
    for name in flattened:
        if name in colnames:
            return name
    return None

# ---------- detect score + id + candidate text columns ----------
# choose() returns the first listed name that exists in the frame, or None.
bleurt_col = choose(db.columns, ("bleurt", "bleurt_score", "score", "score_bleurt", "BLEURT"))
human_col = choose(dh.columns, ("human_score", "avg_human", "mean_rating", "rating"))

# Both files are probed for the same id-like names.
# NOTE(review): a differently named id column (for example "example_id") is not
# recognised here, which pushes the join onto the text-key fallback — confirm intended.
id_b, id_h = (choose(frame.columns, "id", "dialogue_id", "sample_id") for frame in (db, dh))

# Text columns for the fallback key; each file lists its own preferred name first.
cand_b = choose(db.columns, ("candidate", "candidate_summary", "hypothesis", "text", "dialogue_text"))
cand_h = choose(dh.columns, ("candidate_summary", "candidate", "hypothesis", "text", "dialogue_text"))

assert bleurt_col, f"No BLEURT score-like column found in {bleurt_path.name}"
assert human_col,  f"No human score-like column found in {human_path.name}"

# numeric scores: anything unparsable becomes NaN
db["_bleurt"] = pd.to_numeric(db[bleurt_col], errors="coerce")
dh["_human"] = pd.to_numeric(dh[human_col], errors="coerce")

# ---------- build join keys (prefer id if present; otherwise text key) ----------
# Rows without a usable key must carry a real missing value in "_key" so that the
# dropna(subset=["_key", ...]) steps that follow can actually discard them.
use_id_join = id_b is not None and id_h is not None

if use_id_join:
    # astype(str) alone renders a missing id as the literal text "nan", which is not
    # null and would let id-less rows from the two files match each other.
    db["_key"] = db[id_b].astype(str).where(db[id_b].notna())
    dh["_key"] = dh[id_h].astype(str).where(dh[id_h].notna())
    key_label = f"id ({id_b}↔{id_h})"
else:
    # fall back to normalized candidate text
    assert cand_b and cand_h, (
        "No shared 'id' and no candidate text columns to build a key. "
        f"BLEURT has: {db.columns.tolist()} | HUMAN has: {dh.columns.tolist()}"
    )
    # make_key() yields "" for missing or blank text; blank those keys out so empty
    # candidates are dropped instead of being joined together as one sample.
    key_b = make_key(db[cand_b])
    key_h = make_key(dh[cand_h])
    db["_key"] = key_b.mask(key_b == "")
    dh["_key"] = key_h.mask(key_h == "")
    key_label = f"text-key ({cand_b}↔{cand_h})"

# One BLEURT value per key: the maximum over all rows sharing that key
# (swap .max() for .mean() to average instead).
bleurt_scored = db.dropna(subset=["_key","_bleurt"])
gb = bleurt_scored.groupby("_key", as_index=False)["_bleurt"].max()
gb = gb.rename(columns={"_bleurt":"bleurt"})

# One human score per key: drop_duplicates keeps the first row seen for each key.
# NOTE(review): if rows sharing a key carry different human scores, the later
# ones are ignored — confirm that is intended.
human_scored = dh.dropna(subset=["_key","_human"]).drop_duplicates(subset=["_key"])
gh = human_scored[["_key","_human"]].rename(columns={"_human":"human"})

# diagnostics: how many keys the two sides have in common
overlap = len(set(gb["_key"]) & set(gh["_key"]))
print(f"Join key used: {key_label}")
print(f"BLEURT keys: {gb.shape[0]} | HUMAN keys: {gh.shape[0]} | Overlap: {overlap}")

# merge: keep only keys present on both sides, with both scores available
df = gb.merge(gh, on="_key", how="inner")
df = df.dropna(subset=["bleurt","human"]).reset_index(drop=True)

print(f"Merged rows: {len(df)}")
if len(df) < 2:
    # Show a few unmatched keys from each side to help diagnose the mismatch.
    only_in_bleurt = ~gb["_key"].isin(gh["_key"])
    only_in_human = ~gh["_key"].isin(gb["_key"])
    ex_missing_h = gb.loc[only_in_bleurt, "_key"].head(5).tolist()
    ex_missing_b = gh.loc[only_in_human, "_key"].head(5).tolist()
    print("Examples missing in HUMAN:", ex_missing_h)
    print("Examples missing in BLEURT:", ex_missing_b)
    raise ValueError("Not enough overlap (need ≥2) after building keys. "
                     "If you intended to join on a different field, set cand_b/cand_h or id_b/id_h accordingly.")

# ---------- correlations ----------
# x = human scores, y = BLEURT scores, aligned row by row.
x, y = df["human"].to_numpy(), df["bleurt"].to_numpy()

def safe_corr(a, b, fn):
    """Apply correlation function ``fn`` to ``a`` and ``b``, falling back to NaN.

    Args:
        a, b: Numeric sequences or arrays; expected to have the same length.
        fn: Callable invoked as ``fn(a, b)`` whose result unpacks as
            ``(statistic, p_value)``.

    Returns:
        Whatever ``fn(a, b)`` returns, or ``(nan, nan)`` when either input has
        fewer than two values, either input is constant, or ``fn`` raises.
        A failure inside ``fn`` is reported through a ``RuntimeWarning``.
    """
    import warnings  # local import: the notebook's import line does not provide it

    if len(a) < 2 or len(b) < 2:
        return (np.nan, np.nan)
    # max == min is an exact constancy test; np.std(...) == 0 can miss constant
    # float inputs because the rounded mean leaves a tiny non-zero deviation.
    if np.max(a) == np.min(a) or np.max(b) == np.min(b):
        return (np.nan, np.nan)
    try:
        return fn(a, b)
    except Exception as exc:
        # Stay best-effort, but say why the value is missing instead of hiding it.
        warnings.warn(
            f"{getattr(fn, '__name__', 'correlation')} failed: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return (np.nan, np.nan)

# Each call yields (statistic, p-value); safe_corr substitutes (nan, nan) when undefined.
(r, rp), (s, sp), (t, tp) = (safe_corr(x, y, fn) for fn in (pearsonr, spearmanr, kendalltau))

print("\n=== DialogueSum (BLEURT-20) ===")
print(f"Samples (keys): {len(df)}  |  Join: {key_label}  |  Collapse: max-over-refs")
print(f"Pearson r = {r:.4f} (p = {rp if not np.isnan(rp) else 'NA'})")
print(f"Spearman ρ = {s:.4f} (p = {sp if not np.isnan(sp) else 'NA'})")
print(f"Kendall τ = {t:.4f} (p = {tp if not np.isnan(tp) else 'NA'})")

# save summary: one row per coefficient, with the run settings repeated on every row
metric_names = ["pearson_r","spearman_rho","kendall_tau"]
n_metrics = len(metric_names)
summary = pd.DataFrame({
    "metric": metric_names,
    "value": [r, s, t],
    "p_value": [rp, sp, tp],
    "n": [len(df)] * n_metrics,
    "join": [key_label] * n_metrics,
    "collapse": ["max"] * n_metrics,
})
summary.to_csv("dialoguesum_bleurt_human_corr.csv", index=False)
print("\nSaved: dialoguesum_bleurt_human_corr.csv")


BLEURT cols: ['example_id', 'candidate', 'reference', 'bleurt']
HUMAN  cols: ['id', 'topic', 'dialogue_text', 'candidate_summary', 'reference_summary_1', 'reference_summary_2', 'human_score']
Join key used: text-key (candidate↔candidate_summary)
BLEURT keys: 35 | HUMAN keys: 54 | Overlap: 34
Merged rows: 34

=== DialogueSum (BLEURT-20) ===
Samples (keys): 34  |  Join: text-key (candidate↔candidate_summary)  |  Collapse: max-over-refs
Pearson r = 0.0337 (p = 0.8497953546278006)
Spearman ρ = 0.1171 (p = 0.5096524340866534)
Kendall τ = 0.1020 (p = 0.39790712840110853)

Saved: dialoguesum_bleurt_human_corr.csv
