In [1]:
import pandas as pd, re, json
from pathlib import Path

df = pd.read_csv("dialoguesum_lite_100_clean_improved.csv")
print("Initial shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head(3)


Initial shape: (108, 7)
Columns: ['id', 'topic', 'dialogue_text', 'candidate_summary', 'reference_summary_1', 'reference_summary_2', 'human_score']


Unnamed: 0,id,topic,dialogue_text,candidate_summary,reference_summary_1,reference_summary_2,human_score
0,manual_001,formal_greetings,"JAMES: Good morning, Professor Austin, how are...","James greets Professor Austin, presents Emma, ...",James introduces Emma to Professor Austin and ...,"James greets Professor Austin, presents Emma, ...",3.08
1,manual_002,informal_greetings,"JANE: Hi, Helen! How’s it going? HELEN: Fine, ...","Two friends exchange greetings, discuss study ...","Jane greets Helen, learns she is heading to th...","Two friends exchange greetings, discuss study ...",2.74
2,manual_003,formal_introductions,"MARGARET: Mr. Wilson, I’d like you to meet Dr....",A formal introduction leads to a brief exchang...,Margaret formally introduces Mr. Wilson and Dr...,A formal introduction leads to a brief exchang...,3.19


In [2]:
def clean_text(s: str) -> str:
    if pd.isna(s): 
        return ""
    s = str(s)
    s = s.encode("utf-8","ignore").decode("utf-8","ignore")
    s = re.sub(r"[^\S\r\n]+", " ", s)    # collapse multiple spaces/tabs
    s = re.sub(r"\s*\n\s*", " ", s)      # merge newlines
    s = s.replace("…","...").strip()
    return s


In [3]:
# Clean text fields
df["dialogue_text"] = df["dialogue_text"].apply(clean_text)
df["reference_summary_1"] = df["reference_summary_1"].apply(clean_text)
df["reference_summary_2"] = df["reference_summary_2"].apply(clean_text)

# Drop rows where dialogue or summary is too short
df = df[(df["dialogue_text"].str.len() > 20) & (df["reference_summary_1"].str.len() > 10)]
df = df.drop_duplicates(subset=["dialogue_text","reference_summary_1"])
df = df.reset_index(drop=True)
print("After cleaning:", df.shape)


After cleaning: (95, 7)


In [4]:
rows = []
for _, r in df.iterrows():
    for ref in [r.reference_summary_1, r.reference_summary_2]:
        rows.append({
            "id": r.id,
            "topic": r.topic,
            "candidate": r.dialogue_text,
            "reference": ref,
        })
df_long = pd.DataFrame(rows)
print("Expanded shape:", df_long.shape)
df_long.head(3)


Expanded shape: (190, 4)


Unnamed: 0,id,topic,candidate,reference
0,manual_001,formal_greetings,"JAMES: Good morning, Professor Austin, how are...",James introduces Emma to Professor Austin and ...
1,manual_001,formal_greetings,"JAMES: Good morning, Professor Austin, how are...","James greets Professor Austin, presents Emma, ..."
2,manual_002,informal_greetings,"JANE: Hi, Helen! How’s it going? HELEN: Fine, ...","Jane greets Helen, learns she is heading to th..."


In [7]:
out_dir = Path("./dialoguesum_bleurt_ready")
out_dir.mkdir(exist_ok=True, parents=True)

csv_path  = out_dir / "dialoguesum_bleurt_ready.csv"
jsonl_path = out_dir / "dialoguesum_bleurt_ready.jsonl"

df_long.to_csv(csv_path, index=False)
df_long.to_json(jsonl_path, orient="records", lines=True, force_ascii=False)

print("Saved:")
print(" -", csv_path.resolve())
print(" -", jsonl_path.resolve())


Saved:
 - /workspaces/bleurt/dialoguesum_bleurt_ready/dialoguesum_bleurt_ready.csv
 - /workspaces/bleurt/dialoguesum_bleurt_ready/dialoguesum_bleurt_ready.jsonl


In [5]:
from bleurt import score as bleurt_score
import numpy as np

ckpt = "./BLEURT-20"   # or bleurt-large-512
scorer = bleurt_score.BleurtScorer(ckpt)

sample = df_long.sample(10, random_state=42)
scores = scorer.score(candidates=sample["candidate"].tolist(),
                      references=sample["reference"].tolist())
sample["bleurt"] = np.round(scores,3)
sample[["id","topic","bleurt"]]


2025-11-01 07:18:06.230075: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-01 07:18:28.760233: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-01 07:18:41.642859: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


INFO:tensorflow:Reading checkpoint ./BLEURT-20.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint BLEURT-20
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:BLEURT-20
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:... vocab_file:None
INFO:tensorflow:... do_lower_case:None
INFO:tensorflow:... sp_model:sent_piece
INFO:tensorflow:... dynamic_seq_length:True
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Will load model: ./BLEURT-20/sent_piece.model.
INFO:tensorflow:SentencePiece tokenizer created.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.


2025-11-01 07:18:49.558384: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
2025-11-01 07:18:50.928318: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 256307200 exceeds 10% of free system memory.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.
2025-11-01 07:21:56.979935: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 23592960 exceeds 10% of free system memory.
2025-11-01 07:21:57.029148: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 23592960 exceeds 10% of free system memory.
2025-11-01 07:21:57.030062: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 23592960 exceeds 10% of free system memory.
2025-11-01 07:21:57.387203: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 23592960 exceeds 10% of free system memory.


Unnamed: 0,id,topic,bleurt
175,author_072,job_interview,0.529
180,manual_added_004,apartment rental,0.556
111,author_030,landlord_tenant,0.529
65,author_006,restaurant_order,0.492
101,author_024,bank_inquiry,0.523
15,manual_001w_1,formal_greetings,0.358
9,manual_005,time_and_plans,0.509
16,manual_001w_2,formal_greetings,0.357
141,author_046,restaurant_order,0.608
124,author_038,movie_plans,0.482


In [9]:
# Create folder for checkpoints
!mkdir -p BLEURT-20

# Download and unzip the BLEURT-20 model (recommended medium-sized checkpoint)
!wget -q https://storage.googleapis.com/bleurt-oss/bleurt-20.zip -O BLEURT-20.zip
!unzip -oq BLEURT-20.zip -d BLEURT-20
!rm BLEURT-20.zip

# Confirm the files
!ls BLEURT-20


[BLEURT-20.zip]
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of BLEURT-20.zip or
        BLEURT-20.zip.zip, and cannot find BLEURT-20.zip.ZIP, period.
bert_config.json    saved_model.pb    sent_piece.vocab
bleurt_config.json  sent_piece.model  variables


In [10]:
from pathlib import Path
ckpt = Path("BLEURT-20")
print("Exists:", ckpt.exists())
print("Contents:", [p.name for p in ckpt.iterdir()])
need = ["saved_model.pb", "bleurt_config.json", "variables"]
print("Has all required files:", all((ckpt / n).exists() for n in need))


Exists: True
Contents: ['saved_model.pb', 'variables', 'bert_config.json', 'sent_piece.model', 'bleurt_config.json', 'sent_piece.vocab']
Has all required files: True


In [11]:
from bleurt import score as bleurt_score
scorer = bleurt_score.BleurtScorer("BLEURT-20")   # <-- use the folder you just verified

print(scorer.score(
    candidates=["The cat is on the mat.", "He plays piano."],
    references=["A cat sits on a mat.", "He is playing the piano."]
))


INFO:tensorflow:Reading checkpoint BLEURT-20.


INFO:tensorflow:Reading checkpoint BLEURT-20.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint BLEURT-20


INFO:tensorflow:Will load checkpoint BLEURT-20


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:BLEURT-20


INFO:tensorflow:... name:BLEURT-20


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... vocab_file:None


INFO:tensorflow:... vocab_file:None


INFO:tensorflow:... do_lower_case:None


INFO:tensorflow:... do_lower_case:None


INFO:tensorflow:... sp_model:sent_piece


INFO:tensorflow:... sp_model:sent_piece


INFO:tensorflow:... dynamic_seq_length:True


INFO:tensorflow:... dynamic_seq_length:True


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Will load model: BLEURT-20/sent_piece.model.


INFO:tensorflow:Will load model: BLEURT-20/sent_piece.model.


INFO:tensorflow:SentencePiece tokenizer created.


INFO:tensorflow:SentencePiece tokenizer created.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.
2025-10-30 10:29:55.272417: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 256307200 exceeds 10% of free system memory.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


[0.7462440729141235, 0.7786075472831726]


In [None]:
# This cell will:
#  - look for any "*bleurt*ready*.csv" under the workspace
#  - otherwise rebuild the file from dialoguesum_lite_100_clean.csv (or similar)

from pathlib import Path
import pandas as pd, re

TARGET_DIR = Path("dialoguesum_bleurt_ready")
TARGET_DIR.mkdir(exist_ok=True, parents=True)
TARGET_PATH = TARGET_DIR / "dialoguesum_bleurt_ready.csv"

def clean_text(s: str) -> str:
    if pd.isna(s): return ""
    s = str(s).encode("utf-8","ignore").decode("utf-8","ignore")
    s = re.sub(r"[^\S\r\n]+", " ", s)
    s = re.sub(r"\s*\n\s*", " ", s).strip()
    return s

# 1) Try to find an existing bleurt-ready file anywhere
hits = sorted(Path(".").rglob("*bleurt*ready*.csv"))
if hits:
    TARGET_PATH = hits[0]
    print("✅ Found existing BLEURT-ready file:", TARGET_PATH)

else:
    # 2) Rebuild from the clean 100-row dataset
    base = None
    patterns = [
        "dialoguesum_lite_100_clean.csv",
        "dialoguesum_lite_100.csv",
        "*lite*100*clean*.csv",
        "*lite*100*.csv"
    ]
    for pat in patterns:
        m = sorted(Path(".").rglob(pat))
        if m:
            base = m[0]
            break

    if base is None:
        raise FileNotFoundError(
            "Could not find the base dataset (e.g., dialoguesum_lite_100_clean.csv). "
            "Search your workspace or adjust the patterns above."
        )

    print("Building BLEURT-ready pairs from:", base)
    df = pd.read_csv(base)

    # Clean & filter
    for c in ["dialogue_text","reference_summary_1","reference_summary_2"]:
        df[c] = df[c].apply(clean_text)
    df = df[(df["dialogue_text"].str.len() > 20) & (df["reference_summary_1"].str.len() > 10)].copy()

    # Expand to candidate–reference rows (two refs per dialogue)
    rows = []
    for _, r in df.iterrows():
        rows.append({"id": r.id, "topic": r.topic, "candidate": r.dialogue_text, "reference": r.reference_summary_1})
        rows.append({"id": r.id, "topic": r.topic, "candidate": r.dialogue_text, "reference": r.reference_summary_2})
    df_long = pd.DataFrame(rows)
    df_long.to_csv(TARGET_PATH, index=False)
    print("✅ Rebuilt:", TARGET_PATH)

print("FINAL_PATH =", TARGET_PATH.resolve())


In [1]:
import pandas as pd
from bleurt import score as bleurt_score

# ✅ use the correct absolute or relative path
BLEURT_READY = "/workspaces/bleurt/dialoguesum_bleurt_ready/dialoguesum_bleurt_ready.csv"

df = pd.read_csv(BLEURT_READY)
print("Loaded:", df.shape)
df.head(2)


2025-10-30 10:53:04.562270: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-30 10:53:28.102973: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-30 10:53:41.378197: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


Loaded: (176, 4)


Unnamed: 0,id,topic,candidate,reference
0,manual_001,formal_greetings,"JAMES: Good morning, Professor Austin, how are...",James introduces Emma to Professor Austin and ...
1,manual_001,formal_greetings,"JAMES: Good morning, Professor Austin, how are...","James greets Professor Austin, presents Emma, ..."


In [7]:
from pathlib import Path
import pandas as pd
import numpy as np

# ---------- helpers ----------
def first_existing(paths):
    for p in map(Path, paths):
        if p.exists():
            return p
    return None

def coerce_numeric(s):
    # convert to numeric safely and drop NaNs
    return pd.to_numeric(s, errors="coerce").dropna()

def series_stats(s):
    s = coerce_numeric(s)
    n = int(s.size)
    mean = float(s.mean()) if n else np.nan
    std  = float(s.std(ddof=1)) if n > 1 else np.nan  # sample std
    return {"n": n, "mean": mean, "std": std}

# ---------- locate files (lightweight & robust) ----------
scored = first_existing([
    "dialoguesum_bleurt_scored_safe.csv",
    "dialoguesum_bleurt_scored.csv",
    "/workspaces/bleurt/dialoguesum_bleurt_scored_safe.csv",
    "/workspaces/bleurt/dialoguesum_bleurt_scored.csv",
    "outputs/dialoguesum_bleurt_scored_safe.csv",
])

annot  = first_existing([
    "dialoguesum_lite_100_clean.csv",
    "/workspaces/bleurt/dialoguesum_lite_100_clean.csv",
    "data/dialoguesum_lite_100_clean.csv",
])

assert scored is not None, "Couldn't find a scored CSV (looked for *_scored_safe.csv or *_scored.csv)."
assert annot  is not None, "Couldn't find dialoguesum_lite_100_clean.csv."

print("Using scored:", scored)
print("Using annot :", annot)

# ---------- load minimal columns ----------
sc = pd.read_csv(scored)
an = pd.read_csv(annot)

# column detection (accept common variants)
bleurt_col = next(c for c in ["bleurt","bleurt_score","BLEURT","score","score_bleurt"] if c in sc.columns)
human_col  = next(c for c in ["human_score","avg_human","rating","mean_rating"] if c in an.columns)

# ---------- overall stats ----------
bleurt_stats = series_stats(sc[bleurt_col])
human_stats  = series_stats(an[human_col])

print("\nOverall BLEURT stats:")
print(f"  n = {bleurt_stats['n']}")
print(f"  mean = {bleurt_stats['mean']:.4f}" if bleurt_stats['n'] else "  mean = NA")
print(f"  std  = {bleurt_stats['std']:.4f}"  if bleurt_stats['n']>1 else "  std  = NA")

print("\nOverall human score stats:")
print(f"  n = {human_stats['n']}")
print(f"  mean = {human_stats['mean']:.4f}" if human_stats['n'] else "  mean = NA")
print(f"  std  = {human_stats['std']:.4f}"  if human_stats['n']>1 else "  std  = NA")

# ---------- optional: per-topic stats if 'topic' exists in each file ----------
if "topic" in sc.columns:
    bt = (sc[["topic", bleurt_col]]
          .assign(val=lambda d: pd.to_numeric(d[bleurt_col], errors="coerce"))
          .dropna(subset=["val"])
          .groupby("topic")["val"]
          .agg(n="size", mean="mean", std=lambda x: x.std(ddof=1))
          .reset_index())
    print("\nPer-topic BLEURT (first 10 rows):")
    display(bt.head(10))

if "topic" in an.columns:
    ht = (an[["topic", human_col]]
          .assign(val=lambda d: pd.to_numeric(d[human_col], errors="coerce"))
          .dropna(subset=["val"])
          .groupby("topic")["val"]
          .agg(n="size", mean="mean", std=lambda x: x.std(ddof=1))
          .reset_index())
    print("\nPer-topic human scores (first 10 rows):")
    display(ht.head(10))

Using scored: dialoguesum_bleurt_scored.csv
Using annot : dialoguesum_lite_100_clean.csv

Overall BLEURT stats:
  n = 0
  mean = NA
  std  = NA

Overall human score stats:
  n = 100
  mean = 2.2100
  std  = 0.4333

Per-topic BLEURT (first 10 rows):


Unnamed: 0,topic,n,mean,std



Per-topic human scores (first 10 rows):


Unnamed: 0,topic,n,mean,std
0,bank_inquiry,4,2.0,0.0
1,call_center,3,2.333333,0.57735
2,classroom,4,2.0,0.0
3,doctor_visit,4,2.0,0.0
4,emergency,3,2.0,0.0
5,flight_change,4,2.0,0.0
6,formal_greetings,4,2.75,0.5
7,formal_introductions,5,2.0,0.0
8,hotel_booking,4,2.5,0.57735
9,immigration_enquiry,4,2.0,0.0


In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

# === 1) Path to your original file ===
data_path = Path("dialoguesum_lite_100_clean.csv")   # ← your actual file name

# === 2) Load safely ===
assert data_path.exists(), f"File not found: {data_path}"
df = pd.read_csv(data_path)

# === 3) Detect the human-score column automatically ===
score_col = next((c for c in ["human_score","avg_human","rating","mean_rating"] if c in df.columns), None)
assert score_col is not None, f"No human score column found in {data_path.name}"

# === 4) Compute descriptive statistics ===
scores = pd.to_numeric(df[score_col], errors="coerce").dropna()
n = len(scores)
mean = scores.mean()
std = scores.std(ddof=1) if n > 1 else np.nan

print("Dialoguesum_lite_100_clean Results:")
print(f"Mean Human Score: {mean:.4f}" if n else "Mean Human Score: NA")
print(f"Standard Deviation: {std:.4f}" if n > 1 else "Standard Deviation: NA")
print(f"N: {n}")


Dialoguesum_lite_100_clean Results:
Mean Human Score: 2.2100
Standard Deviation: 0.4333
N: 100


In [1]:
from pathlib import Path
import pandas as pd, numpy as np
from scipy import stats

# ---------- 0) Paths ----------
DATA = Path("dialoguesum_lite_100_clean.csv")
SC_SCORED = [Path("dialoguesum_bleurt_scored_safe.csv"),
             Path("dialoguesum_bleurt_scored.csv")]
assert DATA.exists(), f"Missing {DATA}"

# ---------- 1) Load your dataset ----------
df = pd.read_csv(DATA)

# Column detection
id_col = next((c for c in ["id","example_id","idx","sample_id","_row_id"] if c in df.columns), None)
cand_col = next((c for c in ["candidate_summary","candidate","system_summary"] if c in df.columns), None)
ref_cols = [c for c in ["reference_summary","reference_summary_1","reference_summary_2"] if c in df.columns]
human_col = next((c for c in ["human_score","avg_human","rating","mean_rating"] if c in df.columns), None)
assert cand_col and ref_cols and human_col, "Need candidate/ref(s)/human_score columns."

# ---------- 2) Try to get BLEURT without re-scoring ----------
bleurt_series = None

# 2a) If BLEURT already inside the same CSV
for bc in ["bleurt","bleurt_score","BLEURT","score_bleurt"]:
    if bc in df.columns:
        bleurt_series = pd.to_numeric(df[bc], errors="coerce")
        break

# 2b) Else look for sidecar scored CSV and align (cheap, no TF)
if bleurt_series is None:
    for p in SC_SCORED:
        if not p.exists(): 
            continue
        sc = pd.read_csv(p)
        # choose columns we need if present
        sc_id = next((c for c in ["id","example_id","idx","sample_id"] if c in sc.columns), None)
        sc_bleurt = next((c for c in ["bleurt","bleurt_score","BLEURT","score_bleurt"] if c in sc.columns), None)
        # fast path: join by id if both have ids
        if sc_id and id_col and sc_bleurt:
            m = (df[[id_col]].merge(sc[[sc_id, sc_bleurt]], left_on=id_col, right_on=sc_id, how="left"))
            bleurt_series = pd.to_numeric(m[sc_bleurt], errors="coerce")
            if bleurt_series.notna().any():
                break
        # fallback: join by normalized text signature (candidate+reference)
        if sc_bleurt and {"candidate","reference"}.issubset(sc.columns):
            def sig(c,r): 
                return (c.astype(str).str.lower().str.replace(r"\s+"," ",regex=True).str.strip()
                        + "||" + r.astype(str).str.lower().str.replace(r"\s+"," ",regex=True).str.strip())
            # build signatures on a *single* reference (first available) for a cheap join
            sig_df = sig(df[cand_col], df[ref_cols[0]])
            sig_sc = sig(sc["candidate"], sc["reference"])
            tmp = pd.DataFrame({"sig": sig_df})
            tmp2 = pd.DataFrame({"sig": sig_sc, "bleurt": sc[sc_bleurt]})
            m = tmp.merge(tmp2, on="sig", how="left")
            bleurt_series = pd.to_numeric(m["bleurt"], errors="coerce")
            if bleurt_series.notna().any():
                break

# ---------- 3) If BLEURT still missing, compute a tiny ROUGE-L recall proxy ----------
def rougeL_recall(candidate: str, reference: str) -> float:
    # whitespace-token LCS (O(n*m)); fine for ~100 examples
    a = candidate.split(); b = reference.split()
    if not a or not b: 
        return 0.0
    n, m = len(a), len(b)
    dp = [0]*(m+1)
    for i in range(1, n+1):
        prev = 0
        for j in range(1, m+1):
            t = dp[j]
            if a[i-1] == b[j-1]:
                dp[j] = prev + 1
            else:
                dp[j] = dp[j] if dp[j] > dp[j-1] else dp[j-1]
            prev = t
    lcs = dp[m]
    return lcs / m

if bleurt_series is None or bleurt_series.isna().all():
    print("No BLEURT scores found → using ROUGE-L recall proxy.")
    # Use first available reference for a consistent target
    rcol = ref_cols[0]
    # compute quickly (vectorized-ish loop; still light for ~100)
    x_scores = []
    for cand, ref in zip(df[cand_col].astype(str), df[rcol].astype(str)):
        x_scores.append(rougeL_recall(cand.strip(), ref.strip()))
    metric = pd.Series(x_scores, dtype=float, name="metric")
    target_name = "ROUGE-L recall (proxy)"
else:
    metric = pd.to_numeric(bleurt_series, errors="coerce")
    target_name = "BLEURT"

# ---------- 4) Prepare human scores and drop NaNs ----------
human = pd.to_numeric(df[human_col], errors="coerce")
mask = metric.notna() & human.notna()
x = metric[mask].to_numpy()
y = human[mask].to_numpy()
assert len(x) > 2, "Not enough valid pairs to correlate."

# ---------- 5) Correlations (fast; no bootstrap to stay ultra-light) ----------
pr, pp = stats.pearsonr(x, y)
sr, sp = stats.spearmanr(x, y)
kt, kp = stats.kendalltau(x, y, variant="b")

print(f"\nDialoguesum correlations ({target_name} vs human_score):")
print(f"  N            : {len(x)}")
print(f"  Pearson r    : {pr:.4f} (p={pp:.2e})")
print(f"  Spearman ρ   : {sr:.4f} (p={sp:.2e})")
print(f"  Kendall τ-b  : {kt:.4f} (p={kp:.2e})")


No BLEURT scores found → using ROUGE-L recall proxy.

Dialoguesum correlations (ROUGE-L recall (proxy) vs human_score):
  N            : 100
  Pearson r    : -0.0290 (p=7.74e-01)
  Spearman ρ   : -0.0344 (p=7.34e-01)
  Kendall τ-b  : -0.0323 (p=7.28e-01)
