In [2]:
import os, sys, subprocess, tempfile, json
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.stats import pearsonr, kendalltau, spearmanr


In [3]:
# ------------------- CONFIG -------------------
# Input locations.
CKPT = "./BLEURT-20"                          # pretrained BLEURT checkpoint
HUMAN_EVAL = "./all_data_final_averaged.csv"  # WebNLG human ratings

# Output directory; created next to the notebook when it does not exist yet.
OUT_DIR = Path("bleurt_eval_outputs")
OUT_DIR.mkdir(exist_ok=True)

# Run-size knobs.
BATCH = 8          # BLEURT API batch size (safe for CPU)
CHUNK_SIZE = 40    # limit per subprocess
SAMPLE_SIZE = 100  # number of candidates to evaluate

# Reproducibility: seed NumPy's legacy global generator.
SEED = 42
np.random.seed(SEED)

In [5]:
# Load the human ratings from the path declared in the config cell (HUMAN_EVAL)
# rather than a second hard-coded copy of the file name, so that changing the
# config actually changes the input.
df_h = (
    pd.read_csv(HUMAN_EVAL)
    .rename(columns={
        "team": "system",
        "text": "candidate",
        "semantics": "human_score",
    })
)
# Non-numeric ratings become NaN here and are removed by the dropna below.
df_h["human_score"] = pd.to_numeric(df_h["human_score"], errors="coerce")
df_h = df_h.dropna(subset=["id", "system", "candidate", "human_score"]).reset_index(drop=True)
print("Total human-eval rows:", len(df_h))

Total human-eval rows: 2037


In [6]:
# In the original paper each MR has multiple references; we emulate that here.
# For simplicity, we generate synthetic reference variants to mimic true WebNLG refs.
# (If you have the full WebNLG refs, replace this block with a real join.)
# NOTE: the placeholder sentences below contain only the id, not the content of
# the meaning representation.

# Leading word of each synthetic variant; one reference row per label and id.
REF_LABELS = ("Reference", "Human-written", "Gold")

# Bound the sample by the number of *unique ids*, not by the number of rows:
# Series.sample raises ValueError when n exceeds the population size.
id_pool = df_h["id"].drop_duplicates()
unique_ids = id_pool.sample(n=min(SAMPLE_SIZE, len(id_pool)), random_state=SEED).tolist()

refs_long = pd.DataFrame(
    [
        {"id": id_, "reference": f"{label} text for id {id_} describing its meaning representation."}
        for id_ in unique_ids
        for label in REF_LABELS
    ],
    columns=["id", "reference"],  # keeps the schema even when no ids were sampled
)
refs_long.to_csv("refs_long.csv", index=False)
print(f"Created synthetic refs_long.csv with {len(refs_long)} rows "
      f"({refs_long['id'].nunique()} unique IDs, {len(REF_LABELS)} refs each).")


Created synthetic refs_long.csv with 300 rows (100 unique IDs, 3 refs each).


In [7]:
# Keep only the sampled ids, then attach every reference row for each id.
sampled_rows = df_h.loc[df_h["id"].isin(unique_ids)]
df = sampled_rows.merge(refs_long, how="inner", on="id")

# Discard rows that lack any field needed for scoring.
required_cols = ["system", "candidate", "reference", "human_score"]
df = df.dropna(subset=required_cols).reset_index(drop=True)

n_unique_cands = len(df[["id", "system", "candidate"]].drop_duplicates())
print(f"Joined rows with textual refs: {len(df)}")
print(f"Systems: {df['system'].nunique()} | Unique candidates: {n_unique_cands}")

# ------------------- 4. BLEURT SCORING FUNCTION -------------------
def bleurt_subprocess_scores(refs, cands, ckpt=CKPT, batch=BATCH):
    """Score reference/candidate pairs with BLEURT in a separate Python process.

    The pairs are handed to a short-lived child interpreter through a JSON
    file, so a string that contains a newline, or an empty final string, can
    no longer shift the reference/candidate pairing (the previous
    newline-joined text files could).

    Args:
        refs: Iterable of reference strings.
        cands: Iterable of candidate strings, same length as ``refs``.
        ckpt: Path to the BLEURT checkpoint directory.
        batch: Batch size forwarded to ``BleurtScorer.score``.

    Returns:
        A float32 NumPy array with one score per pair (empty for empty input).

    Raises:
        AssertionError: if ``refs`` and ``cands`` differ in length.
        RuntimeError: if the child process exits with a non-zero status or
            returns a different number of scores than pairs submitted.
    """
    # Materialise once (also accepts generators / pandas Series) and strip
    # surrounding whitespace, as the child script used to do per line.
    refs = [r.strip() for r in refs]
    cands = [c.strip() for c in cands]
    assert len(refs) == len(cands), "refs and cands must have the same length"
    if not refs:
        return np.zeros(0, dtype="float32")

    # Source of the child script. The environment variables are assigned
    # *before* bleurt (and therefore TensorFlow) is imported.
    child_source = "\n".join([
        "import json, os, sys",
        'os.environ["CUDA_VISIBLE_DEVICES"] = "-1"',
        'os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"',
        "from bleurt import score as bleurt_score",
        "ckpt, pairs_f, out_f, bs = sys.argv[1:]",
        'with open(pairs_f, encoding="utf-8") as f:',
        "    pairs = json.load(f)",
        "scorer = bleurt_score.BleurtScorer(ckpt)",
        "scores = scorer.score(",
        '    references=pairs["references"],',
        '    candidates=pairs["candidates"],',
        "    batch_size=int(bs),",
        ")",
        'with open(out_f, "w", encoding="utf-8") as f:',
        "    json.dump([float(s) for s in scores], f)",
        "",
    ])

    with tempfile.TemporaryDirectory() as tmpd:
        tmpd = Path(tmpd)
        pairs_p, out_p, script_p = tmpd/"pairs.json", tmpd/"scores.json", tmpd/"run_bleurt.py"
        pairs_p.write_text(json.dumps({"references": refs, "candidates": cands}), encoding="utf-8")
        script_p.write_text(child_source, encoding="utf-8")

        cmd = [sys.executable, str(script_p), str(Path(ckpt).resolve()), str(pairs_p), str(out_p), str(batch)]
        cp = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if cp.returncode != 0:
            # errors="replace" so undecodable bytes in stderr cannot mask the real failure.
            print("Subprocess error:", cp.stderr.decode("utf-8", errors="replace"))
            raise RuntimeError(f"BLEURT subprocess failed (exit code {cp.returncode})")
        # Read the result while the temporary directory still exists.
        scores = json.loads(out_p.read_text(encoding="utf-8"))

    if len(scores) != len(refs):
        raise RuntimeError(
            f"BLEURT subprocess returned {len(scores)} scores for {len(refs)} pairs"
        )
    return np.array(scores, dtype="float32")

Joined rows with textual refs: 300
Systems: 10 | Unique candidates: 100


In [1]:
# ================== Ultra-Light BLEURT run (N≈100 pairs) — README aligned ==================
# API matches https://github.com/google-research/bleurt:
#   from bleurt import score as bleurt_score
#   scorer = bleurt_score.BleurtScorer(CKPT_DIR)
#   scores = scorer.score(references=[...], candidates=[...], batch_size=...)
import os
# Keep TF tiny & CPU-only before importing BLEURT/TensorFlow.
# All variables are applied in a single update, in the same order as listed.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "-1",
    "TF_CPP_MIN_LOG_LEVEL": "3",
    "TF_ENABLE_ONEDNN_OPTS": "0",
    "TF_XLA_FLAGS": "--tf_xla_enable_xla_devices=false",
    "TF_NUM_INTRAOP_THREADS": "1",
    "TF_NUM_INTEROP_THREADS": "1",
})

from pathlib import Path
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr, kendalltau
from bleurt import score as bleurt_score   # README import

# ---------------- config (small & fast) ----------------
# Inputs
CKPT_DIR = "./BLEURT-20"                    # checkpoint directory (unzipped)
HUMAN_CSV = "all_data_final_averaged.csv"   # your wget file name

# Outputs (directory is created on first run)
OUT_DIR = Path("bleurt_eval_outputs")
OUT_DIR.mkdir(exist_ok=True)

# Run size
SAMPLE_PAIRS = 100   # total pairs ≈ 100
BATCH_SIZE = 2       # tiny CPU batch for stability

# Reproducibility
SEED = 42
rng = np.random.default_rng(SEED)

# ---------------- sanity checks ----------------
# Fail early, before the expensive BLEURT load, when an input is missing.
# is_dir() is already False for a path that does not exist.
ckpt_ok = Path(CKPT_DIR).is_dir()
if not ckpt_ok:
    raise FileNotFoundError("BLEURT checkpoint dir not found at CKPT_DIR. "
                            "Download & unzip BLEURT-20 so this directory exists.")

csv_ok = Path(HUMAN_CSV).exists()
if not csv_ok:
    raise FileNotFoundError(f"'{HUMAN_CSV}' not found. Use the exact wget name you saved.")

# ---------------- load human eval ----------------
# Read the ratings, rename to the column names used in this cell, coerce the
# score to numeric (unparseable values become NaN) and drop incomplete rows.
df = (
    pd.read_csv(HUMAN_CSV)
    .rename(columns={"team": "system", "text": "candidate", "semantics": "human_score"})
    .assign(human_score=lambda frame: pd.to_numeric(frame["human_score"], errors="coerce"))
    .dropna(subset=["id", "system", "candidate", "human_score"])
    .reset_index(drop=True)
)

# Unique candidates only (small & quick)
uniq = df.loc[:, ["id", "system", "candidate", "human_score"]].drop_duplicates().reset_index(drop=True)
if SAMPLE_PAIRS < len(uniq):
    # Down-sample reproducibly to at most SAMPLE_PAIRS rows.
    uniq = uniq.sample(n=SAMPLE_PAIRS, random_state=SEED).reset_index(drop=True)

# Lightweight single-line cleaner (prevents IO surprises, keeps BLEURT fast)
def oneline(s: str) -> str:
    """Return ``s`` (converted with ``str``) collapsed onto a single line.

    Every run of whitespace, including carriage returns and newlines, becomes
    one space, and leading/trailing whitespace is removed.
    """
    # str.split() with no separator splits on any whitespace, CR/LF included.
    tokens = str(s).split()
    return " ".join(tokens)

# Normalise every candidate to a single line with oneline(); this overwrites
# the "candidate" column of uniq.
uniq["candidate"] = uniq["candidate"].map(oneline)

# Ultra-light “reference from candidate” to create semantic overlap quickly.
# (Keeps variance without fetching full WebNLG refs; swap in real refs when available.)
def make_ref(c: str) -> str:
    """Build a proxy reference from the first ten whitespace-separated tokens of ``c``.

    A candidate with ten or fewer tokens therefore yields a reference equal to
    its own whitespace-normalised text.
    """
    max_tokens = 10  # short, single-line proxy of the candidate content
    leading = c.split()[:max_tokens]
    return " ".join(leading)

# Candidates come straight from the cleaned column; each reference is derived
# from its candidate and passed through oneline() to ensure a single line.
candidates = uniq["candidate"].tolist()
references = [oneline(make_ref(text)) for text in candidates]

# Scoring needs at least one pair and strictly parallel lists.
assert len(candidates) > 0 and len(references) == len(candidates)

# ---------------- BLEURT scoring (README API, mini-batches) ----------------
scorer = bleurt_score.BleurtScorer(CKPT_DIR)

# Feed the scorer BATCH_SIZE pairs at a time and collect the scores in order.
scores = []
for start in range(0, len(candidates), BATCH_SIZE):
    stop = start + BATCH_SIZE
    # README call:
    batch_scores = scorer.score(
        references=references[start:stop],
        candidates=candidates[start:stop],
        batch_size=BATCH_SIZE,
    )
    scores.extend(batch_scores)

# assign() returns a new frame, leaving the previous uniq object untouched.
uniq = uniq.assign(bleurt=np.array(scores, dtype="float32"))

# ---------------- correlations & save ----------------
def safe_corr(a, b, fn):
    """Apply correlation function ``fn`` to ``a`` and ``b``, or return NaNs.

    ``(nan, nan)`` is returned when ``a`` has fewer than three items, when
    either input has zero standard deviation, or when ``fn`` raises.
    Otherwise the result of ``fn(a, b)`` is returned unchanged.
    """
    nan_pair = (np.nan, np.nan)
    too_short = len(a) < 3
    # Short-circuit keeps np.std from running on inputs that are too short.
    if too_short or np.std(a) == 0 or np.std(b) == 0:
        return nan_pair
    try:
        outcome = fn(a, b)
    except Exception:
        return nan_pair
    return outcome

# Correlate the human ratings with the BLEURT scores using three statistics.
human_col, bleurt_col = uniq["human_score"], uniq["bleurt"]
r, rp = safe_corr(human_col, bleurt_col, pearsonr)
s, sp = safe_corr(human_col, bleurt_col, spearmanr)
t, tp = safe_corr(human_col, bleurt_col, kendalltau)

print(f"\n=== WEBNLG lite (BLEURT-20, N={len(uniq)}) ===")
# A NaN p-value is shown as 'NA'; any other p-value is printed as-is.
print(f"Pearson r={r:.4f} (p={rp if not np.isnan(rp) else 'NA'}) | "
      f"Spearman ρ={s:.4f} (p={sp if not np.isnan(sp) else 'NA'}) | "
      f"Kendall τ={t:.4f} (p={tp if not np.isnan(tp) else 'NA'})")

# Persist the scored pairs and report the absolute location of the file.
out_csv = OUT_DIR/"webnlg_bleurt20_lite_pairs_100.csv"
uniq.to_csv(out_csv, index=False)
print("Saved:", out_csv.resolve())


INFO:tensorflow:Reading checkpoint ./BLEURT-20.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint BLEURT-20
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:BLEURT-20
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:... vocab_file:None
INFO:tensorflow:... do_lower_case:None
INFO:tensorflow:... sp_model:sent_piece
INFO:tensorflow:... dynamic_seq_length:True
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Will load model: ./BLEURT-20/sent_piece.model.
INFO:tensorflow:SentencePiece tokenizer created.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.



=== WEBNLG lite (BLEURT-20, N=100) ===
Pearson r=0.1165 (p=0.24828692497200713) | Spearman ρ=0.1467 (p=0.14527852910380556) | Kendall τ=0.1066 (p=0.14319925938371209)
Saved: /workspaces/bleurt/bleurt_eval_outputs/webnlg_bleurt20_lite_pairs_100.csv
