In [10]:
import os, glob, json, math, zipfile, io
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, kendalltau
from typing import Optional, Dict, List, Tuple
from pathlib import Path                      # <-- fixes NameError: Path

from bleurt import score as bleurt_score


2025-10-04 06:21:27.083757: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-04 06:21:52.004665: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-04 06:22:04.916635: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


In [11]:
# ---------------------------------------------------------------------------
# Notebook-wide configuration. Every path below is anchored at the current
# working directory so that all files stay in a writable location.
# ---------------------------------------------------------------------------
_CWD = Path.cwd()

# --- BLEURT checkpoint -----------------------------------------------------
BLEURT_CHECKPOINT = "./BLEURT-20"            # alternative: "./BLEURT-20-D12"

# Optional auto-download settings (disabled by default)
DOWNLOAD_BLEURT = False
BLEURT_ZIP_URL = "https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip"
BLEURT_ZIP_DST = str(_CWD / "BLEURT-20.zip")

# --- Input data ------------------------------------------------------------
# WebNLG human-evaluation CSV, expected next to the notebook
WEBNLG_CSV = "./all_data_final_averaged.csv"

# WMT inputs: an explicit list of files and/or a glob pattern (both optional),
# e.g. "./wmt19_ende_segments.tsv" or "./wmt19/**/*segments*.tsv"
WMT_INPUTS = []
WMT_GLOB = None

# mt-metrics-eval settings (unused while USE_MT_METRICS_EVAL is False)
MTMETRICS_ROOT = str(_CWD / "mt-metrics-eval")
USE_MT_METRICS_EVAL = False
MTMETRICS_SET = "wmt19"

# --- Column mappings (logical name -> column name in the data file) ---------
WEBNLG_COLMAP = dict(
    system="system",
    reference="reference",
    candidate="candidate",
    human="human_score",
)
# WMT uses the same mapping plus an optional language-pair column
WMT_COLMAP = {**WEBNLG_COLMAP, "langpair": None}

# Optional row filters (None = keep every row)
WEBNLG_FILTER_QUERY = None
WMT_FILTER_QUERY = None

# --- Outputs ---------------------------------------------------------------
OUT_DIR = _CWD / "bleurt_eval_outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)   # idempotent: fine if it already exists

# --- Performance -----------------------------------------------------------
BLEURT_BATCH_SIZE = 64

# --- Quick sanity report ---------------------------------------------------
print(f"Working directory: {os.getcwd()}")
print(f"Output directory : {OUT_DIR}")
print(f"Checkpoint exists: {Path(BLEURT_CHECKPOINT).exists()}")


Working directory: /workspaces/bleurt
Output directory : /workspaces/bleurt/bleurt_eval_outputs
Checkpoint exists: True


In [1]:
# Shell escape: download the WebNLG human-evaluation CSV from GitLab and write it
# to the file named after -O (all_data_final_averaged.csv) in the current directory.
!wget -O all_data_final_averaged.csv \
  https://gitlab.com/webnlg/webnlg-human-evaluation/-/raw/master/all_data_final_averaged.csv


--2025-10-05 06:06:50--  https://gitlab.com/webnlg/webnlg-human-evaluation/-/raw/master/all_data_final_averaged.csv
Resolving gitlab.com (gitlab.com)... 172.65.251.78, 2606:4700:90:0:f22e:fbec:5bed:a9b9
Connecting to gitlab.com (gitlab.com)|172.65.251.78|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 806795 (788K) [text/plain]
Saving to: ‘all_data_final_averaged.csv’


2025-10-05 06:06:50 (70.1 MB/s) - ‘all_data_final_averaged.csv’ saved [806795/806795]



In [2]:
# Example snippet: collect every textual reference for the first 50 unique ids
# in df_h and write them in long format (one row per id/reference pair) to
# refs_long.csv.
# Requires `df_h` and `fetch_references_for_id` to already exist in the kernel.
refs_long = [
    {"id": id_, "reference": ref_text}
    for id_ in df_h["id"].unique()[:50]
    for ref_text in fetch_references_for_id(id_)   # from WebNLG official repo or your local copy
]
pd.DataFrame(refs_long).to_csv("refs_long.csv", index=False)


NameError: name 'df_h' is not defined

In [2]:
import pandas as pd

# Toy (id, reference) pairs: two reference sentences for each of two ids
_example_pairs = [
    (1, "The Eiffel Tower is located in Paris."),
    (1, "Paris is home to the Eiffel Tower."),
    (2, "The Nile is the longest river in Africa."),
    (2, "The Nile river runs through Egypt."),
]
data = {
    "id": [pair_id for pair_id, _ in _example_pairs],
    "reference": [pair_text for _, pair_text in _example_pairs],
}

# Long-format frame: one row per (id, reference)
refs_long = pd.DataFrame(data)

# Relative path, so the CSV lands in the current working directory
refs_long.to_csv("refs_long.csv", index=False)

print("refs_long.csv created in current directory")
print(refs_long.head())


refs_long.csv created in current directory
   id                                 reference
0   1     The Eiffel Tower is located in Paris.
1   1        Paris is home to the Eiffel Tower.
2   2  The Nile is the longest river in Africa.
3   2        The Nile river runs through Egypt.


In [9]:
import os, sys, subprocess, tempfile, numpy as np, pandas as pd
from pathlib import Path
from scipy.stats import pearsonr, kendalltau

# ========= CONFIG (BLEURT-20, small CLI batch, tiny chunks for stability) =========
CKPT       = "./BLEURT-20"   # full model used in the article
CLI_BATCH  = 8               # CLI --batch_size
CHUNK_SIZE = 40              # score 40 pairs per subprocess call (keeps RAM low)
SEED       = 42

# Paths (edit if needed)
HUMAN_EVAL_CSV = "./all_data_final_averaged.csv"  # WebNLG human-evaluation CSV
REFS_LONG_CSV  = "./refs_long.csv"                # long-format CSV with columns: id, reference

# ---------- Load human eval (WebNLG 2017) ----------
# Rename the source columns to the names used by the rest of the notebook:
# team -> system, text -> candidate, semantics -> human_score.
df_h = pd.read_csv(HUMAN_EVAL_CSV).rename(columns={
    "team": "system",
    "text": "candidate",
    "semantics": "human_score"
})
need_cols = {"id","system","candidate","human_score"}
assert need_cols.issubset(df_h.columns), f"Missing columns in {HUMAN_EVAL_CSV}. Need {need_cols}"

# Non-numeric scores become NaN and are removed together with any other
# incomplete row. "id" is included because it is the join key used later.
df_h["human_score"] = pd.to_numeric(df_h["human_score"], errors="coerce")
df_h = df_h.dropna(subset=["id","system","candidate","human_score"]).reset_index(drop=True)

# ---------- Load textual references (long: id, reference) ----------
if not Path(REFS_LONG_CSV).exists():
    raise FileNotFoundError(
        f"{REFS_LONG_CSV} not found.\n"
        "Create refs_long.csv with columns: id,reference (one row per reference)."
    )
refs_long = pd.read_csv(REFS_LONG_CSV)
assert {"id","reference"}.issubset(refs_long.columns), "refs_long.csv must have columns: id, reference"

# Drop missing values BEFORE casting to str: astype(str) turns NaN into the
# literal text "nan", which would then pass the empty-string filter below and
# be scored as if it were a real reference.
refs_long = refs_long.dropna(subset=["id","reference"]).copy()
refs_long["reference"] = refs_long["reference"].astype(str).str.strip()
refs_long = refs_long[refs_long["reference"]!=""].copy()


In [10]:
# Pair every human-rated row with each textual reference sharing its id
# (inner join: ids missing on either side are discarded).
df = (
    pd.merge(df_h, refs_long, how="inner", on="id")
      .dropna(subset=["system","candidate","reference","human_score"])
      .reset_index(drop=True)
)

# A "candidate" is one distinct (id, system, candidate) combination.
n_systems = df["system"].nunique()
n_unique_candidates = len(df.drop_duplicates(subset=["id","system","candidate"]))

print(f"Joined rows with textual refs: {len(df)}")
print("Systems:", n_systems, "| Unique candidates:", n_unique_candidates)


Joined rows with textual refs: 4
Systems: 2 | Unique candidates: 2


In [12]:
import os, sys, subprocess, tempfile
from pathlib import Path
import numpy as np

# Make checkpoint path absolute
CKPT = str(Path("./BLEURT-20").resolve())  # or "./BLEURT-20-D12"

# Source of the helper script executed in the child process.
# Inputs and outputs are exchanged as JSON so that texts containing line breaks
# (or empty strings) cannot shift the reference/candidate alignment.
_BLEURT_CHILD_SCRIPT = """
import json
import os
import sys

# Set before importing bleurt (and therefore TensorFlow) so the settings are
# already in place when TensorFlow initialises.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
os.environ["TF_NUM_INTRAOP_THREADS"] = "1"
os.environ["TF_NUM_INTEROP_THREADS"] = "1"

from bleurt import score as bleurt_score

ckpt, pairs_f, out_f, bs = sys.argv[1], sys.argv[2], sys.argv[3], int(sys.argv[4])

with open(pairs_f, "r", encoding="utf-8") as f:
    payload = json.load(f)
refs, cands = payload["refs"], payload["cands"]

scorer = bleurt_score.BleurtScorer(ckpt)
scores = []
for i in range(0, len(refs), bs):
    scores.extend(scorer.score(references=refs[i:i+bs], candidates=cands[i:i+bs]))

with open(out_f, "w", encoding="utf-8") as f:
    json.dump([float(s) for s in scores], f)
"""

def bleurt_subprocess_scores(refs, cands, ckpt=CKPT, batch=8):
    """Score (reference, candidate) pairs with BLEURT in a separate Python process.

    Each call writes the pairs and a small driver script to a temporary
    directory, runs the script with the current interpreter, and reads the
    scores back. The child process builds its own ``BleurtScorer``, so no
    scorer is created inside the notebook kernel by this function.

    Args:
        refs: Iterable of reference strings.
        cands: Iterable of candidate strings, aligned with ``refs``.
        ckpt: Path to the BLEURT checkpoint directory (resolved to absolute).
        batch: Number of pairs passed to ``scorer.score`` per call in the child.

    Returns:
        ``np.ndarray`` of dtype float32 with one score per input pair
        (empty array for empty input).

    Raises:
        AssertionError: ``refs`` and ``cands`` differ in length.
        TypeError: an element of ``refs`` or ``cands`` is not a ``str``.
        ValueError: ``batch`` is smaller than 1.
        RuntimeError: the child process failed, produced no scores file, or
            returned a different number of scores than pairs.
    """
    import json  # local import keeps this cell runnable on its own

    # Materialise first: `not refs` is ambiguous for numpy arrays / pandas Series.
    refs, cands = list(refs), list(cands)
    assert len(refs) == len(cands), "refs and cands must have same length"
    if not refs:
        return np.zeros(0, dtype="float32")

    # Fail fast, before paying for a subprocess and a model load.
    for label, texts in (("refs", refs), ("cands", cands)):
        for pos, text in enumerate(texts):
            if not isinstance(text, str):
                raise TypeError(f"{label}[{pos}] must be str, got {type(text).__name__}")
    batch = int(batch)
    if batch < 1:
        raise ValueError(f"batch must be a positive integer, got {batch}")

    with tempfile.TemporaryDirectory() as tmpd:
        tmpd = Path(tmpd).resolve()
        pairs_p = tmpd / "pairs.json"
        out_p   = tmpd / "scores.json"
        script  = tmpd / "run_bleurt.py"

        # write inputs and the driver script
        pairs_p.write_text(json.dumps({"refs": refs, "cands": cands}), encoding="utf-8")
        script.write_text(_BLEURT_CHILD_SCRIPT, encoding="utf-8")

        # run it (same settings are also passed through the environment)
        env = os.environ.copy()
        env["CUDA_VISIBLE_DEVICES"]   = "-1"
        env["TF_CPP_MIN_LOG_LEVEL"]   = "2"
        env["TF_NUM_INTRAOP_THREADS"] = "1"
        env["TF_NUM_INTEROP_THREADS"] = "1"

        cmd = [sys.executable, str(script), str(Path(ckpt).resolve()),
               str(pairs_p), str(out_p), str(batch)]
        cp = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        if cp.returncode != 0:
            raise RuntimeError(f"BLEURT subprocess failed (code {cp.returncode}).\nSTDOUT:\n{cp.stdout}\n\nSTDERR:\n{cp.stderr}")

        if not out_p.exists():
            raise RuntimeError(f"BLEURT subprocess produced no scores file.\nCMD: {' '.join(cmd)}\nSTDOUT:\n{cp.stdout}\n\nSTDERR:\n{cp.stderr}")

        scores = json.loads(out_p.read_text(encoding="utf-8"))
        if len(scores) != len(refs):
            raise RuntimeError(f"Scores count mismatch: expected {len(refs)} got {len(scores)}.\nFirst scores: {scores[:5]}\nSTDERR:\n{cp.stderr}")

        return np.array(scores, dtype="float32")


In [13]:
# Smoke test: score a single sentence against an identical copy of itself.
probe_text = "The Eiffel Tower is in Paris."
probe_scores = bleurt_subprocess_scores([probe_text], [probe_text], batch=4)
print("Probe:", probe_scores)


Probe: [1.0015844]


In [15]:
import pandas as pd

# Human eval: rename the source columns to the names used downstream
# (team -> system, text -> candidate, semantics -> human_score).
df_h = pd.read_csv("./all_data_final_averaged.csv").rename(columns={
    "team": "system",
    "text": "candidate",
    "semantics": "human_score",
})
# Non-numeric scores become NaN and are dropped with any other incomplete row.
df_h["human_score"] = pd.to_numeric(df_h["human_score"], errors="coerce")
df_h = df_h.dropna(subset=["id","system","candidate","human_score"]).reset_index(drop=True)

# Textual references created earlier: refs_long.csv (id, reference)
refs_long = pd.read_csv("refs_long.csv")   # must have columns: id, reference
# Drop missing values BEFORE casting to str: astype(str) turns NaN into the
# literal text "nan", which would survive both the empty-string filter and the
# dropna after the join, and would be scored as a real reference.
refs_long = refs_long.dropna(subset=["id","reference"]).copy()
refs_long["reference"] = refs_long["reference"].astype(str).str.strip()
refs_long = refs_long[refs_long["reference"] != ""].copy()

# Join: each candidate paired with ALL its refs
df = (df_h.merge(refs_long, on="id", how="inner")
          .dropna(subset=["reference"])
          .reset_index(drop=True))

print(f"Joined rows (with text refs): {len(df)}")
print("Systems:", df["system"].nunique(),
      "| Unique candidates:", df[["id","system","candidate"]].drop_duplicates().shape[0])


Joined rows (with text refs): 4
Systems: 2 | Unique candidates: 2


In [16]:
from pathlib import Path
import numpy as np

CHUNK_SIZE = 40    # (reference, candidate) pairs per subprocess call; adjust up/down if needed
BATCH      = 8     # batch size used inside each subprocess

key_cols = ["id","system","candidate","human_score"]

groups = df.groupby(key_cols, sort=False)
print(f"Scoring {len(groups)} candidates with max-over-refs…")

# Every call to bleurt_subprocess_scores starts a new process that builds its own
# scorer, so calling it once per candidate repeats that start-up cost for every
# candidate. Instead, score ALL pairs in CHUNK_SIZE slices (one process per slice)
# and reduce to the per-candidate maximum afterwards.
pair_refs  = df["reference"].tolist()
pair_cands = df["candidate"].tolist()

pair_scores = []
for start in range(0, len(pair_refs), CHUNK_SIZE):
    stop = start + CHUNK_SIZE
    pair_scores.extend(
        bleurt_subprocess_scores(pair_refs[start:stop], pair_cands[start:stop], batch=BATCH)
    )

# Max over references per candidate; sort=False keeps first-appearance order.
# Scores are stored as float64 (the float32 -> float64 conversion is exact).
df_agg = (
    df.assign(bleurt=np.asarray(pair_scores, dtype="float64"))
      .groupby(key_cols, sort=False, as_index=False)["bleurt"]
      .max()
)
print("Candidates after max-over-refs:", len(df_agg))
df_agg.head(3)


Scoring 2 candidates with max-over-refs…
Candidates after max-over-refs: 2


Unnamed: 0,id,system,candidate,human_score,bleurt
0,1,adapt,"the 29075 club is the dictcoverer, carl a. wir...",1.333333,0.143762
1,2,baseline,the administrative government is governed by t...,1.0,0.231049


In [1]:
import numpy as np
from pathlib import Path
from scipy.stats import pearsonr, kendalltau


def _safe_corr(corr_fn, x, y):
    """Return (statistic, p-value) from corr_fn(x, y), or (nan, nan) for fewer than two pairs.

    pearsonr raises ValueError when given fewer than two observations, so the
    size check keeps tiny samples from aborting the cell.
    """
    if len(x) < 2:
        return (np.nan, np.nan)
    stat, p_value = corr_fn(x, y)
    return (stat, p_value)


# Segment-level: one point per candidate
r, r_p = _safe_corr(pearsonr,   df_agg["human_score"], df_agg["bleurt"])
t, t_p = _safe_corr(kendalltau, df_agg["human_score"], df_agg["bleurt"])

# System-level (headline in the paper): one point per system, using per-system means
sys_tbl = (df_agg.groupby("system", as_index=False)
                    .agg(human_mean=("human_score","mean"),
                         bleurt_mean=("bleurt","mean"),
                         n=("bleurt","size")))

sr, sr_p = _safe_corr(pearsonr,   sys_tbl["human_mean"], sys_tbl["bleurt_mean"])
st, st_p = _safe_corr(kendalltau, sys_tbl["human_mean"], sys_tbl["bleurt_mean"])

# 4 decimals / 3 significant digits is ample; 100 decimals only prints
# floating-point noise.
print("\n=== WEBNLG (text refs, BLEURT-20, max-over-refs) ===")
print(f"Candidates: {len(df_agg)} | Systems: {len(sys_tbl)}")
print(f"Segment:  Pearson r={r:.4f} (p={r_p:.3e}) | Kendall τ={t:.4f} (p={t_p:.3e})")
print(f"System :  Pearson r={sr:.4f} (p={sr_p:.3e}) | Kendall τ={st:.4f} (p={st_p:.3e})")

# Save per-candidate scores and per-system means
OUT = Path("bleurt_eval_outputs")
OUT.mkdir(exist_ok=True)
df_agg.to_csv(OUT/"webnlg_bleurt20_textrefs_scores.csv", index=False)
sys_tbl.to_csv(OUT/"webnlg_bleurt20_textrefs_system_means.csv", index=False)
print("Saved to:", OUT)


NameError: name 'df_agg' is not defined

In [18]:
# pick up to K ids per system that are present in refs_long
K = 20  # increase as you get more refs
ids_with_refs = set(refs_long["id"].unique())
df_h_sub = df_h[df_h["id"].isin(ids_with_refs)].copy()

# Collect the index labels of the first K rows of each system, system by system.
# NOTE: the group key must NOT be bound to the name `sys`. At cell level that
# rebinds the global `sys` module to a string, and bleurt_subprocess_scores
# (which uses sys.executable) then fails on the next scoring run. A
# comprehension also keeps its loop variables out of the notebook namespace.
chosen = [
    row_label
    for _system_name, system_rows in df_h_sub.groupby("system", sort=False)
    for row_label in system_rows.head(K).index
]
df_h_sub = df_h_sub.loc[chosen].reset_index(drop=True)

# join to refs and proceed exactly as before
df = (df_h_sub.merge(refs_long, on="id", how="inner")
               .dropna(subset=["reference"])
               .reset_index(drop=True))
print(f"Joined rows: {len(df)} | systems: {df['system'].nunique()} | candidates: {df[['id','system','candidate']].drop_duplicates().shape[0]}")


Joined rows: 4 | systems: 2 | candidates: 2


In [19]:
# TEMPORARY stopgap: treat the `mr` column as the reference text.
# Results are NOT article-comparable; this is only useful for checking the score scale.
df_tmp = (
    df_h.rename(columns={"mr": "reference"})
        .dropna(subset=["reference"])
        .reset_index(drop=True)
)
