In [1]:
!python -m pip install -U datasets pyarrow fsspec aiohttp ipywidgets
# If the kernel just updated packages, you may need to restart the kernel once.


Collecting datasets
  Using cached datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec
  Downloading fsspec-2025.7.0-py3-none-any.whl.metadata (12 kB)
Collecting ipywidgets
  Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting fsspec
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Using cached datasets-4.0.0-py3-none-any.whl (494 kB)
Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
Downloading ipywidgets-8.1.7-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)
Downloading widgetsnbextension-4.0.14-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m39.7 MB/s[0m  [33m0:00:00[0

In [3]:
from datasets import load_dataset

# Try each Hub mirror in order and keep the first one that loads.
mirrors = ["allenai/common_gen", "GEM/common_gen"]
ds, last_err = None, None
for repo in mirrors:
    try:
        ds = load_dataset(repo)
    except Exception as e:
        # Best-effort fallback: report the failure instead of hiding it, then
        # move on to the next mirror.
        print(f"Failed to load {repo}: {type(e).__name__}: {e}")
        last_err = e
    else:
        print("Loaded:", repo)
        break
if ds is None:
    # Chain the last failure so its traceback is preserved for debugging.
    raise RuntimeError(f"Could not load CommonGen. Last error:\n{last_err}") from last_err

# Prefer the validation split; fall back to test when it is absent.
split = "validation" if "validation" in ds else "test"
data = ds[split]
print("Split:", split, "size:", len(data), "keys:", list(data[0].keys()))


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.23M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/171k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/31.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67389 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4018 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1497 [00:00<?, ? examples/s]

Loaded: allenai/common_gen
Split: validation size: 4018 keys: ['concept_set_idx', 'concepts', 'target']


In [4]:
def extract_fields(ex):
    """Return ``(concepts, refs)`` for a single dataset example.

    Args:
        ex: Mapping for one example. Concepts are read from ``"concepts"``
            (or ``"concept_set"``), either as a list/tuple or as a
            comma/space-separated string. References are read from a
            non-empty ``"references"`` list/tuple, otherwise from ``"target"``.

    Returns:
        A tuple ``(concepts, refs)`` of two lists of strings.

    Raises:
        AssertionError: If no non-empty concept list can be found.
        KeyError: If the example has neither a ``"references"`` list/tuple
            nor a ``"target"`` key.
    """
    # concepts may be under 'concepts' or 'concept_set'
    concepts = ex.get("concepts") or ex.get("concept_set")
    if isinstance(concepts, str):  # rare case: comma/space-separated string
        # split() with no argument already trims surrounding whitespace.
        concepts = concepts.replace(",", " ").split()
    assert isinstance(concepts, (list, tuple)) and len(concepts) > 0, "No concepts found"
    concepts = [str(c) for c in concepts]

    # references usually 'references' (list[str]) or 'target' (str)
    references = ex.get("references")
    has_ref_list = isinstance(references, (list, tuple))
    if has_ref_list and len(references) > 0:
        refs = [str(r) for r in references]
    elif "target" in ex:
        # Also reached when 'references' is present but empty, so an empty
        # list no longer hides a usable 'target'.
        refs = [str(ex["target"])]
    elif has_ref_list:
        # Empty 'references' and no 'target': keep returning an empty list,
        # as before, rather than raising.
        refs = []
    else:
        raise KeyError(f"No reference text in example keys: {list(ex.keys())}")
    return concepts, refs

# Quick peek: run the extractor on the first example and display its concepts
# together with at most the first two references.
c0, r0 = extract_fields(data[0])
c0, r0[:2]


(['field', 'look', 'stand'],
 ['The player stood in the field looking at the batter.'])

In [5]:
def concepts_to_sentence(concepts, max_chars=300):
    """Build a rule-based one-sentence candidate from a list of concepts.

    The concepts are kept in their given order, underscores are replaced by
    spaces, and the words are joined as an English list followed by a fixed
    template, e.g. ``["field", "look", "stand"]`` ->
    ``"field, look, and stand appear together in a scene."``.

    Args:
        concepts: Iterable of concept strings; must contain at least one item.
        max_chars: Maximum length of the returned sentence (hard character
            cut). Defaults to 300 to keep candidates short for BLEURT; pass
            ``None`` to disable truncation.

    Returns:
        The generated sentence, truncated to ``max_chars`` characters.

    Raises:
        IndexError: If ``concepts`` is empty.
    """
    words = [w.replace("_", " ") for w in concepts]
    if len(words) == 1:
        listing = words[0]
    elif len(words) == 2:
        listing = f"{words[0]} and {words[1]}"
    else:
        # Three or more words: comma-separated list with a final ", and".
        listing = ", ".join(words[:-1]) + f", and {words[-1]}"
    # add a light template for plausibility
    sent = f"{listing} appear together in a scene."
    return sent if max_chars is None else sent[:max_chars]

# Extract (concepts, references) once per example, then derive the two
# parallel lists: one rule-based candidate and one reference list per example.
extracted = [extract_fields(ex) for ex in data]
cands = [concepts_to_sentence(concept_list) for concept_list, _ in extracted]
refs_list = [ref_group for _, ref_group in extracted]
len(cands), len(refs_list), cands[0], refs_list[0][:2]


(4018,
 4018,
 'field, look, and stand appear together in a scene.',
 ['The player stood in the field looking at the batter.'])

In [6]:
import pathlib

# Output directory and the three files written below, as absolute paths.
out_dir = pathlib.Path("bleurt_runs/commongen_rule")
out_dir.mkdir(parents=True, exist_ok=True)
refs_path, cands_path, index_path = (
    (out_dir / filename).resolve()
    for filename in ("refs.txt", "cands.txt", "example_index.txt")
)

N = 1000  # start small; set to None for full split
n_selected = len(cands) if N is None else min(N, len(cands))
indices = range(n_selected)

# Expand every example into one line per reference: the candidate is repeated
# for each of its references, and the index file records which example each
# line belongs to so scores can be regrouped later.
num_pairs = 0
with refs_path.open("w", encoding="utf-8") as refs_file, \
     cands_path.open("w", encoding="utf-8") as cands_file, \
     index_path.open("w", encoding="utf-8") as index_file:
    for i in indices:
        cand = cands[i].replace("\n", " ")
        for ref in refs_list[i]:
            print(ref.replace("\n", " ").strip(), file=refs_file)
            print(cand, file=cands_file)
            print(i, file=index_file)
            num_pairs += 1

print("Wrote:")
for label, written_path in (
    ("  refs  ->", refs_path),
    ("  cands ->", cands_path),
    ("  map   ->", index_path),
):
    print(label, written_path)
print("Total expanded pairs:", num_pairs)


Wrote:
  refs  -> /workspaces/bleurt/bleurt_runs/commongen_rule/refs.txt
  cands -> /workspaces/bleurt/bleurt_runs/commongen_rule/cands.txt
  map   -> /workspaces/bleurt/bleurt_runs/commongen_rule/example_index.txt
Total expanded pairs: 1000


In [7]:
# Score the candidate/reference files written above with BLEURT; the scores
# are written to scores.txt next to the input files.
scores_path = (out_dir / "scores.txt").resolve()
# NOTE(review): this string is passed unchanged to -bleurt_checkpoint;
# presumably it names a checkpoint directory relative to the working
# directory — confirm it has been downloaded there.
ckpt = "BLEURT-20-D12"  # distilled, fast on CPU

# IPython substitutes {cands_path}, {refs_path}, {ckpt} and {scores_path} with
# the Python values before handing the command to the shell.
# NOTE(review): `!python` is resolved via PATH and may not be the kernel's
# interpreter — confirm bleurt is installed for that interpreter.
!python -m bleurt.score_files \
  -candidate_file="{cands_path}" \
  -reference_file="{refs_path}" \
  -bleurt_batch_size=100 \
  -batch_same_length=True \
  -bleurt_checkpoint="{ckpt}" \
  -scores_file="{scores_path}"

# Sanity check: show the first five lines of the scores file.
!head -n 5 "{scores_path}"


2025-09-02 13:14:22.185225: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-09-02 13:14:44.990290: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-09-02 13:14:55.906521: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-09-02 13:15:08.094282: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
INFO:tensorflow:Running BLEURT scoring.
I0902 13:15:08.094561 133888825157440 score_files.py:168] Running BLEURT scoring.
W0902 13:15:08.094854 133888825157440 score_files.py:118] Enabling same length batching

In [8]:
import numpy as np
from collections import defaultdict

# One BLEURT score per (candidate, reference) line. ndmin=1 keeps a one-line
# scores file as a 1-element array instead of a 0-d scalar, so len() and zip()
# below keep working.
scores = np.loadtxt(scores_path, ndmin=1)
with open(index_path, "r", encoding="utf-8") as f:
    idx_map = [int(x.strip()) for x in f]
assert len(scores) == len(idx_map), (
    f"Length mismatch: {len(scores)} scores vs {len(idx_map)} indices"
)

# Group the pair-level scores by the example they belong to.
buckets = defaultdict(list)
for s, i in zip(scores, idx_map):
    buckets[i].append(float(s))

# Average over each example's references, in ascending example-id order.
per_example = np.array([np.mean(buckets[i]) for i in sorted(buckets.keys())])
print("Examples scored:", per_example.size)
print("Mean BLEURT (avg over refs per example):", float(per_example.mean()))


Examples scored: 1000
Mean BLEURT (avg over refs per example): 0.34721828971803187


In [2]:
from pathlib import Path
import numpy as np

# Summary statistics over the raw pair-level BLEURT scores (one score per
# line of scores.txt, i.e. before any per-example averaging).
common_path = Path("bleurt_runs/commongen_rule/scores.txt")
# ndmin=1 keeps a one-line file as a 1-element array so len() below works.
common_scores = np.loadtxt(common_path, ndmin=1)

print("CommonGen Results:")
print(f"Mean BLEURT: {common_scores.mean():.4f}")
# ndarray.std() defaults to ddof=0, i.e. the population standard deviation.
print(f"Standard Deviation: {common_scores.std():.4f}")
print(f"N: {len(common_scores)}")


CommonGen Results:
Mean BLEURT: 0.3472
Standard Deviation: 0.0471
N: 1000


In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from collections import defaultdict
from scipy import stats

# === 1) BLEURT inputs produced by the CommonGen scoring run ===
base = Path("bleurt_runs/commongen_rule")
scores_path, index_path = base / "scores.txt", base / "example_index.txt"
assert scores_path.exists(), f"Missing BLEURT scores: {scores_path}"
assert index_path.exists(),  f"Missing example index: {index_path}"

# Pair-level scores plus the example id of every scored line.
scores = np.loadtxt(scores_path, dtype=float, ndmin=1)
with index_path.open("r", encoding="utf-8") as f:
    idx_map = [int(line.strip()) for line in f]
assert len(scores) == len(idx_map), f"Length mismatch: {len(scores)} scores vs {len(idx_map)} indices"

# Collect the scores of each example, then average them per example in
# ascending example-id order.
buckets = defaultdict(list)
for example_id, score in zip(idx_map, scores):
    buckets[example_id].append(float(score))
ordered_ids = sorted(buckets)
example_ids = np.array(ordered_ids, dtype=int)
bleurt_per_example = np.array([np.mean(buckets[k]) for k in ordered_ids], dtype=float)

print(f"CommonGen BLEURT: {len(bleurt_per_example)} examples "
      f"(mean={bleurt_per_example.mean():.4f}, std={bleurt_per_example.std(ddof=0):.4f})")

# === 2) Human ratings (match the BLEURT article's correlation setup) ===
# Provide one of these files:
#   A) CSV with an id column (one of: example_id, idx, id) and a score column
#      (one of: human_score, rating, DA, score).
#   A') CSV with only a score column — accepted only when it has exactly one
#      row per scored example; rows are then aligned to the sorted example ids.
#   B) TXT with a single score per line, same order as *unique* example ids (sorted) — last resort.
# The first usable file wins. Files that cannot be used (no score column, or a
# row count that does not match) are skipped without a message.
rating_candidates = [
    Path("commongen_human_ratings.csv"),
    Path("human_scores_commongen.csv"),
    base / "human_scores.csv",
]

ratings_df = None
for p in rating_candidates:
    if p.exists():
        df = pd.read_csv(p)
        # id column: first recognised name present in the file, else None
        id_col = next((c for c in ["example_id","idx","id"] if c in df.columns), None)
        # score column: first recognised name present in the file, else None
        sc_col = next((c for c in ["human_score","rating","DA","score"] if c in df.columns), None)
        if sc_col is not None:
            if id_col is None:
                # still usable if length matches and is clearly per-example
                if len(df) == len(bleurt_per_example):
                    ratings_df = pd.DataFrame({
                        "example_id": example_ids,
                        "human_score": pd.to_numeric(df[sc_col], errors="coerce")
                    })
                    print(f"Loaded ratings from {p} (no id column; aligned by sorted example ids).")
                else:
                    # Row count does not match: try the next candidate file.
                    continue
            else:
                # Normalise to the two canonical column names; values that do
                # not parse as numbers become missing (errors="coerce").
                ratings_df = df[[id_col, sc_col]].rename(columns={id_col:"example_id", sc_col:"human_score"})
                ratings_df["example_id"] = pd.to_numeric(ratings_df["example_id"], errors="coerce").astype("Int64")
                ratings_df["human_score"] = pd.to_numeric(ratings_df["human_score"], errors="coerce")
                print(f"Loaded ratings from {p} with id column.")
            # A usable file was loaded: stop searching.
            break

if ratings_df is None:
    # Fallback: plain text one score per line (aligned to sorted example_ids)
    txt_fallbacks = [Path("commongen_human_ratings.txt"), base / "human_scores.txt"]
    txt_file = next((p for p in txt_fallbacks if p.exists()), None)
    if txt_file is not None:
        arr = np.loadtxt(txt_file, dtype=float, ndmin=1)
        # Only accepted when there is exactly one score per scored example;
        # otherwise ratings_df stays None.
        if arr.size == bleurt_per_example.size:
            ratings_df = pd.DataFrame({"example_id": example_ids, "human_score": arr})
            print(f"Loaded ratings from {txt_file} (aligned by sorted example ids).")

# If still missing, stop with clear instructions
if ratings_df is None:
    print("\n⚠️ No human ratings found. To compute BLEURT↔human correlations (as in the BLEURT paper), "
          "provide one of the following:\n"
          "  • CSV: commongen_human_ratings.csv with columns [example_id, human_score]\n"
          "  • TXT: commongen_human_ratings.txt with one score per *example* in the order of sorted example ids\n"
          f"Examples expected: {len(bleurt_per_example)}\n")
else:
    # Align BLEURT and human scores on example_id and drop rows missing either value
    merged = (pd.DataFrame({"example_id": example_ids, "bleurt": bleurt_per_example})
                .merge(ratings_df, on="example_id", how="inner")
                .dropna(subset=["bleurt","human_score"]))

    if merged.empty:
        raise ValueError("Ratings present but no overlap with example ids. Check 'example_id' mapping.")
    x = merged["bleurt"].to_numpy()
    y = merged["human_score"].to_numpy()
    if len(x) < 2:
        # Correlation coefficients are undefined for a single point; fail with
        # a message that points at the data rather than at scipy internals.
        raise ValueError(
            f"Need at least 2 paired examples to compute correlations, got {len(x)}."
        )

    # === 3) Correlations: Pearson r, Spearman ρ, Kendall τ-b ===
    # WMT17 used Pearson as official; WMT18–19 used a Kendall-based DARR variant.
    # All three are reported here.
    pr, pp = stats.pearsonr(x, y)
    sr, sp = stats.spearmanr(x, y)
    kt, kp = stats.kendalltau(x, y, variant="b")

    print("\nCommonGen BLEURT ↔ Human Ratings:")
    print(f"  N                : {len(x)}")
    print(f"  Pearson r        : {pr:.4f} (p={pp:.2e})")
    print(f"  Spearman ρ       : {sr:.4f} (p={sp:.2e})")
    print(f"  Kendall τ-b      : {kt:.4f} (p={kp:.2e})")

    # === 4) Tiny bootstrap CIs (kept small for speed & stability) ===
    rng = np.random.default_rng(123)  # fixed seed so the interval is reproducible
    B = min(1000, max(300, len(x)*5))
    # Each row of idxs is one resample (with replacement) of the paired examples.
    idxs = rng.integers(0, len(x), size=(B, len(x)))
    boots_p = np.array([stats.pearsonr(x[i], y[i])[0] for i in idxs])
    # A resample in which x or y is constant has an undefined (NaN) Pearson r;
    # nanquantile ignores those instead of turning the whole interval into NaN.
    lo, hi = np.nanquantile(boots_p, [0.025, 0.975])
    print(f"  95% CI (bootstrap, Pearson): [{lo:.4f}, {hi:.4f}]")


CommonGen BLEURT: 1000 examples (mean=0.3472, std=0.0471)

⚠️ No human ratings found. To compute BLEURT↔human correlations (as in the BLEURT paper), provide one of the following:
  • CSV: commongen_human_ratings.csv with columns [example_id, human_score]
  • TXT: commongen_human_ratings.txt with one score per *example* in the order of sorted example ids
Examples expected: 1000

