In [1]:
!pip -q install datasets


In [2]:
from datasets import load_dataset

# Hub repositories that may host GovReport, tried in order of preference.
CANDIDATE_DATASETS = [
    "ccdv/govreport-summarization",  # common mirror
    "ccdv/govreport",                # alt
    "ccdv/gov_report",               # alt
    "GEM/govreport",                 # alt (if available)
]

# Keep the first mirror that loads. Every failure is recorded per repository so
# that, if none of them load, the final error explains why each one failed
# rather than only the last one tried.
ds = None
last_err = None
load_errors = {}
for repo in CANDIDATE_DATASETS:
    try:
        ds = load_dataset(repo)
    except Exception as e:  # deliberate best-effort: any failure moves on to the next mirror
        last_err = e
        load_errors[repo] = e
        continue
    print("Loaded dataset:", repo)
    break
if ds is None:
    attempts = "\n".join(
        f"  {name}: {type(err).__name__}: {err}" for name, err in load_errors.items()
    )
    # Chain the last exception so its traceback stays visible in the notebook.
    raise RuntimeError(
        f"Could not load any GovReport mirror; last error:\n{last_err}\n"
        f"All attempts:\n{attempts}"
    ) from last_err

# Pick the split to evaluate on: prefer 'test', then 'validation'. If the loaded
# mirror provides neither, stop here and list the splits it does have instead of
# failing on a bare KeyError at the lookup below.
split_name = next((name for name in ("test", "validation") if name in ds), None)
if split_name is None:
    raise KeyError(
        f"No 'test' or 'validation' split in the loaded dataset; available splits: {list(ds)}"
    )
data = ds[split_name]
print(split_name, "size:", len(data), "keys:", data[0].keys())


  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 2.21k/2.21k [00:00<00:00, 5.11kB/s]
Downloading data: 100%|██████████| 228M/228M [00:09<00:00, 23.5MB/s] 
Downloading data: 100%|██████████| 229M/229M [00:09<00:00, 24.6MB/s] 
Downloading data: 100%|██████████| 26.1M/26.1M [00:01<00:00, 18.2MB/s]
Downloading data: 100%|██████████| 24.0M/24.0M [00:01<00:00, 18.9MB/s]
Generating train split: 100%|██████████| 17517/17517 [00:12<00:00, 1385.39 examples/s]
Generating validation split: 100%|██████████| 973/973 [00:00<00:00, 3195.33 examples/s]
Generating test split: 100%|██████████| 973/973 [00:01<00:00, 514.86 examples/s]


Loaded dataset: ccdv/govreport-summarization
test size: 973 keys: dict_keys(['report', 'summary'])


In [3]:
# Column names differ between mirrors, so detect them from the first example.
# Candidates are checked in priority order and only a column that is actually
# present is accepted; a missing column is reported here rather than surfacing
# later as a KeyError while writing the files.
example_keys = list(data[0].keys())
doc_key = next((key for key in ("document", "report", "article") if key in example_keys), None)
ref_key = next((key for key in ("summary", "reference", "highlights") if key in example_keys), None)
if doc_key is None or ref_key is None:
    raise KeyError(
        f"Could not find document/reference columns; available columns: {example_keys}"
    )
print("Using columns -> document:", doc_key, "reference:", ref_key)


Using columns -> document: report reference: summary


In [4]:
import re, pathlib

# Where the reference/candidate text files are written. The directory is created
# on demand, and both file paths are resolved to absolute form so later shell
# commands do not depend on the working directory.
out_dir = pathlib.Path("bleurt_runs/govreport_lead7")
out_dir.mkdir(parents=True, exist_ok=True)
refs_path, cands_path = (
    (out_dir / file_name).resolve() for file_name in ("refs.txt", "cands.txt")
)

# Sentence boundary: a run of whitespace that directly follows '.', '!' or '?'.
SENT_SPLIT = re.compile(r'(?<=[.!?])\s+')

def lead_k(text: str, k: int = 7, char_cap: int = 3000) -> str:
    """Return the first ``k`` sentences of ``text`` as a single line.

    Args:
        text: Source document.
        k: Number of leading sentences to keep (sentences are delimited by
            ``SENT_SPLIT``).
        char_cap: Maximum number of characters in the result.

    Returns:
        The leading sentences joined by single spaces, with every run of
        whitespace collapsed to one space, truncated to ``char_cap`` characters
        and with no trailing whitespace. Empty input gives an empty string.
    """
    # re.split always yields at least one element, so no empty-list fallback is needed.
    sents = SENT_SPLIT.split(text.strip())
    lead = " ".join(sents[:k])
    # The output is written one record per line, so no line-break character may
    # survive. str.split() with no argument splits on every whitespace run,
    # including '\r' and Unicode line separators, not just '\n'.
    one_line = " ".join(lead.split())
    # Truncation may stop right after a space; drop it.
    return one_line[:char_cap].rstrip()

# Evaluate a manageable subset first; set N=None for the full split once you’re happy
N = 200
# Clamp N to the split size so that asking for more examples than the split
# holds uses the whole split instead of failing on out-of-range indices.
subset = data.select(range(min(N, len(data)))) if N else data

# One example per line in each file; line i of refs.txt pairs with line i of
# cands.txt, so neither side may contain a line break of any kind.
with refs_path.open("w", encoding="utf-8") as fr, cands_path.open("w", encoding="utf-8") as fc:
    for ex in subset:
        # " ".join(s.split()) collapses every whitespace run (including '\r' and
        # other line-break characters, not only '\n') into a single space.
        ref = " ".join(str(ex[ref_key]).split())
        cand = " ".join(lead_k(str(ex[doc_key])).split())
        fr.write(ref + "\n")
        fc.write(cand + "\n")

print("Wrote:")
print("  refs ->", refs_path)
print("  cands ->", cands_path)

# Quick sanity check: equal counts
def count_lines(p):
    """Return the number of lines in the UTF-8 text file at ``p``."""
    total = 0
    with open(p, "r", encoding="utf-8") as handle:
        for _line in handle:
            total += 1
    return total
# The two files must have the same, non-zero number of lines.
n_refs = count_lines(refs_path)
n_cands = count_lines(cands_path)
print("Lines -> refs:", n_refs, "cands:", n_cands)
assert n_refs > 0 and n_refs == n_cands


Wrote:
  refs -> /workspaces/bleurt/bleurt_runs/govreport_lead7/refs.txt
  cands -> /workspaces/bleurt/bleurt_runs/govreport_lead7/cands.txt
Lines -> refs: 200 cands: 200


In [5]:
# Score the candidate file against the reference file with BLEURT and save the
# result as scores.txt in the same output directory (absolute path).
scores_path = (out_dir / "scores.txt").resolve()
# Passed verbatim as -bleurt_checkpoint below.
# NOTE(review): presumably a checkpoint directory relative to the working
# directory — confirm it has been downloaded/unpacked there before running.
ckpt = "BLEURT-20-D12"  # or "BLEURT-20" for the full model

# Shell command: IPython substitutes each {name} placeholder with the value of
# the Python variable of that name before the command runs. Paths are quoted so
# directories containing spaces still work.
!python -m bleurt.score_files \
  -candidate_file="{cands_path}" \
  -reference_file="{refs_path}" \
  -bleurt_batch_size=32 \
  -batch_same_length=True \
  -bleurt_checkpoint="{ckpt}" \
  -scores_file="{scores_path}"

# Print the first five lines of the scores file as a quick check that scoring
# produced output.
!head -n 5 "{scores_path}"
print("Scores saved to:", scores_path)


2025-09-02 12:37:39.933566: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-09-02 12:38:02.639734: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-09-02 12:38:14.253754: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-09-02 12:38:26.167542: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
INFO:tensorflow:Running BLEURT scoring.
I0902 12:38:26.167668 128407618672448 score_files.py:168] Running BLEURT scoring.
W0902 12:38:26.167879 128407618672448 score_files.py:118] Enabling same length batching

In [6]:
import numpy as np
# Read the scores written by the BLEURT run back in, then report their mean and
# how many there are.
scores = np.loadtxt(scores_path)
print("Mean BLEURT:", float(np.mean(scores)), "N:", np.size(scores))


Mean BLEURT: 0.3434018988907337 N: 200


In [2]:
from pathlib import Path
import numpy as np
# Summarise the saved GovReport BLEURT scores (path is relative to the working directory).
gov_path = Path("bleurt_runs/govreport_lead7/scores.txt")
# ndmin=1 keeps the result one-dimensional even when the file holds a single
# score; without it loadtxt returns a 0-d array and len() below raises TypeError.
gov_scores = np.loadtxt(gov_path, ndmin=1)
print("GovReport Results:")
print(f"Mean BLEURT: {gov_scores.mean():.4f}")
# .std() with default arguments is the population standard deviation (ddof=0).
print(f"Standard Deviation: {gov_scores.std():.4f}")
print(f"N: {len(gov_scores)}\n")


GovReport Results:
Mean BLEURT: 0.3434
Standard Deviation: 0.0292
N: 200

