In [2]:
# 1) deps
!pip -q install datasets

# 2) load CNN/DailyMail **parquet** mirror
from datasets import load_dataset

# Prefer the official Parquet-converted mirror (no scripts needed)
ds = load_dataset("abisee/cnn_dailymail", "3.0.0")  # uses parquet
train, val, test = ds["train"], ds["validation"], ds["test"]
len(train), len(val), len(test), test[0].keys()


  from .autonotebook import tqdm as notebook_tqdm


(287113, 13368, 11490, dict_keys(['article', 'highlights', 'id']))

In [3]:
import re
from bleurt import score

def lead3(text, n_sents=3, max_chars=2000):
    """Return the leading sentences of ``text`` as a single-line summary.

    Sentences are split with a light regex: a boundary is any whitespace run
    that follows ``.``, ``!`` or ``?``. The first ``n_sents`` pieces are joined
    with single spaces, every remaining whitespace run (including embedded
    newlines) is collapsed to one space, and the result is cut to ``max_chars``
    characters.

    Args:
        text: Source document.
        n_sents: Number of leading sentences to keep (default 3).
        max_chars: Hard character cap on the returned string (default 2000).

    Returns:
        The truncated summary; ``""`` when ``text`` is empty or only whitespace.
    """
    # re.split always returns at least one element, so no empty-list fallback
    # is needed: an empty document simply yields [""].
    sents = re.split(r'(?<=[.!?])\s+', text.strip())
    # Flatten to one line so this matches the file-based lead3 variants used
    # further down in the notebook.
    summary = " ".join(" ".join(sents[:n_sents]).split())
    # The character cap is only a heuristic to keep BLEURT inputs short; it
    # does not by itself guarantee any subword-token limit.
    return summary[:max_chars]

import pandas as pd, numpy as np

# Evaluation subset: the first N test examples, or the whole split when N is
# falsy (e.g. None).
N = 200
subset = test if not N else test.select(range(N))

# Gold reference summaries, and one lead-3 candidate per article.
references = subset["highlights"]
candidates = list(map(lead3, subset["article"]))

# Score every (reference, candidate) pair; swap in "BLEURT-20" for the full model.
scorer = score.LengthBatchingBleurtScorer("BLEURT-20-D12")
scores = scorer.score(candidates=candidates, references=references, batch_size=64)

# One row per example, then report the mean and persist the table as CSV.
out = pd.DataFrame(
    {
        "id": subset["id"],
        "reference": references,
        "candidate": candidates,
        "bleurt": scores,
    }
)
print("Mean BLEURT:", float(np.mean(scores)), "N:", len(scores))
out.to_csv("cnndm_bleurt_lead3.csv", index=False)
# Last expression: echo the output filename as the cell result.
"cnndm_bleurt_lead3.csv"


2025-09-02 06:02:09.898823: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-09-02 06:02:35.940135: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-09-02 06:02:49.120101: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


INFO:tensorflow:Reading checkpoint BLEURT-20-D12.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint BLEURT-20-D12
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:BLEURT-20-D12
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:... vocab_file:None
INFO:tensorflow:... do_lower_case:None
INFO:tensorflow:... sp_model:sent_piece
INFO:tensorflow:... dynamic_seq_length:True
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Will load model: BLEURT-20-D12/sent_piece.model.
INFO:tensorflow:SentencePiece tokenizer created.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.


2025-09-02 06:03:01.599961: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
2025-09-02 06:03:02.292646: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 61593600 exceeds 10% of free system memory.
2025-09-02 06:03:04.519594: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 16777216 exceeds 10% of free system memory.
2025-09-02 06:03:04.572333: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 16777216 exceeds 10% of free system memory.
2025-09-02 06:03:04.676549: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 16777216 exceeds 10% of free system memory.
2025-09-02 06:03:04.724908: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 16777216 exceeds 10% of free system memory.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:Average batch sequence length: 199.5


INFO:tensorflow:Average batch sequence length: 199.5


Mean BLEURT: 0.2799194859713316 N: 200


'cnndm_bleurt_lead3.csv'

In [4]:
import re, pathlib
from itertools import islice
from datasets import load_dataset

# Output files: one reference / one candidate per line, in the same order.
refs_path = pathlib.Path("cnndm_refs.txt")
cands_path = pathlib.Path("cnndm_cands_lead3_stream.txt")

# Sentence boundary: any whitespace run that follows '.', '!' or '?'.
SENT_SPLIT = re.compile(r'(?<=[.!?])\s+')
def lead3(t, n_sents=3, max_chars=2000):
    """Return the first ``n_sents`` sentences of ``t`` as one line of text.

    Every whitespace run is collapsed to a single space, so characters such as
    ``"\\r"`` that a text-mode reader would treat as a line break cannot split
    one record across several lines. The result is cut to ``max_chars``
    characters.

    Args:
        t: Source document.
        n_sents: Number of leading sentences to keep (default 3).
        max_chars: Hard character cap on the returned string (default 2000).

    Returns:
        A single-line string; ``""`` when ``t`` is empty or only whitespace.
    """
    # SENT_SPLIT.split never returns an empty list, so no fallback is needed.
    s = SENT_SPLIT.split(t.strip())
    return " ".join(" ".join(s[:n_sents]).split())[:max_chars]

# Number of streamed test examples to export (islice stops after N items).
N = 200
stream = load_dataset("abisee/cnn_dailymail", "3.0.0", split="test", streaming=True)

# Each example must occupy exactly one line in both files so that line i of the
# references pairs with line i of the candidates. Collapsing every whitespace
# run (not just "\n") also removes "\r" and similar characters that a text-mode
# reader would otherwise treat as an extra line break.
with refs_path.open("w", encoding="utf-8") as fr, cands_path.open("w", encoding="utf-8") as fc:
    for ex in islice(stream, N):
        fr.write(" ".join(ex["highlights"].split()) + "\n")
        fc.write(" ".join(lead3(ex["article"]).split()) + "\n")

print("Wrote:", refs_path, cands_path)


Wrote: cnndm_refs.txt cnndm_cands_lead3_stream.txt


In [5]:
import os, re, pathlib
from datasets import load_dataset

# Run directory for this experiment; created on demand, parents included.
out_dir = pathlib.Path("bleurt_runs/cnndm_lead3")
out_dir.mkdir(parents=True, exist_ok=True)
# Absolute paths of the reference / candidate text files inside the run directory.
refs_path, cands_path = (
    (out_dir / fname).resolve() for fname in ("refs.txt", "cands.txt")
)

# Light sentence splitter + lead-3 baseline.
# A sentence boundary is any whitespace run that follows '.', '!' or '?'.
SENT_SPLIT = re.compile(r'(?<=[.!?])\s+')
def lead3(text: str, n_sents: int = 3, max_chars: int = 2000) -> str:
    """Return the first ``n_sents`` sentences of ``text`` as one line of text.

    Every whitespace run is collapsed to a single space, so characters such as
    ``"\\r"`` that a text-mode reader would treat as a line break cannot split
    one record across several lines. The result is cut to ``max_chars``
    characters; that cap is a heuristic for keeping scorer inputs short and
    does not guarantee any subword-token limit.

    Args:
        text: Source document.
        n_sents: Number of leading sentences to keep (default 3).
        max_chars: Hard character cap on the returned string (default 2000).

    Returns:
        A single-line string; ``""`` when ``text`` is empty or only whitespace.
    """
    # SENT_SPLIT.split never returns an empty list, so no fallback is needed.
    sents = SENT_SPLIT.split(text.strip())
    out = " ".join(" ".join(sents[:n_sents]).split())
    return out[:max_chars]

# Restrict the export to the first N test examples.
# NOTE(review): the slice presumably applies to the locally prepared split, not
# to what is downloaded -- confirm before relying on it to save bandwidth/RAM.
N = 200  # set to None for full test
# f"test[:{N}]" would become the invalid spec "test[:None]" when N is None, so
# fall back to the plain split name in that case.
ds = load_dataset(
    "abisee/cnn_dailymail",
    "3.0.0",
    split="test" if N is None else f"test[:{N}]",
)

# Write files line-by-line. Each example must occupy exactly one line in both
# files so that line i of refs pairs with line i of cands; collapsing every
# whitespace run (not just "\n") also removes "\r" and similar characters that
# a text-mode reader would otherwise treat as an extra line break.
with refs_path.open("w", encoding="utf-8") as fr, cands_path.open("w", encoding="utf-8") as fc:
    for ex in ds:
        fr.write(" ".join(ex["highlights"].split()) + "\n")
        fc.write(" ".join(lead3(ex["article"]).split()) + "\n")

print("Wrote:")
print("  refs:", refs_path)
print("  cands:", cands_path)

# Sanity check: files exist & have same number of lines
def count_lines(p):
    """Return how many lines the UTF-8 text file at path object ``p`` contains."""
    total = 0
    with p.open("r", encoding="utf-8") as handle:
        # Iterate lazily so the file is never held in memory as a whole.
        for _line in handle:
            total += 1
    return total
# Count both files, report the totals, then require that they are non-empty
# and contain the same number of lines.
n_refs = count_lines(refs_path)
n_cands = count_lines(cands_path)
print("Lines -> refs:", n_refs, "cands:", n_cands)
assert n_refs > 0 and n_refs == n_cands, "Ref/Cand line counts mismatch or empty!"


Wrote:
  refs: /workspaces/bleurt/bleurt_runs/cnndm_lead3/refs.txt
  cands: /workspaces/bleurt/bleurt_runs/cnndm_lead3/cands.txt
Lines -> refs: 200 cands: 200


In [6]:
scores_path = "/workspaces/bleurt/bleurt_runs/cnndm_lead3/scores.txt"
ckpt = "BLEURT-20-D12"  # or "BLEURT-20" if you want the full model

!python -m bleurt.score_files \
  -candidate_file="/workspaces/bleurt/bleurt_runs/cnndm_lead3/cands.txt" \
  -reference_file="/workspaces/bleurt/bleurt_runs/cnndm_lead3/refs.txt" \
  -bleurt_batch_size=64 \
  -batch_same_length=True \
  -bleurt_checkpoint="{ckpt}" \
  -scores_file="{scores_path}"

print("Scores saved to:", scores_path)
!head -n 5 "{scores_path}"


2025-09-02 06:12:52.373823: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-09-02 06:12:52.422157: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-09-02 06:12:54.505413: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-09-02 06:12:57.027547: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
INFO:tensorflow:Running BLEURT scoring.
I0902 06:12:57.027747 138104150554432 score_files.py:168] Running BLEURT scoring.
W0902 06:12:57.027957 138104150554432 score_files.py:118] Enabling same length batching

In [1]:
from pathlib import Path
import numpy as np
# Scores file written by the BLEURT scoring step.
cnn_path = Path("bleurt_runs/cnndm_lead3/scores.txt")
# ndmin=1 keeps the result one-dimensional even when the file holds a single
# score; without it loadtxt returns a 0-d array and len() below would raise.
cnn_scores = np.loadtxt(cnn_path, ndmin=1)
# Fail loudly on an empty file rather than printing NaN statistics.
assert cnn_scores.size > 0, f"No scores found in {cnn_path}"
print("CNN/DailyMail Results:")
print(f"Mean BLEURT: {cnn_scores.mean():.4f}")
# ndarray.std() with default arguments is the population standard deviation (ddof=0).
print(f"Standard Deviation: {cnn_scores.std():.4f}")
print(f"N: {len(cnn_scores)}\n")

CNN/DailyMail Results:
Mean BLEURT: 0.2791
Standard Deviation: 0.0660
N: 200

