In [1]:
 !pip install -q transformers sentence-transformers datasets evaluate rouge-score sentencepiece torch

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [2]:
import os, sys, json, csv, math, pathlib, random, numpy as np, torch
from google.colab import drive
drive.mount('/content/drive')

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.benchmark = False

ROOT = pathlib.Path("/content")

Mounted at /content/drive


In [3]:
!mkdir -p /content/data/AmaSum
!unzip -q /content/drive/MyDrive/AmaSum/raw_min_10_max_100_revs.zip -d /content/data/AmaSum
!git clone https://github.com/yuanyuanlei-nlp/polarity_calibration_naacl_2024.git
%cd /content/polarity_calibration_naacl_2024

error:  zipfile read error
Cloning into 'polarity_calibration_naacl_2024'...
remote: Enumerating objects: 401, done.[K
remote: Counting objects: 100% (29/29), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 401 (delta 15), reused 0 (delta 0), pack-reused 372 (from 1)[K
Receiving objects: 100% (401/401), 890.21 KiB | 13.29 MiB/s, done.
Resolving deltas: 100% (37/37), done.
/content/polarity_calibration_naacl_2024


In [4]:
import json, pathlib, re

repo = pathlib.Path("/content/polarity_calibration_naacl_2024")
amasum = pathlib.Path("/content/data/AmaSum/min_10_max_100_revs_filt_complete")
preds_root = repo / "generated_summary_AmaSum"
test_ids_file = preds_root / "test_file_names.txt"

gold_dir = repo / "work_amasum" / "gold_test"
inp_dir  = repo / "work_amasum" / "input_texts_test"
os.makedirs(gold_dir, exist_ok=True)
os.makedirs(inp_dir,  exist_ok=True)

ids = [x.strip() for x in open(test_ids_file, encoding="utf-8").read().splitlines() if x.strip()]
files = {p.stem: p for p in (amasum / "test").glob("*.json")}

def get_reviews_list(D):
    if isinstance(D.get("reviews"), list): return D["reviews"]
    if isinstance(D.get("customer_reviews"), list): return D["customer_reviews"]
    return []

def clean_txt(s: str) -> str:
    s = s.replace("\u2013", "-").replace("\u2014", "-")
    s = re.sub(r"\s+", " ", s).strip()
    return s

missing = []
for _id in ids:
    p = files.get(_id)
    if not p:
        missing.append(_id); continue
    data = json.loads(p.read_text(encoding="utf-8"))

    ws = data["website_summaries"][0]
    verdict = clean_txt(ws.get("verdict") or "")
    pros = ". ".join([clean_txt(x) for x in ws.get("pros", []) if clean_txt(x)])
    cons = ". ".join([clean_txt(x) for x in ws.get("cons", []) if clean_txt(x)])
    if pros and not pros.endswith("."): pros += "."
    if cons and not cons.endswith("."): cons += "."
    gold = " ".join([x for x in [verdict, pros, cons] if x]).strip()
    (gold_dir / f"{_id}.txt").write_text(gold, encoding="utf-8")

    reviews = get_reviews_list(data)
    texts = []
    for r in reviews:
        t = clean_txt((r.get("text") or ""))
        if not t: continue
        ttl = clean_txt((r.get("title") or ""))
        body = f"{ttl}. {t}" if ttl else t
        if len(body) >= 40:
            texts.append(body)
    joined = "\n".join(texts)
    (inp_dir / f"{_id}.txt").write_text(joined, encoding="utf-8")

print("missing", len(missing))

missing 0


In [5]:
import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

from nltk.tokenize import sent_tokenize

systems = {
    "base": preds_root / "base_summarizer_flan_t5_large.txt",
    "poca": preds_root / "calibrated_summarizer_PoCa.txt",
}

def load_system_sentences(name, limit_ids=None):
    lines = [ln.strip() for ln in open(systems[name], encoding="utf-8").read().splitlines()]
    n = min(len(ids), len(lines))
    if limit_ids is not None:
        n = min(n, limit_ids)
    out = {}
    for i in range(n):
        sid = ids[i]
        sents = [s.strip() for s in sent_tokenize(lines[i]) if s.strip()]
        out[sid] = sents
    return out

sys_sents = {k: load_system_sentences(k) for k in systems.keys()}

In [6]:
import nltk
nltk.download('punkt', quiet=True)
from nltk.tokenize import sent_tokenize

review_sents = {}
for _id in ids:
    doc = (inp_dir / f"{_id}.txt").read_text(encoding="utf-8")
    sents = [s.strip() for s in sent_tokenize(doc)]
    sents = [s for s in dict.fromkeys(sents) if s and len(s) >= 30]
    review_sents[_id] = sents

In [7]:
from sentence_transformers import SentenceTransformer
import numpy as np

device = "cuda" if torch.cuda.is_available() else "cpu"
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

def topk_indices(mat, k):
    k = min(k, mat.shape[1])
    return np.argpartition(-mat, kth=k-1, axis=1)[:, :k] if k > 0 else np.zeros((mat.shape[0],0), dtype=int)

prod_embeddings = {}

def get_prod_emb(_id):
    if _id not in prod_embeddings:
        if len(review_sents[_id]) == 0:
            prod_embeddings[_id] = np.zeros((0, 384), dtype=np.float32)
        else:
            prod_embeddings[_id] = embedder.encode(review_sents[_id], convert_to_numpy=True, normalize_embeddings=True)
    return prod_embeddings[_id]

def retrieve_topk(_id, query_sents, k=5):
    ref = get_prod_emb(_id)
    if ref.shape[0] == 0:
        return [[] for _ in query_sents]
    Q = embedder.encode(query_sents, convert_to_numpy=True, normalize_embeddings=True)
    sims = np.matmul(Q, ref.T)
    idxs = topk_indices(sims, k)
    out = []
    for i in range(idxs.shape[0]):
        cand = [(int(j), float(sims[i, j])) for j in idxs[i]]
        cand.sort(key=lambda x: x[1], reverse=True)
        out.append(cand)
    return out

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F

nli_ckpt = "cross-encoder/nli-deberta-v3-base"
nli_tok = AutoTokenizer.from_pretrained(nli_ckpt, use_fast=True)
nli_mdl = AutoModelForSequenceClassification.from_pretrained(nli_ckpt).to(device).eval()
ENTAIL_IDX = 2
THRESH = 0.6

def entail_prob(premise, hypothesis):
    enc = nli_tok(premise, hypothesis, truncation=True, max_length=256, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = nli_mdl(**enc).logits
    return float(F.softmax(logits, dim=-1)[0, ENTAIL_IDX].item())

tokenizer_config.json: 0.00B [00:00, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

In [9]:
art = repo / "artifacts"
art.mkdir(exist_ok=True)

def dedup_keep_order(items):
    seen = set(); out = []
    for x in items:
        if x not in seen:
            seen.add(x); out.append(x)
    return out

def run_attribution_for_system(name, k=5):
    out_path = art / f"evidence_{name}.jsonl"
    with open(out_path, "w", encoding="utf-8") as w:
        for _id in ids:
            summ = (open(systems[name], encoding="utf-8").read().splitlines()[ids.index(_id)]).strip()
            summ_sents = [s.strip() for s in sent_tokenize(summ) if s.strip()]
            summ_sents = [s for s in summ_sents if len(s) >= 20]
            rev_sents = review_sents[_id]
            if not summ_sents:
                w.write(json.dumps({"id": _id, "sentences": []}) + "\n"); continue
            cand = retrieve_topk(_id, summ_sents, k=min(k, max(1, len(rev_sents))))
            bundle = []
            for i, s in enumerate(summ_sents):
                evs = []
                for j, _sim in cand[i]:
                    prem = rev_sents[j]
                    p_ent = entail_prob(prem, s)
                    if p_ent >= THRESH:
                        evs.append({"text": prem, "entail_p": round(p_ent, 4)})
                texts = [e["text"] for e in evs]
                keep = dedup_keep_order(texts)
                evs_final = []
                for t in keep:
                    for e in evs:
                        if e["text"] == t:
                            evs_final.append(e); break
                bundle.append({"summary_sent": s, "evidence": evs_final})
            w.write(json.dumps({"id": _id, "sentences": bundle}) + "\n")
    return out_path

evidence_files = {name: run_attribution_for_system(name) for name in systems.keys()}
evidence_files

{'base': PosixPath('/content/polarity_calibration_naacl_2024/artifacts/evidence_base.jsonl'),
 'poca': PosixPath('/content/polarity_calibration_naacl_2024/artifacts/evidence_poca.jsonl')}

In [10]:
def load_jsonl(p):
    with open(p, encoding="utf-8") as f:
        for line in f:
            yield json.loads(line)

rows = []
for name, fpath in evidence_files.items():
    total_sents = 0
    supported = 0
    support_counts = []
    for rec in load_jsonl(fpath):
        for s in rec.get("sentences", []):
            total_sents += 1
            c = len(s.get("evidence", []))
            support_counts.append(c)
            if c >= 1:
                supported += 1
    if total_sents == 0:
        attr1 = 0.0; unsup = 0.0; mean_sup = 0.0
    else:
        attr1 = supported / total_sents
        unsup = 1.0 - attr1
        mean_sup = float(np.mean(support_counts))
    rows.append({
        "system": name,
        "Attribution_at_1": round(attr1, 4),
        "Unsupported_rate": round(unsup, 4),
        "Mean_entailing_supports": round(mean_sup, 4),
        "Total_summary_sentences": total_sents
    })

with open(art / "attr_table.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=rows[0].keys())
    w.writeheader(); w.writerows(rows)

rows

[{'system': 'base',
  'Attribution_at_1': 0.972,
  'Unsupported_rate': 0.028,
  'Mean_entailing_supports': 4.0366,
  'Total_summary_sentences': 930},
 {'system': 'poca',
  'Attribution_at_1': 0.9584,
  'Unsupported_rate': 0.0416,
  'Mean_entailing_supports': 3.8775,
  'Total_summary_sentences': 865}]