In [1]:
from google.colab import drive
drive.mount('/content/drive')

!unzip -q "/content/drive/My Drive/AmaSum/raw_min_10_max_100_revs.zip" -d /content/AmaSum

Mounted at /content/drive


In [2]:
!pip -q install rouge-score nltk pandas==2.2.2 tqdm==4.66.4
import os, glob, json, random, pathlib, pandas as pd
from rouge_score import rouge_scorer
from nltk.tokenize import sent_tokenize
import nltk; nltk.download('punkt'); nltk.download('punkt_tab')

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.3/78.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dataproc-spark-connect 0.8.3 requires tqdm>=4.67, but you have tqdm 4.66.4 which is incompatible.[0m[31m
[0m

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
BASE = "/content/AmaSum/min_10_max_100_revs_filt_complete"
train_files = sorted(glob.glob(f"{BASE}/train/*.json"))
valid_files = sorted(glob.glob(f"{BASE}/valid/*.json"))
test_files  = sorted(glob.glob(f"{BASE}/test/*.json"))
print(len(train_files), len(test_files), len(valid_files))

25203 3166 3114


In [4]:
!git clone -q https://github.com/yuanyuanlei-nlp/polarity_calibration_naacl_2024.git
GEN = "/content/polarity_calibration_naacl_2024/generated_summary_AmaSum"

In [5]:
test_names_path = f"{GEN}/test_file_names.txt"
with open(test_names_path) as f:
    test_names = [ln.strip() for ln in f if ln.strip()]

print(len(test_names))

200


In [6]:
def load_gold_ref_by_id(file_id: str) -> str:
    with open(f"{BASE}/test/{file_id}.json") as f:
        obj = json.load(f)
    ws = obj["website_summaries"][0]
    verdict = ws.get("verdict","").strip()
    pros = ". ".join(ws.get("pros", [])).strip().rstrip(".")
    cons = ". ".join(ws.get("cons", [])).strip().rstrip(".")
    parts = [p for p in [verdict, pros, cons] if p]
    return (". ".join(parts) + ".").strip()

def load_hyps(path_or_file: str):
    if os.path.isfile(path_or_file):
        return [ln.strip() for ln in open(path_or_file).read().splitlines()]
    files = sorted(glob.glob(os.path.join(path_or_file, "*.txt")))
    if len(files) == 1:
        return [ln.strip() for ln in open(files[0]).read().splitlines()]
    return [open(p).read().strip() for p in files]

def eval_one(hyps, test_names):
    assert len(hyps) == len(test_names), f"Mismatch: {len(hyps)} hyps vs {len(test_names)} ids"
    refs = [load_gold_ref_by_id(fid) for fid in test_names]
    scorer = rouge_scorer.RougeScorer(["rouge1","rouge2","rougeL","rougeLsum"], use_stemmer=True, split_summaries=True)
    rows = []
    for fid, ref, hyp in zip(test_names, refs, hyps):
        sc = scorer.score(ref, hyp)
        rows.append({
            "file": f"{fid}.json",
            "rouge1_f": sc["rouge1"].fmeasure,
            "rouge2_f": sc["rouge2"].fmeasure,
            "rougeL_f": sc["rougeL"].fmeasure,
            "rougeLsum_f": sc["rougeLsum"].fmeasure,
        })
    return pd.DataFrame(rows)

In [7]:
ART = "/content/artifacts"; os.makedirs(ART, exist_ok=True)

targets = {
    "lexrank": f"{GEN}/lexrank.txt",
    "base_flan_t5_large": f"{GEN}/base_summarizer_flan_t5_large.txt",
    "poca_calibrated":  f"{GEN}/calibrated_summarizer_PoCa.txt",
    "hercules_extractive":  f"{GEN}/hercules_extractive.txt",
    "hercules_abstractive":f"{GEN}/hercules_abstractive.txt",
    "copycat": f"{GEN}/copycat.txt",
    "qt": f"{GEN}/qt.txt",
    "semae":  f"{GEN}/semae.txt",
    "bimanevae_avg": f"{GEN}/bimeanvae_avg.txt",
    "bimanevae_coop": f"{GEN}/bimeanvae_coop.txt",
    "gpt_35_turbo":  f"{GEN}/gpt_35_turbo.txt",
    "gpt_4": f"{GEN}/gpt_4.txt",
}

summary_table = []
for name, path in targets.items():
    print(name)
    hyps = load_hyps(path)
    df = eval_one(hyps, test_names)
    csv_path = f"{ART}/rouge_{name}.csv"
    df.to_csv(csv_path, index=False)
    means = df[["rouge1_f","rouge2_f","rougeL_f","rougeLsum_f"]].mean().round(4)
    summary_table.append({"model": name, **means.to_dict()})
    print(csv_path)

results = pd.DataFrame(summary_table)
display(results)

results.to_csv(f"{ART}/rouge_summary_table.csv", index=False)
!ls -lh /content/artifacts

Evaluating lexrank ...
  saved → /content/artifacts/rouge_lexrank.csv
Evaluating base_flan_t5_large ...
  saved → /content/artifacts/rouge_base_flan_t5_large.csv
Evaluating poca_calibrated ...
  saved → /content/artifacts/rouge_poca_calibrated.csv
Evaluating hercules_extractive ...
  saved → /content/artifacts/rouge_hercules_extractive.csv
Evaluating hercules_abstractive ...
  saved → /content/artifacts/rouge_hercules_abstractive.csv
Evaluating copycat ...
  saved → /content/artifacts/rouge_copycat.csv
Evaluating qt ...
  saved → /content/artifacts/rouge_qt.csv
Evaluating semae ...
  saved → /content/artifacts/rouge_semae.csv
Evaluating bimanevae_avg ...
  saved → /content/artifacts/rouge_bimanevae_avg.csv
Evaluating bimanevae_coop ...
  saved → /content/artifacts/rouge_bimanevae_coop.csv
Evaluating gpt_35_turbo ...
  saved → /content/artifacts/rouge_gpt_35_turbo.csv
Evaluating gpt_4 ...
  saved → /content/artifacts/rouge_gpt_4.csv


Unnamed: 0,model,rouge1_f,rouge2_f,rougeL_f,rougeLsum_f
0,lexrank,0.1992,0.0261,0.121,0.1803
1,base_flan_t5_large,0.2923,0.0564,0.1719,0.2626
2,poca_calibrated,0.2842,0.0512,0.1696,0.2536
3,hercules_extractive,0.229,0.0308,0.1255,0.2125
4,hercules_abstractive,0.1982,0.0216,0.1172,0.1871
5,copycat,0.1738,0.0136,0.1095,0.1556
6,qt,0.2117,0.0156,0.1137,0.1924
7,semae,0.2033,0.0162,0.1136,0.1833
8,bimanevae_avg,0.2131,0.0201,0.1232,0.1939
9,bimanevae_coop,0.2367,0.0272,0.1397,0.2137


total 244K
-rw-r--r-- 1 root root 19K Sep  6 19:02 rouge_base_flan_t5_large.csv
-rw-r--r-- 1 root root 18K Sep  6 19:02 rouge_bimanevae_avg.csv
-rw-r--r-- 1 root root 18K Sep  6 19:02 rouge_bimanevae_coop.csv
-rw-r--r-- 1 root root 17K Sep  6 19:02 rouge_copycat.csv
-rw-r--r-- 1 root root 18K Sep  6 19:02 rouge_gpt_35_turbo.csv
-rw-r--r-- 1 root root 19K Sep  6 19:02 rouge_gpt_4.csv
-rw-r--r-- 1 root root 18K Sep  6 19:02 rouge_hercules_abstractive.csv
-rw-r--r-- 1 root root 18K Sep  6 19:02 rouge_hercules_extractive.csv
-rw-r--r-- 1 root root 19K Sep  6 19:02 rouge_lexrank.csv
-rw-r--r-- 1 root root 18K Sep  6 19:02 rouge_poca_calibrated.csv
-rw-r--r-- 1 root root 17K Sep  6 19:02 rouge_qt.csv
-rw-r--r-- 1 root root 18K Sep  6 19:02 rouge_semae.csv
-rw-r--r-- 1 root root 527 Sep  6 19:02 rouge_summary_table.csv
