In [1]:
from utils_summac_benchmark import SummaCBenchmark
import utils_summac_benchmark, random

# Load the "test" cut of the SummaC benchmark from the local benchmark folder.
# NOTE(review): the folder is an absolute, machine-specific path.
benchmark = SummaCBenchmark(
    benchmark_folder="/home/phillab/data/summac_benchmark/", cut="test"
)
# Print per-dataset statistics (name, N, N_pos, N_neg, frac_pos in the output below).
benchmark.print_stats()

Using custom data configuration default
Reusing dataset xsum (/home/phillab/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499)
Reusing dataset cnn_dailymail (/home/phillab/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


        name     N  N_pos  N_neg  frac_pos
0  cogensumm   400    312     88  0.780000
1  xsumfaith  1250    130   1120  0.104000
2   polytope   634     41    593  0.064669
3     factcc   503    441     62  0.876740
4   summeval   850    770     80  0.905882
5      frank  1575    529   1046  0.335873


# Table 2: Main Table of Results


In [2]:
import sklearn, torch, numpy as np, json, os, tqdm, pandas as pd, nltk, seaborn as sns
from model_guardrails import NERInaccuracyPenalty
from model_summac import SummaCConv, SummaCZS
from model_baseline import BaselineScorer

# from model_entailment import EntailmentScorer
from model_classifier import Classifier
from utils_scoring import ScorerWrapper

# Forwarded as `imager_load_cache` to both SummaC models below.
use_cache = True
# Scorers evaluated on the benchmark. Every active entry provides a display
# "name", the scoring "model", a "sign" and an "only_doc" flag; the "name" is
# what later cells match on (e.g. `"SummaC" in scorer["name"]`).
scorers = [
    {
        "name": "NER",
        "model": NERInaccuracyPenalty(flipped=True),
        "only_doc": True,
        "sign": 1,
    },
    # Disabled scorers, kept for reference:
    #     {"name": "MNLI", "model": EntailmentScorer(model_card="roberta-large-mnli", contradiction_idx=0), "sign": 1},
    # {"name": "FactCC-CLS", "model": Classifier(model_card="roberta-base", score_class=1, model_file="/home/phillab/models/cls_roberta-base_factcc_first_0_f1_0.4766.bin"), "sign": 1, "only_doc": True},
    {"name": "DAE", "model": BaselineScorer(model="dae"), "only_doc": True, "sign": 1},
    {
        "name": "FEQA",
        "model": BaselineScorer(model="feqa"),
        "only_doc": True,
        "sign": 1,
    },
    {
        "name": "QuestEval",
        "model": BaselineScorer(model="questeval"),
        "only_doc": True,
        "sign": 1,
    },
    # Zero-shot SummaC variant at sentence granularity.
    {
        "name": "SummaC-ZS-VITC-L",
        "model": SummaCZS(
            granularity="sentence", model_name="vitc", imager_load_cache=use_cache
        ),
        "sign": 1,
        "only_doc": True,
    },
    # Convolution ("Histo") SummaC variant.
    # NOTE(review): `start_file` is presumably the checkpoint the model is
    # initialised from; it is an absolute, machine-specific path.
    {
        "name": "SummaC-Histo-VITC-L",
        "model": SummaCConv(
            models=["vitc"],
            granularity="sentence",
            start_file="/home/phillab/models/summac/vitc_sentence_percentile_e_bacc0.744.bin",
            bins="percentile",
            imager_load_cache=use_cache,
            device="cpu",
        ),
        "sign": 1,
        "only_doc": True,
    },
]

# Wrapper over all scorers; passed to compute_doc_level in the evaluation loop.
scorer_doc = ScorerWrapper(
    scorers, scoring_method="sum", max_batch_size=20, use_caching=True
)
# Wrapper over the scorers that have no "only_doc" key; passed to
# compute_paragraph_level in the evaluation loop.
# NOTE(review): every active entry in `scorers` defines "only_doc", so this
# filter currently selects nothing — confirm that an empty wrapper is intended.
scorer_para = ScorerWrapper(
    [s for s in scorers if "only_doc" not in s],
    scoring_method="sum",
    max_batch_size=20,
    use_caching=True,
)

<All keys matched successfully>


In [3]:
# Score every benchmark dataset with every scorer and collect one record per
# (dataset, prediction column) pair in `results`.
results = []
for dataset in benchmark.tasks:
    print("======= %s ========" % (dataset["name"]))
    datas = dataset["task"]
    # NOTE(review): these helpers presumably annotate each datum in place with
    # "pred_<model>|<input>" and "<model>|<input>" keys, since the code below
    # reads such keys from `datas` — confirm against utils_summac_benchmark.
    utils_summac_benchmark.compute_doc_level(scorer_doc, datas)
    utils_summac_benchmark.compute_paragraph_level(scorer_para, datas)

    labels = [d["label"] for d in datas]
    # Prediction columns are discovered from the first datum only.
    pred_labels = [k for k in datas[0].keys() if "pred_" in k]
    for pred_label in pred_labels:
        # After removing "pred_", the key must split on "|" into exactly two
        # parts: the model name and the input type.
        model_name, input_type = pred_label.replace("pred_", "").split("|")
        preds = [d[pred_label] for d in datas]
        # The raw score is read from the same key without the "pred_" part.
        scores = [d[pred_label.replace("pred_", "")] for d in datas]
        balanced_acc = sklearn.metrics.balanced_accuracy_score(labels, preds)
        roc_auc = sklearn.metrics.roc_auc_score(labels, scores)

        # Metric keys embed the dataset name so that the later groupby/agg
        # produces one column per dataset; labels/preds/scores are kept for
        # the resampling analysis further down.
        results.append(
            {
                "model_name": model_name,
                "dataset_name": dataset["name"],
                "input": input_type,
                "%s_bacc" % (dataset["name"]): balanced_acc,
                "%s_roc_auc" % (dataset["name"]): roc_auc,
                "labels": labels,
                "preds": preds,
                "scores": scores,
            }
        )

  0%|          | 0/400 [00:00<?, ?it/s]



100%|██████████| 400/400 [00:10<00:00, 36.95it/s]
100%|██████████| 11100/11100 [00:00<00:00, 1611853.43it/s]
  8%|▊         | 100/1250 [00:00<00:01, 842.17it/s]



100%|██████████| 1250/1250 [00:01<00:00, 680.07it/s]
100%|██████████| 24117/24117 [00:00<00:00, 1446235.21it/s]
 19%|█▉        | 120/634 [00:00<00:00, 1134.43it/s]



100%|██████████| 634/634 [00:00<00:00, 1139.87it/s]
100%|██████████| 14348/14348 [00:00<00:00, 1441434.10it/s]
 20%|█▉        | 100/503 [00:00<00:00, 924.54it/s]



100%|██████████| 503/503 [00:00<00:00, 895.87it/s]
100%|██████████| 10154/10154 [00:00<00:00, 1481819.10it/s]
 14%|█▍        | 120/850 [00:00<00:00, 1096.83it/s]



100%|██████████| 850/850 [00:00<00:00, 860.55it/s] 
100%|██████████| 850/850 [00:00<00:00, 1015714.64it/s]
  6%|▋         | 100/1575 [00:00<00:01, 979.90it/s]



100%|██████████| 1575/1575 [00:02<00:00, 748.74it/s]
100%|██████████| 1575/1575 [00:00<00:00, 1032353.31it/s]


In [7]:
# If needed for caching: call save_imager_cache() on every scorer whose name
# contains "SummaC" (the two SummaC models in `scorers`).
for scorer in scorers:
    if "SummaC" in scorer["name"]:
        scorer["model"].save_imager_cache()

## Balanced accuracy score

In [4]:
# Light-green colormap, used as the background gradient of the result tables.
cm = sns.light_palette("green", as_cmap=True)


def highlight_max(data):
    """Build the CSS strings that bold the largest entry (or entries) of ``data``.

    Used as a ``Styler.apply`` callback in this notebook: every position whose
    value equals ``data.max()`` gets ``"font-weight: bold"``, all others get an
    empty string. Ties are all highlighted.
    """
    peak = data.max()
    styles = []
    for at_peak in data == peak:
        styles.append("font-weight: bold" if at_peak else "")
    return styles


# Balanced-accuracy table: one row per (model_name, input), one column per dataset.
# Each record in `results` carries the metric of a single dataset, so the
# per-group mean collapses the per-dataset records into one row.
df = pd.DataFrame(results)
df = df.groupby(["model_name", "input"]).agg(
    {"%s_bacc" % (d): "mean" for d in benchmark.task_name_to_task}
)
df.rename(columns={k: k.replace("_bacc", "") for k in df.keys()}, inplace=True)
# Drop the aggregate "total" scorer when present. errors="ignore" keeps the
# cell runnable when no scorer named "total" was evaluated (a plain drop would
# raise KeyError in that case).
df.drop("total", inplace=True, errors="ignore")
# Unweighted mean over the six benchmark datasets.
df["overall"] = (
    df["factcc"]
    + df["frank"]
    + df["polytope"]
    + df["cogensumm"]
    + df["summeval"]
    + df["xsumfaith"]
) / (6.0)

styled = df.style.apply(highlight_max).background_gradient(
    cmap=cm, high=1.0, low=0.0
)
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0, where
# Styler.format(precision=...) is the replacement; use whichever exists.
styled = (
    styled.set_precision(3)
    if hasattr(styled, "set_precision")
    else styled.format(precision=3)
)
styled.set_caption("Balanced Accuracy")

Unnamed: 0_level_0,Unnamed: 1_level_0,cogensumm,xsumfaith,polytope,factcc,summeval,frank,overall
model_name,input,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DAE,doc,0.634,0.508,0.628,0.759,0.703,0.617,0.642
FEQA,doc,0.61,0.56,0.578,0.536,0.538,0.699,0.587
NER,doc,0.502,0.623,0.517,0.5,0.568,0.609,0.553
QuestEval,doc,0.626,0.621,0.703,0.666,0.725,0.821,0.694
SummaC-Histo-VITC-L,doc,0.647,0.664,0.627,0.895,0.817,0.816,0.744
SummaC-ZS-VITC-L,doc,0.704,0.584,0.62,0.838,0.787,0.79,0.721


In [16]:
# Analysis with confidence interval
# Baseline that the SummaC models are compared against on each dataset; these
# are the non-SummaC rows with the highest balanced accuracy in the table above.
strongest_baseline = {
    "cogensumm": "DAE",
    "xsumfaith": "NER",
    "polytope": "QuestEval",
    "factcc": "DAE",
    "summeval": "QuestEval",
    "frank": "QuestEval",
}

# Percentile cut-offs: np.percentile is called below with P and 100 - P, i.e.
# 2.5/97.5 for the "p < 0.05" interval and 0.5/99.5 for the "p < 0.01" one.
P5 = 5 / 2  # Correction due to the fact that we are running 2 tests with the same data
P1 = 1 / 2  # Correction due to the fact that we are running 2 tests with the same data


def resample_balanced_acc(preds, labels, n_samples=100, sample_ratio=0.7):
    """Balanced accuracy on repeated random subsets of the predictions.

    Args:
        preds: Predicted labels, index-aligned with ``labels``.
        labels: Reference labels.
        n_samples: Number of random subsets to draw.
        sample_ratio: Fraction of the items kept in each subset; the subset
            size is ``int(sample_ratio * len(preds))``.

    Returns:
        A list of ``n_samples`` balanced-accuracy values, one per subset.

    Each subset is drawn without replacement by shuffling the index list with
    the global ``random`` module and keeping its first entries.
    """
    order = list(range(len(preds)))
    keep = int(sample_ratio * len(preds))

    def _score_one_subset():
        # Shuffle in place, then score the leading `keep` indices.
        random.shuffle(order)
        chosen = order[:keep]
        return sklearn.metrics.balanced_accuracy_score(
            [labels[j] for j in chosen], [preds[j] for j in chosen]
        )

    return [_score_one_subset() for _ in range(n_samples)]


# Per-dataset report: balanced accuracy with resampled intervals for the
# strongest baseline and the SummaC models, followed by an overall summary.
print("DATASET NAME".ljust(15), "MODEL NAME".ljust(20))

# model name -> list of resampled score lists, one per dataset (in `results` order).
sampled_batch_preds = {res["model_name"]: [] for res in results}
for res in results:
    # The aggregate "total" scorer is not analysed; it only prints a separator.
    if res["model_name"] == "total":
        print("==================================================")
        continue

    samples = resample_balanced_acc(res["preds"], res["labels"])
    sampled_batch_preds[res["model_name"]].append(samples)
    # Interval bounds at the P5 and P1 percentile cut-offs.
    low5, high5 = np.percentile(samples, P5), np.percentile(samples, 100 - P5)
    low1, high1 = np.percentile(samples, P1), np.percentile(samples, 100 - P1)
    # Point estimate on the full dataset.
    bacc = sklearn.metrics.balanced_accuracy_score(res["labels"], res["preds"])
    # Only the SummaC models and the dataset's strongest baseline are reported.
    if (
        "SummaC" in res["model_name"]
        or res["model_name"] == strongest_baseline[res["dataset_name"]]
    ):

        print(
            res["dataset_name"].ljust(15),
            res["model_name"].ljust(20),
            " - %.3f (%.3f - %.3f) (%.3f - %.3f)" % (bacc, low5, high5, low1, high1),
        )
        if res["model_name"] == strongest_baseline[res["dataset_name"]]:
            # Remember the baseline's bounds for the SummaC rows that follow.
            # NOTE(review): this relies on the baseline record preceding the
            # SummaC records of the same dataset in `results`; otherwise
            # bh5/bh1 are undefined or left over from the previous dataset.
            bl5, bh5, bl1, bh1 = low5, high5, low1, high1
            print("--------------")
        else:
            # Significant when the SummaC lower bound reaches the baseline's
            # upper bound, i.e. the two intervals do not overlap.
            if low5 >= bh5:
                print("Significant difference (p < 0.05)")
            if low1 >= bh1:
                print("Significant difference (p < 0.01)")

print("==========================")
print("==========================")
print("==========================")

# Overall summary: average the i-th resample across datasets (axis=0).
# NOTE(review): these three names are not read again in this cell.
baseline = np.mean(np.array(sampled_batch_preds["QuestEval"]), axis=0)
summaczs = np.mean(np.array(sampled_batch_preds["SummaC-ZS-VITC-L"]), axis=0)
summacconv = np.mean(np.array(sampled_batch_preds["SummaC-Histo-VITC-L"]), axis=0)

for model in ["QuestEval", "SummaC-ZS-VITC-L", "SummaC-Histo-VITC-L"]:
    samples = np.mean(np.array(sampled_batch_preds[model]), axis=0)
    low5, high5 = np.percentile(samples, P5), np.percentile(samples, 100 - P5)
    low1, high1 = np.percentile(samples, P1), np.percentile(samples, 100 - P1)

    print(
        "OVERALL".ljust(15),
        model.ljust(20),
        " - (%.3f - %.3f) (%.3f - %.3f)" % (low5, high5, low1, high1),
    )

DATASET NAME    MODEL NAME          
cogensumm       DAE                   - 0.634 (0.598 - 0.677) (0.594 - 0.688)
--------------
cogensumm       SummaC-ZS-VITC-L      - 0.704 (0.668 - 0.745) (0.654 - 0.749)
cogensumm       SummaC-Histo-VITC-L   - 0.647 (0.618 - 0.680) (0.612 - 0.684)
xsumfaith       NER                   - 0.623 (0.610 - 0.640) (0.607 - 0.644)
--------------
xsumfaith       SummaC-ZS-VITC-L      - 0.584 (0.561 - 0.606) (0.553 - 0.614)
xsumfaith       SummaC-Histo-VITC-L   - 0.664 (0.643 - 0.694) (0.638 - 0.704)
Significant difference (p < 0.05)
polytope        QuestEval             - 0.703 (0.672 - 0.742) (0.657 - 0.745)
--------------
polytope        SummaC-ZS-VITC-L      - 0.620 (0.570 - 0.667) (0.557 - 0.684)
polytope        SummaC-Histo-VITC-L   - 0.627 (0.552 - 0.680) (0.547 - 0.690)
factcc          DAE                   - 0.759 (0.720 - 0.797) (0.708 - 0.808)
--------------
factcc          SummaC-ZS-VITC-L      - 0.838 (0.809 - 0.870) (0.803 - 0.880)
Significant

## ROC AUC score

In [6]:
# ROC-AUC table: one row per (model_name, input), one column per dataset.
# Each record in `results` carries the metric of a single dataset, so the
# per-group mean collapses the per-dataset records into one row.
df = pd.DataFrame(results)
df = df.groupby(["model_name", "input"]).agg(
    {"%s_roc_auc" % (d): "mean" for d in benchmark.task_name_to_task}
)
df.rename(columns={k: k.replace("_roc_auc", "") for k in df.keys()}, inplace=True)
# Drop the aggregate "total" scorer when present. errors="ignore" keeps the
# cell runnable when no scorer named "total" was evaluated (a plain drop would
# raise KeyError in that case).
df.drop("total", inplace=True, errors="ignore")
# Unweighted mean over the six benchmark datasets.
df["overall"] = (
    df["factcc"]
    + df["frank"]
    + df["polytope"]
    + df["cogensumm"]
    + df["summeval"]
    + df["xsumfaith"]
) / (6.0)

styled = df.style.apply(highlight_max).background_gradient(
    cmap=cm, high=1.0, low=0.0
)
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0, where
# Styler.format(precision=...) is the replacement; use whichever exists.
styled = (
    styled.set_precision(3)
    if hasattr(styled, "set_precision")
    else styled.format(precision=3)
)
styled.set_caption("ROC AUC")

Unnamed: 0_level_0,Unnamed: 1_level_0,cogensumm,xsumfaith,polytope,factcc,summeval,frank,overall
model_name,input,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DAE,doc,0.678,0.413,0.641,0.827,0.774,0.643,0.663
FEQA,doc,0.608,0.534,0.546,0.507,0.522,0.748,0.577
NER,doc,0.502,0.623,0.517,0.5,0.568,0.609,0.553
QuestEval,doc,0.644,0.664,0.722,0.715,0.79,0.879,0.736
SummaC-Histo-VITC-L,doc,0.676,0.702,0.624,0.922,0.86,0.884,0.778
SummaC-ZS-VITC-L,doc,0.731,0.58,0.603,0.837,0.855,0.853,0.743


In [17]:
# Analysis with confidence interval
# Baseline that the SummaC models are compared against on each dataset; these
# are the non-SummaC rows with the highest ROC AUC in the table above.
strongest_baseline = {
    "cogensumm": "DAE",
    "xsumfaith": "QuestEval",
    "polytope": "QuestEval",
    "factcc": "DAE",
    "summeval": "QuestEval",
    "frank": "QuestEval",
}

# Percentile cut-offs: np.percentile is called below with P and 100 - P, i.e.
# 2.5/97.5 for the "p < 0.05" interval and 0.5/99.5 for the "p < 0.01" one.
P5 = 5 / 2  # Correction due to the fact that we are running 2 tests with the same data
P1 = 1 / 2  # Correction due to the fact that we are running 2 tests with the same data


def resample_roc_auc(scores, labels, n_samples=100, sample_ratio=0.7):
    """ROC AUC on repeated random subsets of the scored items.

    Args:
        scores: Model scores, index-aligned with ``labels``.
        labels: Reference labels.
        n_samples: Number of random subsets to draw.
        sample_ratio: Fraction of the items kept in each subset; the subset
            size is ``int(sample_ratio * len(scores))``.

    Returns:
        A list of ``n_samples`` ROC-AUC values, one per subset.

    Each subset is drawn without replacement by shuffling the index list with
    the global ``random`` module and keeping its first entries.
    """
    total = len(scores)
    positions = list(range(total))
    subset_size = int(sample_ratio * total)

    aucs = []
    for _ in range(n_samples):
        # Shuffle in place, then score the leading `subset_size` positions.
        random.shuffle(positions)
        picked = positions[:subset_size]
        aucs.append(
            sklearn.metrics.roc_auc_score(
                [labels[p] for p in picked], [scores[p] for p in picked]
            )
        )
    return aucs


# Per-dataset report: ROC AUC with resampled intervals for the strongest
# baseline and the SummaC models, followed by an overall summary.
# model name -> list of resampled score lists, one per dataset (in `results` order).
sampled_batch_preds = {res["model_name"]: [] for res in results}
print("DATASET NAME".ljust(15), "MODEL NAME".ljust(20))
for res in results:
    # The aggregate "total" scorer is not analysed; it only prints a separator.
    if res["model_name"] == "total":
        print("==================================================")
        continue
    samples = resample_roc_auc(res["scores"], res["labels"])
    sampled_batch_preds[res["model_name"]].append(samples)
    # Interval bounds at the P5 and P1 percentile cut-offs.
    low5, high5 = np.percentile(samples, P5), np.percentile(samples, 100 - P5)
    low1, high1 = np.percentile(samples, P1), np.percentile(samples, 100 - P1)
    # Point estimate on the full dataset.
    roc_auc = sklearn.metrics.roc_auc_score(res["labels"], res["scores"])
    # Only the SummaC models and the dataset's strongest baseline are reported.
    if (
        "SummaC" in res["model_name"]
        or res["model_name"] == strongest_baseline[res["dataset_name"]]
    ):
        print(
            res["dataset_name"].ljust(15),
            res["model_name"].ljust(20),
            " - %.3f (%.3f - %.3f) (%.3f - %.3f)" % (roc_auc, low5, high5, low1, high1),
        )
        if res["model_name"] == strongest_baseline[res["dataset_name"]]:
            # Remember the baseline's bounds for the SummaC rows that follow.
            # NOTE(review): this relies on the baseline record preceding the
            # SummaC records of the same dataset in `results`; otherwise
            # bh5/bh1 are undefined or left over from the previous dataset.
            bl5, bh5, bl1, bh1 = low5, high5, low1, high1
            print("--------------")
        else:
            # Significant when the SummaC lower bound reaches the baseline's
            # upper bound, i.e. the two intervals do not overlap.
            if low5 >= bh5:
                print("Significant difference (p < 0.05)")
            if low1 >= bh1:
                print("Significant difference (p < 0.01)")

print("==========================")
print("==========================")
print("==========================")

# Overall summary: average the i-th resample across datasets (axis=0).
# NOTE(review): these three names are not read again in this cell.
baseline = np.mean(np.array(sampled_batch_preds["QuestEval"]), axis=0)
summaczs = np.mean(np.array(sampled_batch_preds["SummaC-ZS-VITC-L"]), axis=0)
summacconv = np.mean(np.array(sampled_batch_preds["SummaC-Histo-VITC-L"]), axis=0)

for model in ["QuestEval", "SummaC-ZS-VITC-L", "SummaC-Histo-VITC-L"]:
    samples = np.mean(np.array(sampled_batch_preds[model]), axis=0)
    low5, high5 = np.percentile(samples, P5), np.percentile(samples, 100 - P5)
    low1, high1 = np.percentile(samples, P1), np.percentile(samples, 100 - P1)

    print(
        "OVERALL".ljust(15),
        model.ljust(20),
        " - (%.3f - %.3f) (%.3f - %.3f)" % (low5, high5, low1, high1),
    )

DATASET NAME    MODEL NAME          
cogensumm       DAE                   - 0.678 (0.639 - 0.726) (0.632 - 0.735)
--------------
cogensumm       SummaC-ZS-VITC-L      - 0.731 (0.697 - 0.767) (0.685 - 0.778)
cogensumm       SummaC-Histo-VITC-L   - 0.676 (0.633 - 0.716) (0.627 - 0.720)
xsumfaith       QuestEval             - 0.664 (0.631 - 0.688) (0.626 - 0.699)
--------------
xsumfaith       SummaC-ZS-VITC-L      - 0.580 (0.552 - 0.615) (0.547 - 0.616)
xsumfaith       SummaC-Histo-VITC-L   - 0.702 (0.675 - 0.733) (0.666 - 0.740)
polytope        QuestEval             - 0.722 (0.683 - 0.762) (0.682 - 0.766)
--------------
polytope        SummaC-ZS-VITC-L      - 0.603 (0.529 - 0.667) (0.524 - 0.685)
polytope        SummaC-Histo-VITC-L   - 0.624 (0.560 - 0.679) (0.530 - 0.696)
factcc          DAE                   - 0.827 (0.793 - 0.863) (0.787 - 0.881)
--------------
factcc          SummaC-ZS-VITC-L      - 0.837 (0.800 - 0.879) (0.786 - 0.891)
factcc          SummaC-Histo-VITC-L   - 0.922