In [1]:
import os
import sys

# Root of the SummaC code checkout that provides utils_summac_benchmark.
# Defaults to the original location; set the SUMMAC_HOME environment variable
# to run the notebook from a different machine or directory.
SUMMAC_HOME = os.environ.get("SUMMAC_HOME", "/home/phillab/summac/")
sys.path.insert(0, SUMMAC_HOME)
from utils_summac_benchmark import SummaCBenchmark
import utils_summac_benchmark

# Build the benchmark with cut="test"; later cells iterate over benchmark.tasks.
benchmark = SummaCBenchmark(cut="test")

# Prints one row per dataset (name, N, N_pos, N_neg, frac_pos).
benchmark.print_stats()

Using custom data configuration default
Reusing dataset xsum (/home/phillab/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499)
Reusing dataset cnn_dailymail (/home/phillab/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


        name     N  N_pos  N_neg  frac_pos
0  cogensumm   400    312     88  0.780000
1  xsumfaith  1250    130   1120  0.104000
2   polytope   634     41    593  0.064669
3     factcc   503    441     62  0.876740
4   summeval   850    770     80  0.905882
5      frank  1575    529   1046  0.335873


# Table 2: Main Table of Results


In [5]:
import sklearn, torch, numpy as np, json, os, tqdm, pandas as pd, nltk, utils_misc, seaborn as sns
from model_guardrails import NERInaccuracyPenalty
from model_summac import SummaCHisto, SummaCZS
from model_baseline import BaselineScorer
from model_entailment import EntailmentScorer
from model_classifier import Classifier
from utils_scoring import ScorerWrapper

use_cache = True

# Directory that holds the fine-tuned checkpoints. Defaults to the original
# location; set the SUMMAC_MODELS_DIR environment variable to point elsewhere.
MODELS_DIR = os.environ.get("SUMMAC_MODELS_DIR", "/home/phillab/models")

# One dict per scorer handed to ScorerWrapper. Entries carrying the "only_doc"
# key are excluded from the paragraph-level wrapper built further down.
scorers = [
#     {"name": "NER", "model": NERInaccuracyPenalty(flipped=True), "sign": 1},
#     {"name": "MNLI", "model": EntailmentScorer(model_card="roberta-large-mnli", contradiction_idx=0), "sign": 1},
    # {"name": "FactCC-CLS", "model": Classifier(model_card="roberta-base", score_class=1, model_file="/home/phillab/models/cls_roberta-base_factcc_first_0_f1_0.4766.bin"), "sign": 1, "only_doc": True},
    {
        "name": "DAE",
        "model": BaselineScorer(model="dae"),
        "only_doc": True,
        "sign": 1,
    },
    {
        "name": "FEQA",
        "model": BaselineScorer(model="feqa"),
        "only_doc": True,
        "sign": 1,
    },
    {
        "name": "QuestEval",
        "model": BaselineScorer(model="questeval"),
        "only_doc": True,
        "sign": 1,
    },
    {
        "name": "SummaC-ZS-VITC-L",
        "model": SummaCZS(granularity="sentence", model_name="vitc", imager_load_cache=use_cache),
        "sign": 1,
        "only_doc": True,
    },
    {
        "name": "SummaC-Histo-VITC-L",
        "model": SummaCHisto(
            models=["vitc"],
            granularity="sentence",
            start_file=os.path.join(MODELS_DIR, "summac", "vitc_sentence_percentile_e_bacc0.751.bin"),
            bins="percentile",
            imager_load_cache=use_cache,
        ),
        "sign": 1,
        "only_doc": True,
    },
#     {"name": "SummaC-Histo-Multi-L", "model": SummaCHisto(models=["mnli", "anli", "vitc"], granularity="sentence", start_file="/home/phillab/models/summac/no_con_histo_multi_sentence_percentile_bacc0.774.bin", bins="percentile", imager_load_cache=use_cache, use_con=False), "sign": 1, "only_doc": True},
]

# NOTE(review): use_caching is hard-coded to True here and is not tied to
# use_cache above -- confirm that the two switches are meant to be independent.
scorer_doc = ScorerWrapper(scorers, scoring_method="sum", max_batch_size=20, use_caching=True)

# Paragraph-level wrapper: only scorers without the "only_doc" key. Every
# active entry above sets that key, so this wrapper currently gets an empty list.
scorer_para = ScorerWrapper(
    [s for s in scorers if "only_doc" not in s],
    scoring_method="sum",
    max_batch_size=20,
    use_caching=True,
)

<All keys matched successfully>


In [6]:
# Score every benchmark dataset and collect one record per prediction column.
results = []
for dataset in benchmark.tasks:
    print("======= %s ========" % (dataset["name"]))
    datas = dataset["task"]
    # NOTE(review): these helpers presumably write the score and "pred_" fields
    # onto each sample in place, since the code below reads them back from datas.
    utils_summac_benchmark.compute_doc_level(scorer_doc, datas)
    utils_summac_benchmark.compute_paragraph_level(scorer_para, datas)

    labels = [d["label"] for d in datas]
    # Prediction columns are discovered from the first sample only.
    pred_labels = [k for k in datas[0].keys() if "pred_" in k]
    for pred_label in pred_labels:
        # The raw score lives under the same key without the "pred_" marker,
        # formatted as "<model name>|<input type>".
        score_key = pred_label.replace("pred_", "")
        model_name, input_type = score_key.split("|")

        predictions = [d[pred_label] for d in datas]
        scores = [d[score_key] for d in datas]
        balanced_acc = sklearn.metrics.balanced_accuracy_score(labels, predictions)
        roc_auc = sklearn.metrics.roc_auc_score(labels, scores)

        # Metric columns are prefixed with the dataset name, so each record
        # only fills the two columns belonging to its own dataset.
        results.append({
            "model_name": model_name,
            "dataset_name": dataset["name"],
            "input": input_type,
            "%s_bacc" % (dataset["name"]): balanced_acc,
            "%s_roc_auc" % (dataset["name"]): roc_auc,
        })

100%|██████████| 400/400 [00:00<00:00, 3229.47it/s]




100%|██████████| 11100/11100 [00:00<00:00, 1539016.05it/s]
 46%|████▋     | 580/1250 [00:00<00:00, 5619.17it/s]



100%|██████████| 1250/1250 [00:00<00:00, 5583.27it/s]
100%|██████████| 24117/24117 [00:00<00:00, 1356534.03it/s]
100%|██████████| 634/634 [00:00<00:00, 4579.27it/s]




100%|██████████| 14348/14348 [00:00<00:00, 1299781.29it/s]
100%|██████████| 503/503 [00:00<00:00, 5521.23it/s]




100%|██████████| 10154/10154 [00:00<00:00, 1392069.13it/s]
 49%|████▉     | 420/850 [00:00<00:00, 4188.94it/s]



100%|██████████| 850/850 [00:00<00:00, 4218.98it/s]
100%|██████████| 850/850 [00:00<00:00, 894420.07it/s]
 37%|███▋      | 580/1575 [00:00<00:00, 5789.76it/s]



100%|██████████| 1575/1575 [00:00<00:00, 4483.06it/s]
100%|██████████| 1575/1575 [00:00<00:00, 1042124.75it/s]


In [7]:
# If needed for caching: persist the imager cache of every SummaC scorer.
for scorer in scorers:
    # Only entries whose name contains "SummaC" get save_imager_cache() called.
    if "SummaC" not in scorer["name"]:
        continue
    scorer["model"].save_imager_cache()

In [8]:
# Light-to-green colormap, passed as cmap to background_gradient when styling the result tables.
cm = sns.light_palette("green", as_cmap=True)

def highlight_max(data):
    """Return one CSS string per element, bolding every element equal to the maximum.

    Written for use with ``Styler.apply``: elements equal to ``data.max()``
    get ``'font-weight: bold'``; all others get an empty string.
    """
    peak = data.max()
    return ['font-weight: bold' if hit else '' for hit in data == peak]

# Balanced accuracy table: one row per (model, input type), one column per dataset.
# Each record in `results` only fills the metric column of its own dataset, so the
# per-group mean collapses the records of a group into a single row.
df = (
    pd.DataFrame(results)
    .groupby(["model_name", "input"])
    .agg({"%s_bacc" % (d): "mean" for d in benchmark.task_name_to_task})
    .rename(columns=lambda k: k.replace("_bacc", ""))
    .drop("total")  # remove the rows labelled "total" from the comparison
)
# Unweighted mean over the six benchmark datasets.
df["overall"] = (df["factcc"]+df["frank"]+df["polytope"]+df["cogensumm"]+df["summeval"]+df["xsumfaith"]) / (6.0)

# Styler.set_precision was removed in pandas 2.0; an explicit float format
# renders the same three decimals on both old and new pandas versions.
df.style.apply(highlight_max).background_gradient(cmap=cm, high=1.0, low=0.0).format("{:.3f}").set_caption("Balanced Accuracy")

Unnamed: 0_level_0,Unnamed: 1_level_0,cogensumm,xsumfaith,polytope,factcc,summeval,frank,overall
model_name,input,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DAE,doc,0.634,0.508,0.628,0.759,0.703,0.617,0.642
FEQA,doc,0.61,0.56,0.578,0.536,0.538,0.699,0.587
QuestEval,doc,0.626,0.621,0.703,0.666,0.725,0.821,0.694
SummaC-Histo-VITC-L,doc,0.625,0.652,0.628,0.899,0.818,0.817,0.74
SummaC-ZS-VITC-L,doc,0.704,0.584,0.62,0.838,0.787,0.79,0.721


In [9]:
# ROC AUC table: one row per (model, input type), one column per dataset.
# Each record in `results` only fills the metric column of its own dataset, so the
# per-group mean collapses the records of a group into a single row.
df = (
    pd.DataFrame(results)
    .groupby(["model_name", "input"])
    .agg({"%s_roc_auc" % (d): "mean" for d in benchmark.task_name_to_task})
    .rename(columns=lambda k: k.replace("_roc_auc", ""))
    .drop("total")  # remove the rows labelled "total" from the comparison
)
# Unweighted mean over the six benchmark datasets.
df["overall"] = (df["factcc"]+df["frank"]+df["polytope"]+df["cogensumm"]+df["summeval"]+df["xsumfaith"]) / (6.0)

# Styler.set_precision was removed in pandas 2.0; an explicit float format
# renders the same three decimals on both old and new pandas versions.
df.style.apply(highlight_max).background_gradient(cmap=cm, high=1.0, low=0.0).format("{:.3f}").set_caption("ROC AUC")

Unnamed: 0_level_0,Unnamed: 1_level_0,cogensumm,xsumfaith,polytope,factcc,summeval,frank,overall
model_name,input,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DAE,doc,0.678,0.413,0.641,0.827,0.774,0.643,0.663
FEQA,doc,0.608,0.534,0.546,0.507,0.522,0.748,0.577
QuestEval,doc,0.644,0.664,0.722,0.715,0.79,0.879,0.736
SummaC-Histo-VITC-L,doc,0.669,0.656,0.626,0.93,0.861,0.887,0.772
SummaC-ZS-VITC-L,doc,0.731,0.58,0.603,0.837,0.855,0.853,0.743


# Inter-Annotator Agreements

In [3]:
import statsmodels, numpy as np

from statsmodels.stats.inter_rater import fleiss_kappa

In [30]:
from collections import Counter

def vectorize(annotations):
    """Return ``[number of 0 labels, number of 1 labels]`` for one annotation list.

    Values other than 0 and 1 are not counted.
    """
    return [annotations.count(label) for label in (0, 1)]

def build_fleiss_matrix(all_annotations):
    """Build the per-sample category-count matrix from the annotation lists.

    The number of annotations kept per sample is the most common list length
    in ``all_annotations``. Samples with fewer annotations are dropped and
    longer lists are cut to their first ``num_annots`` entries, so every row
    is built from the same number of annotations.

    Returns a NumPy array with one ``vectorize`` row per retained sample.
    """
    length_histogram = Counter(len(annots) for annots in all_annotations)
    most_frequent = length_histogram.most_common(1)[0]
    num_annots = most_frequent[0]

    rows = []
    for annots in all_annotations:
        if len(annots) < num_annots:
            continue
        rows.append(vectorize(annots[:num_annots]))
    return np.array(rows)

# Report Fleiss' kappa for each dataset, skipping single-annotation datasets.
for task in benchmark.tasks:
    print("================== %s ====================" % (task["name"].upper()))

    # Only the first sample is inspected to decide whether the dataset is skipped.
    if len(task["task"][0]["annotations"]) != 1:
        all_annotations = [d["annotations"] for d in task["task"]]
        fleiss_matrix = build_fleiss_matrix(all_annotations)
        print("Fleiss Kappa Agreement: %.3f" % (fleiss_kappa(fleiss_matrix)))
    else:
        print("Analysis is skipped (only 1 annotation)")
    
    

Analysis is skipped (only 1 annotation)
Fleiss Kappa Agreement: 0.530
Analysis is skipped (only 1 annotation)
Analysis is skipped (only 1 annotation)
Fleiss Kappa Agreement: 0.700
Fleiss Kappa Agreement: 0.796
