In [2]:
import sklearn, torch, numpy as np, json, os, tqdm, pandas as pd, nltk, utils_misc, seaborn as sns, sys, glob
sys.path.insert(0, "/home/phillab/summac/")
from model_summac import SummaCHisto, SummaCZS, model_map
from utils_summac_benchmark import SummaCBenchmark
from utils_scoring import ScorerWrapper
import utils_summac_benchmark

# Green sequential colormap; it is not referenced by any of the cells visible in this notebook
# (presumably kept for shading result tables — confirm before removing).
cm = sns.light_palette("green", as_cmap=True)
# Load the "test" cut of the benchmark and print its per-dataset statistics table.
benchmark = SummaCBenchmark(cut="test")
benchmark.print_stats()

def path_to_model_info(file_path):
    """Parse a histogram-model checkpoint path into a metadata dict.

    The file name (last "/"-separated component, with ".bin" removed) must be made of
    exactly five "_"-separated fields:

        <model_card>_<granularity>_<bins>_<nli_labels>_<score>

    e.g. "vitc_sentence_percentile_ecn_f10.738.bin". The score field carries a
    "bacc" or "f1" marker, which is stripped before the value is converted to float.

    Returns:
        dict with keys "model_type" (always "histo"), "model_card", "granularity",
        "bins", "acc" (float), "model_path" (the unmodified input) and "nli_labels".

    Raises:
        ValueError: if the name does not have exactly five fields, or the score
            field is not numeric once the marker is removed.
    """
    stem = file_path.split("/")[-1].replace(".bin", "")
    model_card, granularity, bins, nli_labels, raw_score = stem.split("_")

    # Drop the metric marker, leaving only the numeric part of the score field.
    for marker in ("bacc", "f1"):
        raw_score = raw_score.replace(marker, "")
    score = float(raw_score)

    return {
        "model_type": "histo",
        "model_card": model_card,
        "granularity": granularity,
        "bins": bins,
        "acc": score,
        "model_path": file_path,
        "nli_labels": nli_labels,
    }



        name     N  N_pos  N_neg  frac_pos
0  cogensumm   400    312     88  0.780000
1  xsumfaith  1250    130   1120  0.104000
2   polytope   634     41    593  0.064669
3     factcc   503    441     62  0.876740
4   summeval   850    770     80  0.905882
5      frank  1575    529   1046  0.335873


# Table 3: NLI Model Selection


In [2]:
scorers = []
# One scorer pair per NLI model registered in model_map
# (a "decomp" entry was previously toggled here and is currently excluded).
model_keys = list(model_map.keys())

for model_key in model_keys:
    # The evaluation cell splits scorer names on "-", so hyphens in the key become underscores.
    display_key = model_key.upper().replace("-", "_")

    # Zero-shot scorer at sentence granularity.
    zs_model = SummaCZS(granularity="sentence", model_name=model_key)
    scorers.append({"name": "ZS-%s" % (display_key), "model": zs_model, "sign": 1, "only_doc": True})

    # Histogram-based scorer: load the sentence-level checkpoint with the highest recorded score.
    model_files = glob.glob("/home/phillab/models/summac/%s_sentence*" % (model_key))
    if not model_files:
        print("No model for [%s] was found" % (model_key))
        continue
    ranked = sorted((path_to_model_info(mf) for mf in model_files), key=lambda m: m["acc"])
    best = ranked[-1]
    histo_model = SummaCHisto(
        bins=best["bins"],
        nli_labels=best["nli_labels"],
        models=[model_key],
        granularity="sentence",
        start_file=best["model_path"],
    )
    scorers.append({"name": "Histo-%s" % (display_key), "model": histo_model, "sign": 1})

scorer_doc = ScorerWrapper(scorers, scoring_method="sum", max_batch_size=20, use_caching=True)

<All keys matched successfully>
<All keys matched successfully>
<All keys matched successfully>
<All keys matched successfully>
<All keys matched successfully>
<All keys matched successfully>
<All keys matched successfully>
<All keys matched successfully>


In [3]:
# Fresh benchmark instance: the samples are read back for "pred_*" keys after scoring, so
# starting clean presumably keeps keys from a previous scorer set out — TODO confirm.
benchmark = SummaCBenchmark(cut="test")

# (model_type, nli_name) -> one balanced accuracy per benchmark dataset.
results = {}
for dataset in benchmark.tasks:
    print("======= %s ========" % (dataset["name"]))
    datas = dataset["task"]
    labels = [d["label"] for d in datas]
    utils_summac_benchmark.compute_doc_level(scorer_doc, datas)

    # Keep the per-scorer prediction keys only; anything tagged "total" is skipped.
    pred_labels = [key for key in datas[0].keys() if "pred_" in key and "total" not in key]
    for pred_label in pred_labels:
        preds = [d[pred_label] for d in datas]
        balanced_acc = sklearn.metrics.balanced_accuracy_score(labels, preds)
        # Key layout parsed here: "pred_<scorer name>|<input type>", scorer name "<type>-<nli>".
        model_name, input_type = pred_label.replace("pred_", "").split("|")
        model_type, nli_name = model_name.split("-")
        results.setdefault((model_type, nli_name), []).append(balanced_acc)

# Average each scorer's balanced accuracy across datasets.
cleaned_results = [
    {"nli_name": nli, "model_type": model_type, "score": np.mean(vs)}
    for (model_type, nli), vs in results.items()
]

# Each (nli_name, model_type) group holds a single row, so "sum" just passes the mean through.
# NOTE: Styler.set_precision is deprecated in newer pandas releases.
summary = pd.DataFrame(cleaned_results).groupby(["nli_name", "model_type"]).agg({"score": "sum"})
summary.style.set_precision(3).set_caption("Balanced Accuracy")

Using custom data configuration default
Reusing dataset xsum (/home/phillab/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499)
Reusing dataset cnn_dailymail (/home/phillab/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)
  0%|          | 0/400 [00:00<?, ?it/s]



Some weights of the model checkpoint at microsoft/deberta-base-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-base-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification 



100%|██████████| 1250/1250 [13:06<00:00,  1.59it/s]
  0%|          | 0/634 [00:00<?, ?it/s]



100%|██████████| 634/634 [19:13<00:00,  1.82s/it]
  0%|          | 0/503 [00:00<?, ?it/s]



100%|██████████| 503/503 [08:13<00:00,  1.02it/s]
  5%|▍         | 40/850 [00:00<00:02, 325.65it/s]



100%|██████████| 850/850 [00:02<00:00, 383.71it/s]
  0%|          | 0/1575 [00:00<?, ?it/s]



100%|██████████| 1575/1575 [39:21<00:00,  1.50s/it]


Unnamed: 0_level_0,Unnamed: 1_level_0,score
nli_name,model_type,Unnamed: 2_level_1
ANLI,Histo,0.699
ANLI,ZS,0.717
MNLI,Histo,0.73
MNLI,ZS,0.709
MNLI_BASE,Histo,0.698
MNLI_BASE,ZS,0.695
SNLI_BASE,Histo,0.64
SNLI_BASE,ZS,0.666
SNLI_LARGE,Histo,0.624
SNLI_LARGE,ZS,0.666


In [4]:
# Ask every scorer's model to save its cache (presumably the cached NLI outputs,
# so later runs can skip recomputation — confirm in model_summac).
for scorer in scorers:
    scorer["model"].save_imager_cache()

# Table 4: Choice of NLI Category

In [None]:
# Build one histogram scorer per (NLI model, NLI label subset) combination.
scorers = []
for model_key in ["vitc", "mnli", "anli"]:
    for nli_labels in ["e", "c", "n", "ec", "en", "cn", "ecn"]:
        # Checkpoint names are <card>_<granularity>_<bins>_<labels>_<score>.bin. The "_" after
        # the label field is required: without it the pattern for "e" also matches the
        # "ec"/"en"/"ecn" checkpoints ("c" matches "cn", "ec" matches "ecn"), and the best-scoring
        # match could then be a model trained on a different label set than the scorer name says.
        model_files = glob.glob("/home/phillab/models/summac/%s_sentence_percentile_%s_*" % (model_key, nli_labels))
        if len(model_files) == 0:
            print("No model for [%s, %s] was found" % (model_key, nli_labels))
            continue
        # Keep the checkpoint with the highest score recorded in its file name.
        best = sorted([path_to_model_info(mf) for mf in model_files], key=lambda m: m["acc"])[-1]
        scorers.append({"name": "Histo-%s-%s" % (model_key.upper().replace("-", "_"), nli_labels), "model": SummaCHisto(bins=best["bins"], nli_labels=best["nli_labels"], models=[model_key], granularity="sentence", start_file=best["model_path"]), "sign": 1})

scorer_doc = ScorerWrapper(scorers, max_batch_size=20, use_caching=True)
print("%d scorers loaded" % (len(scorers)))

# Fresh benchmark instance: the samples are read back for "pred_*" keys after scoring, so
# starting clean presumably keeps keys from a previous scorer set out — TODO confirm.
benchmark = SummaCBenchmark(cut="test")

# (model_type, nli_name, nli_labels) -> one balanced accuracy per benchmark dataset.
results = {}
for dataset in benchmark.tasks:
    print("======= %s ========" % (dataset["name"]))
    datas = dataset["task"]
    labels = [d["label"] for d in datas]
    utils_summac_benchmark.compute_doc_level(scorer_doc, datas)

    for pred_label in datas[0].keys():
        # Only per-scorer prediction keys are evaluated; anything tagged "total" is skipped.
        if "pred_" not in pred_label or "total" in pred_label:
            continue
        balanced_acc = sklearn.metrics.balanced_accuracy_score(labels, [d[pred_label] for d in datas])
        # Key layout parsed here: "pred_<scorer name>|<input type>",
        # with scorer name "<type>-<nli>-<labels>".
        model_name, input_type = pred_label.replace("pred_", "").split("|")
        model_type, nli_name, nli_labels = model_name.split("-")
        # model_type is part of the key so each summary row reports its own model type,
        # rather than whatever value the last loop iteration happened to leave behind.
        k = (model_type, nli_name, nli_labels)
        results.setdefault(k, []).append(balanced_acc)

# Average each scorer's balanced accuracy across datasets.
cleaned_results = []
for (model_type, nli, nli_labels), vs in results.items():
    cleaned_results.append({"nli_name": nli, "nli_labels": nli_labels, "model_type": model_type, "score": np.mean(vs)})

# Every scorer built for this table is a "Histo" model, so each (nli_name, nli_labels) group
# holds a single row and "sum" just passes the mean through.
# NOTE: Styler.set_precision is deprecated in newer pandas releases.
pd.DataFrame(cleaned_results).groupby(["nli_name", "nli_labels"]).agg({"score": "sum"}).style.set_precision(3).set_caption("Balanced Accuracy")

# Table 5: Granularity Selection


In [3]:
# Build a zero-shot and a histogram scorer for each (NLI model, granularity) pair.
scorers = []
for model_key in ["mnli", "vitc"]:
    for granularity in ["sentence", "2sents", "paragraph"]:
        zs_name = "ZS-%s-%s" % (model_key.upper(), granularity)
        zs_model = SummaCZS(granularity=granularity, model_name=model_key)
        scorers.append({"name": zs_name, "model": zs_model, "sign": 1})

        # Histogram scorer: load the checkpoint with the highest score recorded in its file name.
        model_files = glob.glob("/home/phillab/models/summac/%s_%s*" % (model_key, granularity))
        if not model_files:
            print("No model for [%s, %s] was found" % (model_key, granularity))
            continue
        ranked = sorted((path_to_model_info(mf) for mf in model_files), key=lambda m: m["acc"])
        best = ranked[-1]
        histo_name = "Histo-%s-%s" % (model_key.upper().replace("-", "_"), granularity)
        histo_model = SummaCHisto(
            bins=best["bins"],
            nli_labels=best["nli_labels"],
            models=[model_key],
            granularity=granularity,
            start_file=best["model_path"],
        )
        scorers.append({"name": histo_name, "model": histo_model, "sign": 1})

scorer_doc = ScorerWrapper(scorers, max_batch_size=20, use_caching=True)
print("%d scorers loaded" % (len(scorers)))

<All keys matched successfully>
<All keys matched successfully>
<All keys matched successfully>
<All keys matched successfully>
<All keys matched successfully>
<All keys matched successfully>
12 scorers loaded


In [4]:
# Fresh benchmark instance: the samples are read back for "pred_*" keys after scoring, so
# starting clean presumably keeps keys from a previous scorer set out — TODO confirm.
benchmark = SummaCBenchmark(cut="test")

# (model_type, nli_name, granularity) -> one balanced accuracy per benchmark dataset.
results = {}
for dataset in benchmark.tasks:
    print("======= %s ========" % (dataset["name"]))
    datas = dataset["task"]
    labels = [d["label"] for d in datas]
    utils_summac_benchmark.compute_doc_level(scorer_doc, datas)

    # Keep the per-scorer prediction keys only; anything tagged "total" is skipped.
    pred_labels = [key for key in datas[0].keys() if "pred_" in key and "total" not in key]
    for pred_label in pred_labels:
        preds = [d[pred_label] for d in datas]
        balanced_acc = sklearn.metrics.balanced_accuracy_score(labels, preds)
        # Key layout parsed here: "pred_<scorer name>|<input type>",
        # with scorer name "<type>-<nli>-<granularity>".
        model_name, input_type = pred_label.replace("pred_", "").split("|")
        model_type, nli_name, gran = model_name.split("-")
        results.setdefault((model_type, nli_name, gran), []).append(balanced_acc)

# Average each scorer's balanced accuracy across datasets.
cleaned_results = [
    {"nli_name": nli, "granularity": gran, "model_type": model_type, "score": np.mean(vs)}
    for (model_type, nli, gran), vs in results.items()
]

# Each group holds a single row, so "sum" just passes the mean through.
# NOTE: Styler.set_precision is deprecated in newer pandas releases.
summary = pd.DataFrame(cleaned_results).groupby(["nli_name", "granularity", "model_type"]).agg({"score": "sum"})
summary.style.set_precision(3).set_caption("Balanced Accuracy")

  0%|          | 0/400 [00:00<?, ?it/s]



Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another tas



100%|██████████| 1250/1250 [14:22<00:00,  1.45it/s]
  0%|          | 0/634 [00:00<?, ?it/s]



100%|██████████| 634/634 [23:39<00:00,  2.24s/it]
  0%|          | 0/503 [00:00<?, ?it/s]



100%|██████████| 503/503 [07:58<00:00,  1.05it/s]
  9%|▉         | 80/850 [00:00<00:01, 700.97it/s]



100%|██████████| 850/850 [00:01<00:00, 705.64it/s]
  0%|          | 0/1575 [00:00<?, ?it/s]



100%|██████████| 1575/1575 [30:29<00:00,  1.16s/it]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,score
nli_name,granularity,model_type,Unnamed: 3_level_1
MNLI,2sents,Histo,0.729
MNLI,2sents,ZS,0.697
MNLI,paragraph,Histo,0.623
MNLI,paragraph,ZS,0.621
MNLI,sentence,Histo,0.73
MNLI,sentence,ZS,0.703
VITC,2sents,Histo,0.732
VITC,2sents,ZS,0.714
VITC,paragraph,Histo,0.725
VITC,paragraph,ZS,0.706


In [5]:
# Ask every scorer's model to save its cache (presumably the cached NLI outputs,
# so later runs can skip recomputation — confirm in model_summac).
for scorer in scorers:
    scorer["model"].save_imager_cache()

# Table 6: SummaCZS Operator Choice

In [7]:
# One zero-shot scorer for every (op1, op2) pair drawn from min / mean / max: 3 x 3 = 9 scorers.
# (op1 and op2 are forwarded to SummaCZS unchanged; see model_summac for what each one aggregates.)
scorers = []
for op1 in ["min", "mean", "max"]:
    for op2 in ["min", "mean", "max"]:
        zs_model = SummaCZS(granularity="sentence", model_name="vitc", op1=op1, op2=op2)
        scorers.append({"name": "ZS-%s-%s" % (op1, op2), "model": zs_model, "sign": 1})

scorer_doc = ScorerWrapper(scorers, max_batch_size=20, use_caching=True)
print("%d scorers loaded" % (len(scorers)))

9 scorers loaded


In [8]:
# Fresh benchmark instance: the samples are read back for "pred_*" keys after scoring, so
# starting clean presumably keeps keys from a previous scorer set out — TODO confirm.
benchmark = SummaCBenchmark(cut="test")

# (op1, op2) -> one balanced accuracy per benchmark dataset.
results = {}
for dataset in benchmark.tasks:
    print("======= %s ========" % (dataset["name"]))
    datas = dataset["task"]
    labels = [d["label"] for d in datas]
    utils_summac_benchmark.compute_doc_level(scorer_doc, datas)

    # Keep the per-scorer prediction keys only; anything tagged "total" is skipped.
    pred_labels = [key for key in datas[0].keys() if "pred_" in key and "total" not in key]
    for pred_label in pred_labels:
        preds = [d[pred_label] for d in datas]
        balanced_acc = sklearn.metrics.balanced_accuracy_score(labels, preds)
        # Key layout parsed here: "pred_<scorer name>|<input type>",
        # with scorer name "<type>-<op1>-<op2>".
        model_name, input_type = pred_label.replace("pred_", "").split("|")
        model_type, op1, op2 = model_name.split("-")
        results.setdefault((op1, op2), []).append(balanced_acc)

# Average each operator pair's balanced accuracy across datasets.
cleaned_results = [
    {"op1": op1, "op2": op2, "score": np.mean(vs)}
    for (op1, op2), vs in results.items()
]

# Each (op1, op2) group holds a single row, so "sum" just passes the mean through.
# NOTE: Styler.set_precision is deprecated in newer pandas releases.
summary = pd.DataFrame(cleaned_results).groupby(["op1", "op2"]).agg({"score": "sum"})
summary.style.set_precision(3).set_caption("Balanced Accuracy")

100%|██████████| 400/400 [00:00<00:00, 4027.15it/s]




 38%|███▊      | 480/1250 [00:00<00:00, 4655.20it/s]



100%|██████████| 1250/1250 [00:00<00:00, 4622.31it/s]
100%|██████████| 634/634 [00:00<00:00, 4340.38it/s]




100%|██████████| 503/503 [00:00<00:00, 4521.10it/s]




 52%|█████▏    | 440/850 [00:00<00:00, 4396.31it/s]



100%|██████████| 850/850 [00:00<00:00, 3651.48it/s]
 30%|███       | 480/1575 [00:00<00:00, 4628.48it/s]



100%|██████████| 1575/1575 [00:00<00:00, 4369.67it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,score
op1,op2,Unnamed: 2_level_1
max,max,0.691
max,mean,0.718
max,min,0.72
mean,max,0.62
mean,mean,0.628
mean,min,0.605
min,max,0.574
min,mean,0.557
min,min,0.531


# Max Doc Sents: Effect of the `max_doc_sents` Limit on SummaCZS

In [5]:
# One zero-shot scorer per candidate value of max_doc_sents; the value is embedded in the
# scorer name ("mds-<value>") so the evaluation cell can recover it.
scorers = []
for max_doc_sents in [1, 2, 4, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 100]:
    zs_model = SummaCZS(granularity="sentence", model_name="vitc", max_doc_sents=max_doc_sents)
    scorers.append({"name": "mds-%d" % (max_doc_sents), "model": zs_model, "sign": 1})

scorer_doc = ScorerWrapper(scorers, max_batch_size=20, use_caching=True)
print("%d scorers loaded" % (len(scorers)))

14 scorers loaded


In [6]:
# Unlike the table cells, this sweep runs on the "val" cut of the benchmark.
benchmark = SummaCBenchmark(cut="val")

# max_doc_sents (as parsed from the scorer name, i.e. a string) -> one balanced accuracy per dataset.
results = {}
for dataset in benchmark.tasks:
    print("======= %s ========" % (dataset["name"]))
    datas = dataset["task"]
    labels = [d["label"] for d in datas]
    utils_summac_benchmark.compute_doc_level(scorer_doc, datas)

    # Keep the per-scorer prediction keys only; anything tagged "total" is skipped.
    pred_labels = [key for key in datas[0].keys() if "pred_" in key and "total" not in key]
    for pred_label in pred_labels:
        preds = [d[pred_label] for d in datas]
        balanced_acc = sklearn.metrics.balanced_accuracy_score(labels, preds)
        # Key layout parsed here: "pred_<scorer name>|<input type>", scorer name "mds-<value>".
        model_name, input_type = pred_label.replace("pred_", "").split("|")
        model_type, max_doc_sents = model_name.split("-")
        results.setdefault(max_doc_sents, []).append(balanced_acc)

# Average across datasets; the key is cast to int so the table sorts numerically.
cleaned_results = [
    {"max_doc_sents": int(max_doc_sents), "score": np.mean(vs)}
    for max_doc_sents, vs in results.items()
]

# Each max_doc_sents group holds a single row, so "sum" just passes the mean through.
# NOTE: Styler.set_precision is deprecated in newer pandas releases.
summary = pd.DataFrame(cleaned_results).groupby(["max_doc_sents"]).agg({"score": "sum"})
summary.style.set_precision(3).set_caption("Balanced Accuracy")

 45%|████▌     | 420/931 [00:00<00:00, 4078.50it/s]



100%|██████████| 931/931 [00:00<00:00, 4119.69it/s]
 39%|███▊      | 260/671 [00:00<00:00, 2449.49it/s]



100%|██████████| 671/671 [00:00<00:00, 2742.45it/s]
100%|██████████| 634/634 [00:00<00:00, 4130.34it/s]




 20%|██        | 260/1281 [00:00<00:00, 2495.64it/s]



100%|██████████| 1281/1281 [00:00<00:00, 3604.15it/s]
 49%|████▉     | 420/850 [00:00<00:00, 4095.00it/s]



100%|██████████| 850/850 [00:00<00:00, 3507.40it/s]
 35%|███▌      | 440/1250 [00:00<00:00, 4315.12it/s]



100%|██████████| 1250/1250 [00:00<00:00, 4264.95it/s]


Unnamed: 0_level_0,score
max_doc_sents,Unnamed: 1_level_1
1,0.57
2,0.623
4,0.665
5,0.674
10,0.686
15,0.69
20,0.689
25,0.69
30,0.689
35,0.689
