In [2]:
from collections import Counter, namedtuple
from functools import lru_cache

import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.util import ngrams
from typing import Union, Dict, List
# Make sure the NLTK "punkt" tokenizer resource is installed; download it when
# the lookup fails.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    # TODO(odashi): Avoid programmatic download: it requires an unnecessary
    # outbound connection and won't work on offline systems.
    nltk.download("punkt")


class SUMAttribute:
    """Calculate reference-free attributes of a (source text, summary) pair.

    ``cal_attributes_each`` reports the following attributes (every key in the
    returned dict is prefixed with ``attr_``):
    * source_len
    * hypothesis_len
    * density
    * coverage
    * compression
    * repetition
    * novelty
    * copy_len
    """

    # TODO(odashi): Use dataclass instead.
    # One extractive fragment: start index in the summary, start index in the
    # text, and the number of consecutive matching tokens.
    Match = namedtuple("Match", ("summary", "text", "length"))

    def __call__(self, texts: List[str], summaries: List[str]) -> List[dict]:
        """Calculate attributes of each pair of text and summary.

        Pairs are formed with ``zip``, so surplus items in the longer list are
        ignored.

        Args:
            texts: a list of source documents.
            summaries: a list of generated summaries.

        Returns:
            A list of dicts with attributes, one independent dict per pair.
        """
        # cal_attributes_each is memoized, so identical pairs would otherwise
        # share a single dict object; copy so callers can mutate each result
        # without affecting the others or the cache.
        return [
            dict(self.cal_attributes_each(text, summary))
            for text, summary in zip(texts, summaries)
        ]

    @lru_cache(maxsize=10)
    def cal_attributes_each(
        self, text: str, summary: str
    ) -> Dict[str, Union[int, float]]:
        """Calculate all attributes for a single text/summary pair.

        Note:
            The result is memoized by ``lru_cache`` keyed on
            ``(self, text, summary)``. The returned dict is the cached object
            itself, so treat it as read-only (or copy it) when calling this
            method directly. The cache also keeps a reference to ``self``.

        Args:
            text: The source text.
            summary: The summary.

        Returns:
            A dict mapping ``attr_density``, ``attr_coverage``,
            ``attr_compression``, ``attr_repetition``, ``attr_novelty``,
            ``attr_copy_len``, ``attr_source_len`` and ``attr_hypothesis_len``
            to their values.
        """
        # Lower-cased token sequences used for fragment matching and lengths.
        source_tokens = [str(token).lower() for token in word_tokenize(text)]
        summary_tokens = [str(token).lower() for token in word_tokenize(summary)]

        matches = self.overlap(summary_tokens, source_tokens)
        copy_lens = [match.length for match in matches]
        summary_len = len(summary_tokens)

        if summary_len == 0:
            # Avoid dividing by zero for an empty summary.
            density, coverage, compression = 0.0, 0.0, 0.0
        else:
            # Density: mean squared fragment length per summary token.
            density = sum(float(length) ** 2 for length in copy_lens) / summary_len
            # Coverage: fraction of summary tokens inside a matched fragment.
            coverage = sum(float(length) for length in copy_lens) / summary_len
            # Compression: source length divided by summary length.
            compression = float(len(source_tokens)) / summary_len

        # Copy length: average fragment length (0.0 when nothing matched).
        copy_len = sum(copy_lens) / len(copy_lens) if copy_lens else 0.0

        return {
            "attr_density": density,
            "attr_coverage": coverage,
            "attr_compression": compression,
            "attr_repetition": self.cal_repetition(summary),
            "attr_novelty": self.cal_novelty(text, summary),
            "attr_copy_len": copy_len,
            "attr_source_len": len(source_tokens),
            "attr_hypothesis_len": len(summary_tokens),
        }

    def _get_ngrams(self, doc: str, n: int) -> list:
        """Return the n-grams of ``doc``, collected sentence by sentence.

        The document is lower-cased and split into sentences first, so no
        n-gram spans a sentence boundary.

        Args:
            doc: The document.
            n: The order of the n-grams.

        Returns:
            A list of n-gram tuples in document order.
        """
        collected = []
        for sentence in sent_tokenize(doc.lower()):
            collected.extend(ngrams(word_tokenize(sentence), n=n))
        return collected

    def cal_novelty(self, text: str, summary: str, n: int = 2) -> float:
        """Returns the novelty score.

        Novelty is the proportion of segments in the summaries that have not
        appeared in source documents. The segments are instantiated as n-grams.

        Args:
            text: The text.
            summary: The summary.
            n: The order of n-grams used in novelty calculation.

        Returns:
            The ratio of novel n-grams in the summary, or 0.0 when the summary
            yields no n-grams.
        """
        summary_ngrams = self._get_ngrams(summary, n=n)
        if not summary_ngrams:
            # Always return a float, as the signature promises.
            return 0.0
        text_ngrams = set(self._get_ngrams(text, n=n))
        novel = sum(1 for gram in summary_ngrams if gram not in text_ngrams)
        return novel / len(summary_ngrams)

    def cal_repetition(self, summary: str, n: int = 3) -> float:
        """Return the ratio of repeated segments in the summary.

        Every occurrence of an n-gram after its first one counts as a
        repetition.

        Args:
            summary: The summary.
            n: The length of the n-grams to be used in the calculation.

        Returns:
            The number of repeated n-gram occurrences divided by the total
            number of n-grams, or 0.0 when the summary yields no n-grams.
        """
        summary_ngrams = self._get_ngrams(summary, n=n)
        if not summary_ngrams:
            # Always return a float, as the signature promises.
            return 0.0
        counter: Counter = Counter(summary_ngrams)
        repeated = sum(count - 1 for count in counter.values())
        return repeated / len(summary_ngrams)

    def overlap(self, summary: List[str], text: List[str]) -> List[Match]:
        """Return a list of Match objects between summary and text.

        This is a list of named tuples of the form (summary, text, length):
            - summary (int): the start index of the match in the summary
            - text (int): the start index of the match in the reference
            - length (int): the length of the extractive fragment

        The summary is scanned left to right. At each position the longest
        run of tokens shared with the text is taken greedily (the earliest
        such run wins ties), and the scan resumes right after it; a token
        with no match in the text is skipped.

        Args:
            summary: the summary
            text: the text

        Returns:
            A list of Match objects indicating matches between the summary and text.
        """
        matches = []
        summary_start = 0
        while summary_start < len(summary):
            best_match = None
            best_match_length = 0
            text_start = 0
            while text_start < len(text):
                if summary[summary_start] != text[text_start]:
                    text_start += 1
                    continue
                # Extend the shared run as far as both sequences agree.
                summary_end = summary_start
                text_end = text_start
                while (
                    summary_end < len(summary)
                    and text_end < len(text)
                    and text[text_end] == summary[summary_end]
                ):
                    text_end += 1
                    summary_end += 1
                length = summary_end - summary_start
                if length > best_match_length:
                    best_match = SUMAttribute.Match(
                        summary_start, text_start, length
                    )
                    best_match_length = length
                # Continue searching the text after this run.
                text_start = text_end
            if best_match is None:
                summary_start += 1
            else:
                # A recorded match always has length >= 1.
                matches.append(best_match)
                summary_start += best_match_length
        return matches

In [8]:
# from bert_score import score
def process(x):
    """Re-tokenize ``x`` and split the result into sentences.

    The input is stripped, word-tokenized, re-joined with single spaces, and
    the space-joined string is then handed to the sentence tokenizer.

    Args:
        x: The raw text.

    Returns:
        The output of ``sent_tokenize`` for the space-joined tokens.
    """
    stripped = x.strip()
    spaced = " ".join(word_tokenize(stripped))
    return sent_tokenize(spaced)
d, r = [], []
# System outputs and gold summaries, one re-tokenized string per sample.
predict = []
reference = []
# rouge_scorer = RougeScorer(['rouge1', 'rouge2', 'rougeLsum'], use_stemmer=True)
# rouge1, rouge2, rougeLsum = 0, 0, 0
name = "cache_06-14-11-06-1718334387_transformer_NN_VB_JJ_RB_CD_again_continue_3_model_cur1"
for cnt in range(11490):
    # Sample `cnt` has one candidate (.dec) file and one reference (.ref) file.
    with open("./result/%s/candidate_ranking/%d.dec" % (name, cnt), "r") as dec, \
            open("./result/%s/reference_ranking/%d.ref" % (name, cnt), "r") as ref:
        # Flatten newlines, then normalize tokenization via process().
        x = process(ref.read().replace("\n", " "))
        y = process(dec.read().replace("\n", " "))
        predict.append(" ".join(y))
        reference.append(" ".join(x))
        # score = rouge_scorer.score("\n".join(x), "\n".join(y))
        # rouge1 += score["rouge1"].fmeasure
        # rouge2 += score["rouge2"].fmeasure
        # rougeLsum += score["rougeLsum"].fmeasure
        # break
# rouge1 = rouge1 / (cnt+1)
# rouge2 = rouge2 / (cnt+1)
# rougeLsum = rougeLsum / (cnt+1)
# print("ranking rouge1: %.6f, rouge2: %.6f, rougeL: %.6f"%(rouge1, rouge2, rougeLsum))


In [4]:
from datasets import load_from_disk
# Load the dataset saved at this path and keep only its "test" split.
# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
test = load_from_disk("/mnt/nas4/m11115088/WordRank/Dataset/cnndm_gpt_all")["test"]


  from .autonotebook import tqdm as notebook_tqdm


In [40]:
from tqdm import tqdm
import statistics
# Mean unigram (n=1) and bigram (n=2) novelty of each prediction against its
# source document.
# NOTE(review): assumes test['document'] and predict are aligned by position
# and equally long (zip stops at the shorter one) — confirm.
# NOTE(review): the recorded output shows identical means for n=1 and n=2,
# which looks like stale kernel state — re-run on a fresh kernel to confirm.
x = SUMAttribute()
res_1 = []
res_2 = []
for d, p in tqdm(zip(test['document'], predict)):
    res_1.append(x.cal_novelty(d, p, 1))
    res_2.append(x.cal_novelty(d, p, 2))
(statistics.mean(res_1), statistics.mean(res_2))

11490it [02:00, 95.29it/s]


(0.0467482810617267, 0.0467482810617267)

In [5]:
from BARTScore.bart_score import BARTScorer
import statistics
import evaluate
# Load the three metrics by name through the `evaluate` library.
bertscore = evaluate.load("bertscore")
meteor = evaluate.load('meteor')
bleu = evaluate.load("bleu")
# BARTScore scorer built from the facebook/bart-large-cnn checkpoint on the
# second CUDA device.
# NOTE(review): 'cuda:1' is hard-coded — this fails on machines with fewer than two GPUs.
bart_scorer = BARTScorer(device='cuda:1', checkpoint='facebook/bart-large-cnn')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/m11115088/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/m11115088/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
# Score the loaded predictions against the references with METEOR, BLEU and
# BARTScore. The BERTScore computation below is currently disabled.
# bertscore_result = bertscore.compute(predictions=predict, references=reference, lang="en",verbose=True)
# print("bertscore: ",statistics.mean(bertscore_result['f1']),statistics.mean(bertscore_result['precision']),statistics.mean(bertscore_result['recall']))
results = meteor.compute(predictions=predict, references=reference)
print("meteor: ",results)
# `results` is reused for the BLEU output; the METEOR value is only kept in the printout.
results = bleu.compute(predictions=predict, references=reference)
print("bleu: ",results)
# NOTE(review): per the inline comment, the score runs from the first list to
# the second (predictions -> references); confirm this is the intended direction.
result = bart_scorer.score(predict, reference, batch_size=8) # generation scores from the first list of texts to the second list of texts.
# Report the mean BARTScore over all samples.
print("bartscore: ",statistics.mean(result))

meteor:  {'meteor': 0.3215493205858537}
bleu:  {'bleu': 0.09019974870068764, 'precisions': [0.38704816592407415, 0.12930199086982871, 0.05387568620582575, 0.024550382899373457], 'brevity_penalty': 1.0, 'length_ratio': 1.1524181493221255, 'translation_length': 953641, 'reference_length': 827513}
bartscore:  -3.6158353603436075


In [54]:
import pandas as pd
# Read the model's generated predictions from JSON into a DataFrame.
all_data_pd = pd.read_json(path_or_buf="/mnt/nas4/m11115088/WordRank/CNNDM_Models_generate/LLM_Teached_BART_CNNDM/generated_predictions.json")
# all_data_pd = pd.read_json(path_or_buf="cnndm_test_gpt4_turbo_modify.json", lines=True)
# Reload the "test" split (rebinding `test`) and convert it to a DataFrame.
test = load_from_disk("/mnt/nas4/m11115088/WordRank/Dataset/cnndm_gpt_all")['test']
test_pd = pd.DataFrame(test)
# Attach the generated summaries as a new column.
# NOTE(review): the assignment aligns on the DataFrame index; presumably both
# frames share row order and length — verify, otherwise rows become NaN.
test_pd['model_summary'] = all_data_pd['summary']

In [55]:
# BERTScore of the generated summaries against the dataset's 'summary' column.
bertscore_result = bertscore.compute(predictions=test_pd['model_summary'].to_list(), references=test_pd['summary'].to_list(), lang="en",verbose=True)
import statistics
# Display the mean F1, precision and recall over all samples, in that order.
statistics.mean(bertscore_result['f1']),statistics.mean(bertscore_result['precision']),statistics.mean(bertscore_result['recall'])

calculating scores...
computing bert embedding.


100%|██████████| 360/360 [01:43<00:00,  3.49it/s]


computing greedy matching.


100%|██████████| 180/180 [00:03<00:00, 47.57it/s]


done in 15451561.83 seconds, 0.00 sentences/sec


(0.9044668506704485, 0.9033925881481254, 0.9058175513389528)