In [4]:
from collections import Counter, namedtuple
from functools import lru_cache

import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.util import ngrams
from typing import Union, Dict, List
# Look up the NLTK "punkt" resource and download it when the lookup fails.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    # TODO(odashi): Avoid programmatic download: it requires an unnecessary outbound
    # connection and won't work in offline systems.
    nltk.download("punkt")


class SUMAttribute:
    """Compute several attributes of a (source text, summary) pair.

    These attributes are all reference free:
    * source_len
    * hypothesis_len
    * density
    * coverage
    * compression
    * repetition
    * novelty
    * copy_len
    """

    # TODO(odashi): Use dataclass instead.
    # One matched fragment: start index in the summary, start index in the text,
    # and the fragment length (all measured in tokens).
    Match = namedtuple("Match", ("summary", "text", "length"))

    def __call__(self, texts: List[str], summaries: List[str]) -> List[dict]:
        """Calculate attributes of each pair of text and summary.

        Pairs are formed with ``zip``, so if the two lists differ in length the
        extra items of the longer one are ignored.

        Args:
            texts: a list of source documents.
            summaries: a list of generated summaries.

        Returns:
            A list of dicts with attributes, one independent dict per pair.
        """
        # cal_attributes_each is memoised, so a repeated (text, summary) pair would
        # otherwise place the very same dict object in the output more than once.
        # Copy it so that mutating one entry cannot affect another entry or the cache.
        return [
            dict(self.cal_attributes_each(text, summary))
            for text, summary in zip(texts, summaries)
        ]

    @lru_cache(maxsize=10)
    def cal_attributes_each(self, text: str, summary: str) -> Dict[str, Union[int, float]]:
        """For a single instance, calculate the attributes of a text/summary pair.

        The result is memoised with ``lru_cache``: calling this method again with
        the same arguments returns the same dict object, so callers that need to
        modify the result should copy it first.

        Args:
            text: The text.
            summary: The summary.

        Returns:
            A dict with the keys ``attr_density``, ``attr_coverage``,
            ``attr_compression``, ``attr_repetition``, ``attr_novelty``,
            ``attr_copy_len``, ``attr_source_len`` and ``attr_hypothesis_len``.
        """
        # Normalize text
        tokenized_text = word_tokenize(text)
        tokenized_summary = word_tokenize(summary)
        normalized_text = [str(t).lower() for t in tokenized_text]
        normalized_summary = [str(t).lower() for t in tokenized_summary]

        # Calculate matches
        matches = self.overlap(normalized_summary, normalized_text)
        summary_len = len(tokenized_summary)

        if summary_len == 0:
            # An empty summary would divide by zero below.
            density, coverage, compression = 0.0, 0.0, 0.0
        else:
            # Density: mean squared fragment length per summary token.
            density = sum(float(o.length) ** 2 for o in matches) / summary_len
            # Coverage: fraction of summary tokens that lie inside a fragment.
            coverage = sum(float(o.length) for o in matches) / summary_len
            # Compression: source tokens per summary token.
            compression = float(len(tokenized_text)) / summary_len

        # Repetition
        repetition = self.cal_repetition(summary)
        # Novelty
        novelty = self.cal_novelty(text, summary)

        # Copy length: average length of the matched fragments.
        copy_lens = [o.length for o in matches]
        if len(copy_lens) == 0:
            copy_len = 0.0
        else:
            copy_len = sum(copy_lens) / len(copy_lens)
        return {
            "attr_density": density,
            "attr_coverage": coverage,
            "attr_compression": compression,
            "attr_repetition": repetition,
            "attr_novelty": novelty,
            "attr_copy_len": copy_len,
            "attr_source_len": len(normalized_text),
            "attr_hypothesis_len": len(normalized_summary),
        }

    def _get_ngrams(self, doc: str, n: int) -> List[tuple]:
        """Return the lower-cased word n-grams of ``doc``, sentence by sentence.

        N-grams are built inside each sentence, so none spans a sentence boundary.
        """
        doc = doc.lower()
        doc_sents = sent_tokenize(doc)
        _ngrams = []
        for sent in doc_sents:
            sent = word_tokenize(sent)
            _ngrams.extend(list(ngrams(sent, n=n)))
        return _ngrams

    def cal_novelty(self, text: str, summary: str, n: int = 2) -> float:
        """Returns the novelty score.

        Novelty is the proportion of segments in the summary that do not appear in
        the source document. The segments are instantiated as n-grams.

        Args:
            text: The text.
            summary: The summary.
            n: The order of n-grams used in novelty calculation.

        Returns:
            The ratio of novel n-grams in the summary, or 0.0 when the summary
            yields no n-grams.
        """
        text_ngrams = set(self._get_ngrams(text, n=n))
        summary_ngrams = self._get_ngrams(summary, n=n)
        if not summary_ngrams:
            # Return a float so the result always matches the annotation.
            return 0.0
        # Every occurrence counts, so a novel n-gram used twice counts twice.
        cnt_nov = sum(1 for gram in summary_ngrams if gram not in text_ngrams)
        return cnt_nov / len(summary_ngrams)

    def cal_repetition(self, summary: str, n: int = 3) -> float:
        """Return the ratio of repeated segments in the summary.

        Args:
            summary: The summary.
            n: The length of the n-grams to be used in the calculation.

        Returns:
            The number of repeated n-gram occurrences (every occurrence after the
            first) divided by the total number of n-grams, or 0.0 when the summary
            yields no n-grams.
        """
        counter: Counter = Counter(self._get_ngrams(summary, n=n))
        cnt_all = sum(counter.values())
        if cnt_all == 0:
            # Return a float so the result always matches the annotation.
            return 0.0
        cnt_rep = sum(v - 1 for v in counter.values() if v >= 2)
        return cnt_rep / cnt_all

    def overlap(self, summary: List[str], text: List[str]) -> List[Match]:
        """Return a list of Match objects between summary and text.

        This is a list of named tuples of the form (summary, text, length):
            - summary (int): the start index of the match in the summary
            - text (int): the start index of the match in the reference
            - length (int): the length of the extractive fragment

        The search is greedy: at each summary position the longest fragment shared
        with the text is taken and the summary position then jumps past it.

        Args:
            summary: the summary
            text: the text

        Returns:
            A list of Match objects indicating matches between the summary and text.
        """
        matches = []
        summary_start = 0
        text_start = 0
        while summary_start < len(summary):
            best_match = None
            best_match_length = 0
            while text_start < len(text):
                if summary[summary_start] == text[text_start]:
                    # Extend the fragment for as long as both sides keep agreeing.
                    summary_end = summary_start
                    text_end = text_start
                    while (
                        summary_end < len(summary)
                        and text_end < len(text)
                        and text[text_end] == summary[summary_end]
                    ):
                        text_end += 1
                        summary_end += 1
                    length = summary_end - summary_start
                    if length > best_match_length:
                        best_match = SUMAttribute.Match(
                            summary_start, text_start, length
                        )
                        best_match_length = length
                    # Resume scanning the text after the fragment just examined.
                    text_start = text_end
                else:
                    text_start += 1
            text_start = 0
            if best_match is not None:
                # A recorded match always has length >= 1, so this always advances.
                matches.append(best_match)
                summary_start += best_match_length
            else:
                summary_start += 1
        return matches

In [5]:
from compare_mt.rouge.rouge_scorer import RougeScorer
# from bert_score import score
def process(x):
    """Re-tokenise ``x`` and return it as a list of sentence strings.

    The stripped input is word-tokenised, the tokens are re-joined with single
    spaces, and the result is split into sentences.

    NOTE(review): this helper is defined identically in several cells of this
    notebook; the most recently executed definition is the one in effect.
    """
    tokens = word_tokenize(x.strip())
    spaced_text = " ".join(tokens)
    return sent_tokenize(spaced_text)
from statistics import mean

d, r = [], []
predict = []
reference = []
rouge_scorer = RougeScorer(['rouge1', 'rouge2', 'rougeLsum'], use_stemmer=True)

# Run directories under ./result/ to score.
# name_wordrank="cache_06-26-23-21-1719415302_transformer_cnn_origin_SCAN_model_cur3"
name_wordrank = "cache_06-28-21-03-1719579831_pegasus_origin_norm_xsum_SCAN_again_model_cur9"
name_origin = "_mnt_nas4_m11115088_WordRank_CNN_Model_XSUM_BART_100K"

# Per-sample ROUGE F1 scores. The *_origin lists stay empty unless the
# commented-out baseline block below is re-enabled.
R1_wordrank, R2_wordrank, RL_wordrank = [], [], []
R1_origin, R2_origin, RL_origin = [], [], []

# Sample counts used so far: 11490 and 11334.
for cnt in range(11334):
    dec_path = "./result/%s/candidate_ranking/%d.dec"%(name_wordrank, cnt)
    ref_path = "./result/%s/reference_ranking/%d.ref"%(name_wordrank, cnt)
    with open(dec_path, "r") as dec, open(ref_path, "r") as ref:
        x = process(ref.read().replace("\n", " "))
        y = process(dec.read().replace("\n", " "))
    predict.append(" ".join(y))
    reference.append(" ".join(x))
    # One sentence per line; the reference is passed first, the candidate second.
    score = rouge_scorer.score("\n".join(x), "\n".join(y))
    rouge1 = score["rouge1"].fmeasure
    rouge2 = score["rouge2"].fmeasure
    rougeLsum = score["rougeLsum"].fmeasure
    R1_wordrank.append(rouge1)
    R2_wordrank.append(rouge2)
    RL_wordrank.append(rougeLsum)
    # Baseline scoring (disabled):
    # with open("./result/%s/candidate_ranking/%d.dec"%(name_origin, cnt), "r") as dec:
    #     with open("./result/%s/reference_ranking/%d.ref"%(name_origin, cnt), "r") as ref:
    #         x=process(ref.read().replace("\n"," "))
    #         y=process(dec.read().replace("\n"," "))
    #         score = rouge_scorer.score("\n".join(x), "\n".join(y))
    #         R1_origin.append(score["rouge1"].fmeasure)
    #         R2_origin.append(score["rouge2"].fmeasure)
    #         RL_origin.append(score["rougeLsum"].fmeasure)

# Mean ROUGE-1 / ROUGE-2 / ROUGE-Lsum F1 and the number of scored samples.
mean(R1_wordrank), mean(R2_wordrank), mean(RL_wordrank), len(R1_wordrank)


KeyboardInterrupt: 

In [None]:
import pandas as pd

# Collect the per-sample ROUGE F1 lists into one table, one column per list.
f = {
    'R1_wordrank': R1_wordrank,
    'R2_wordrank': R2_wordrank,
    'RL_wordrank': RL_wordrank,
    'R1_origin': R1_origin,
    'R2_origin': R2_origin,
    'RL_origin': RL_origin,
}
# pd.DataFrame(dict_of_lists) raises ValueError when the lists differ in length,
# which happens whenever the *_origin lists were not filled by the scoring cell.
# Wrapping each list in a Series pads the shorter columns with NaN instead; with
# equal-length float lists the written file is unchanged.
# NOTE(review): the file is tab-separated although it is named ".csv".
pd.DataFrame(
    {name: pd.Series(scores, dtype="float64") for name, scores in f.items()}
).to_csv("transformer_rouge.csv", sep='\t', index=False)

In [None]:
from statistics import mean

# Mean ROUGE-1 / ROUGE-2 / ROUGE-Lsum F1 of the wordrank run, shown as a tuple.
tuple(mean(scores) for scores in (R1_wordrank, R2_wordrank, RL_wordrank))
# mean(R1_origin),mean(R2_origin),mean(RL_origin)

(0.46329056341186303, 0.20869133813897192, 0.39104259125376156)

In [None]:
# Import what this cell needs itself: in the original cell order `load_from_disk`
# is only imported by a later cell, so a fresh kernel would fail here.
import pandas as pd
from datasets import load_from_disk

# Test split of the on-disk dataset as a DataFrame, displayed as the cell output.
x = pd.DataFrame(load_from_disk("/mnt/nas4/m11115088/WordRank/Dataset/cnndm_gpt_all")['test'])
x

Unnamed: 0,document,summary,id
0,(CNN)The Palestinian Authority officially beca...,The Palestinian Authority joined the ICC as th...,f001ec5c4704938247d27a44948eebb37ae98d01
1,(CNN)Never mind cats having nine lives. A stra...,A resilient dog named Theia survived being hit...,230c522854991d053fe98a718b1defa077a8efef
2,"(CNN)If you've been following the news lately,...","Iranian Foreign Minister Mohammad Javad Zarif,...",4495ba8f3a340d97a9df1476f8a35502bcce1f69
3,(CNN)Five Americans who were monitored for thr...,Five Americans exposed to Ebola in Sierra Leon...,a38e72fed88684ec8d60dd5856282e999dc8c0ca
4,(CNN)A Duke student has admitted to hanging a ...,A Duke University student admitted to hanging ...,c27cf1b136cc270023de959e7ab24638021bc43f
...,...,...,...
11485,Angus Hawley's brother has spoken of his shock...,"Angus Hawley, Antonia Kidman's ex-husband, die...",b4a1738c4a0acdf3d189264a0927005aa5b856d6
11486,"(CNN)Tornadoes, fierce winds and severe thunde...",Severe storms with potential tornadoes forecas...,ac5f7eb411c744562e9ff10c537288b3a125610c
11487,"(CNN)Tornadoes, fierce winds and severe thunde...",The Midwest and Plains brace for tornadoes and...,5c0c00dcd3a745b98a380431609355658b9e4163
11488,Defiant Nigel Farage today insisted he did not...,UKIP leader Nigel Farage disputed the composit...,5e7b53548bb3a7998b9ed67d23bb3507ecef8ee0


In [None]:
import json
import statistics

import evaluate
import pandas as pd
from datasets import load_dataset, load_from_disk

# Alternative source for `x` (disabled):
# with open('LLAMA_generate/llama3_finish_cnndm_test.json', 'r') as file:
#     x= pd.DataFrame(json.load(file))

# Join the frame `x` (built in an earlier cell) with the CNN/DailyMail test split
# on `id`, so each row carries both a `summary` and the dataset's `highlights`.
d = pd.DataFrame(load_dataset('cnn_dailymail', "3.0.0")['test'])
m = x.merge(d, on=['id'])

bertscore = evaluate.load("bertscore")
reference = m['highlights'].to_list()
predict = m['summary'].to_list()
bertscore_result = bertscore.compute(
    predictions=predict,
    references=reference,
    lang="en",
    rescale_with_baseline=True,
    verbose=True,
    device="cuda:2",
)
# Printed in the order: F1, precision, recall.
print(
    "bertscore: ",
    statistics.mean(bertscore_result['f1']),
    statistics.mean(bertscore_result['precision']),
    statistics.mean(bertscore_result['recall']),
)


calculating scores...
computing bert embedding.


100%|██████████| 359/359 [01:40<00:00,  3.58it/s]


computing greedy matching.


100%|██████████| 180/180 [00:05<00:00, 30.88it/s]


done in 166499.63 seconds, 0.07 sentences/sec
bertscore:  0.23565260955108025 0.2461637729478197 0.22314966794462965


In [None]:
# Display the object bound to `dataset`.
# NOTE(review): no visible cell assigns `dataset`; this relies on leftover kernel state.
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [None]:
# Element-wise log(u) - log(1 - u).
# NOTE(review): neither `torch` nor `u` is imported/assigned in any visible cell.
torch.log(u) - torch.log(1 - u)

tensor([-0.6536,  0.2148,  0.1005, -0.8787])

In [None]:
from scipy.stats import ttest_ind, ttest_rel

# Independent t-test (disabled):
# t_stat, p_value = ttest_ind(RL_origin, RL_wordrank)
# print("Independent t-test p-value:", p_value)

# Paired t-test per metric, in the order ROUGE-1, ROUGE-2, ROUGE-L.
# ttest_rel needs both lists of a pair to have the same length.
for origin_scores, wordrank_scores in (
    (R1_origin, R1_wordrank),
    (R2_origin, R2_wordrank),
    (RL_origin, RL_wordrank),
):
    _, p_value = ttest_rel(origin_scores, wordrank_scores)
    print("Paired t-test p-value:", p_value)

Paired t-test p-value: 2.531732542537365e-07
Paired t-test p-value: 3.304193406707968e-09
Paired t-test p-value: 0.04995956616403459


In [None]:
# Number of baseline ROUGE-2 scores currently held in the kernel.
len(R2_origin)

11490

In [None]:
# Independent two-sample t-test per metric, in the order ROUGE-1, ROUGE-2, ROUGE-L.
# NOTE(review): if both lists score the same samples, the paired test in the
# earlier cell is presumably the appropriate one - confirm.
for origin_scores, wordrank_scores in (
    (R1_origin, R1_wordrank),
    (R2_origin, R2_wordrank),
    (RL_origin, RL_wordrank),
):
    _, p_value = ttest_ind(origin_scores, wordrank_scores)
    print("ind t-test p-value:", p_value)

ind t-test p-value: 0.011195688562843906
ind t-test p-value: 2.0473962145530893e-06
ind t-test p-value: 1.3474440296486458e-10


In [None]:
# Dump both per-sample ROUGE-2 lists in full.
# NOTE(review): this prints every score; a slice would keep the output readable.
print(R2_origin)
print(R2_wordrank)

[0.1590909090909091, 0.12173913043478263, 0.15, 0.22641509433962265, 0.20618556701030927, 0.15000000000000002, 0.10852713178294573, 0.2990654205607477, 0.25, 0.21176470588235297, 0.18181818181818182, 0.10638297872340426, 0.15384615384615383, 0.25, 0.1038961038961039, 0.14678899082568808, 0.13223140495867766, 0.17741935483870966, 0.3218390804597701, 0.16279069767441862, 0.31111111111111117, 0.23611111111111108, 0.0975609756097561, 0.09523809523809525, 0.14893617021276595, 0.09677419354838711, 0.2857142857142857, 0.05405405405405406, 0.17391304347826086, 0.3287671232876712, 0.16417910447761194, 0.14, 0.14, 0.19718309859154928, 0.14035087719298245, 0.23529411764705885, 0.1090909090909091, 0.14414414414414417, 0.18867924528301885, 0.22448979591836735, 0.05555555555555555, 0.15384615384615385, 0.271604938271605, 0.3214285714285714, 0.29545454545454547, 0.3098591549295775, 0.08823529411764705, 0.15999999999999998, 0.1492537313432836, 0.1558441558441558, 0.16216216216216214, 0.175438596491228

In [None]:
from scipy.stats import ttest_ind, ttest_rel

# Toy example 1: independent t-test on two unrelated groups.
data1 = [20, 22, 19, 20, 21]  # Group A
data2 = [30, 32, 29, 30, 31]  # Group B
independent_result = ttest_ind(data1, data2)
t_stat, p_value = independent_result.statistic, independent_result.pvalue
print("Independent t-test p-value:", p_value)

# Toy example 2: paired t-test on the same subjects measured twice.
before = [210, 200, 214, 198, 223]  # Before treatment
after = [180, 182, 177, 190, 185]   # After treatment
paired_result = ttest_rel(after, before)
t_stat, p_value = paired_result.statistic, paired_result.pvalue
print("Paired t-test p-value:", p_value)


Independent t-test p-value: 7.071086608820127e-07
Paired t-test p-value: 0.010571463889056906


In [None]:
from datasets import load_from_disk
# Test split of the dataset stored on disk; later cells read test['document'].
test = load_from_disk("/mnt/nas4/m11115088/WordRank/Dataset/cnndm_gpt_all")["test"]


In [None]:
from tqdm import tqdm
import statistics

# Unigram (n=1) and bigram (n=2) novelty of every prediction against its document.
# zip() stops at the shorter input, so a short `predict` silently shrinks the sample.
x = SUMAttribute()
novelty_by_order = {1: [], 2: []}
for d, p in tqdm(zip(test['document'], predict)):
    for order, scores in novelty_by_order.items():
        scores.append(x.cal_novelty(text=d, summary=p, n=order))
res_1 = novelty_by_order[1]
res_2 = novelty_by_order[2]
statistics.mean(res_1), statistics.mean(res_2)

11490it [02:01, 94.88it/s]


(0.06320116696254176, 0.38032074532769394)

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer for the named checkpoint and show how one sample string is split.
tok = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
tok.tokenize(" encoder ed")

['▁encoder', '▁', 'ed']

In [21]:
from BARTScore.bart_score import BARTScorer
import statistics
import evaluate
# Metric objects loaded here by name and reused by later cells.
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
meteor = evaluate.load('meteor')
bleu = evaluate.load("bleu")
# Device string passed to BARTScorer here and to bertscore.compute in a later cell.
device="cuda:2"
# Checkpoint used for BARTScore; the commented lines are alternative checkpoints.
bart_scorer = BARTScorer(device=device, checkpoint='/mnt/nas4/m11115088/WordRank/CNN_Model/GPT_final_chpt_bart')
# bart_scorer = BARTScorer(device=device, checkpoint='/mnt/nas4/m11115088/WordRank/XSUM_Models/XSUM_BART_100K')
# bart_scorer = BARTScorer(device=device, checkpoint='facebook/bart-large-cnndm')

In [34]:
# from bert_score import score
import json
from nltk import sent_tokenize,word_tokenize
from datasets import load_from_disk,load_dataset
def process(x):
    """Re-tokenise ``x`` and return it as a list of sentence strings.

    The stripped input is word-tokenised, the tokens are re-joined with single
    spaces, and the result is split into sentences.

    NOTE(review): this helper is defined identically in several cells of this
    notebook; the most recently executed definition is the one in effect.
    """
    tokens = word_tokenize(x.strip())
    spaced_text = " ".join(tokens)
    return sent_tokenize(spaced_text)
# Read the system outputs and the references, one summary per line, and
# normalise every line through process().
with open("/mnt/nas4/m11115088/BRIO/output/cnndm.test.ours.cased.out") as f:
    raw_predictions = f.read()
predict = [" ".join(process(line)) for line in raw_predictions.strip().split("\n")]

with open("/mnt/nas4/m11115088/BRIO/output/cnndm.test.cased.reference") as f:
    raw_references = f.read()
reference = [" ".join(process(line)) for line in raw_references.strip().split("\n")]

In [None]:
# from bert_score import score
import json
from nltk import sent_tokenize,word_tokenize
from datasets import load_from_disk,load_dataset
def process(x):
    """Re-tokenise ``x`` and return it as a list of sentence strings.

    The stripped input is word-tokenised, the tokens are re-joined with single
    spaces, and the result is split into sentences.

    NOTE(review): this helper is defined identically in several cells of this
    notebook; the most recently executed definition is the one in effect.
    """
    tokens = word_tokenize(x.strip())
    spaced_text = " ".join(tokens)
    return sent_tokenize(spaced_text)
# NOTE(review): `d` and `r` are not read again in this cell.
d,r=[],[]
# Filled by the loop at the end of this cell: one normalised string per sample.
predict = []
reference=[]
# cnndm gpt
test = load_from_disk("/mnt/nas4/m11115088/WordRank/Dataset/cnndm_gpt_all")['test']
# xsum gpt
# test = load_from_disk("/mnt/nas4/m11115088/WordRank/Dataset/xsum/xsum_test_gpt4_turbo")['test']

# cnndm origin
# test = load_from_disk("/mnt/nas4/m11115088/WordRank/Dataset/cnndm_origin_norm_dataset_format")['test']
# test = load_dataset('cnn_dailymail',"3.0.0")["test"]
# xsum origin
# test = load_from_disk("/mnt/nas4/m11115088/WordRank/xsum/xsum_origin_norm_dataset_format")['test']
# test = load_dataset("EdinburghNLP/xsum")['test']

# cnndm gpt
# bart
# name_wordrank="_mnt_nas4_m11115088_WordRank_CNN_Model_final_chpt_bart"
# bart wr
# name_wordrank="cache_06-12-21-19-1718198372_BART_NN_VB_JJ_RB_CD_continue_3_model_cur1"
# bart wr adaptive
# name_wordrank="cache_06-19-15-32-1718782339_BART_NN_VB_JJ_RB_CD_again_continue_3_SSAN_11_10_9_c2_model_cur1"
# # # pegasus
# name_wordrank="cache_06-13-08-51-1718239876_pegasus_origin_continue_model_cur1"
# # # pegasus wr
# name_wordrank="cache_06-14-10-15-1718331357_correct_pegasus_llm_continue_7_model_cur1"
# # # pegasus wr adaptive
# name_wordrank="cache_06-20-22-57-1718895466_correct_pegasus_llm_continue_7_SSAN_0_model_cur1"
# # or
# name_wordrank="cache_07-05-15-50-1720165849_pegasus_gpt_cnndm_SCAN_new_again_model_cur3"

# # # # # transformer
# Disabled: this value used to be assigned and then overwritten by the next
# assignment, so it never took effect.
# name_wordrank="cache_06-13-13-01-1718254918_transformer_continue_2_model_cur1"
# # # # # transformer wr (the run that is actually loaded below)
name_wordrank="cache_06-14-11-06-1718334387_transformer_NN_VB_JJ_RB_CD_again_continue_3_model_cur1"
# # # # # transformer wr adaptive
# name_wordrank="cache_06-20-21-09-1718888968_transformer_NN_VB_JJ_RB_CD_again_continue_3_SSAN_11_10_9_0_model_cur1"


# # # # # xsum gpt
# # # # # bart
# name_wordrank="_mnt_nas4_m11115088_WordRank_CNN_Model_XSUM_BART_100K"
# # # # # bart wr
# name_wordrank="cache_06-19-22-30-1718807451_xsum-BART_NN_VB_JJ_RB_CD_model_cur5"
# # # # # bart wr + adaptive
# name_wordrank="cache_06-24-12-05-1719201903_xsum-BART_NN_VB_JJ_RB_CD_SCAN_KP_0_0.5S_again_model_cur1"
# # # # # pegasus: use Bing-Yan's (秉諺) files
# # # # # pegasus wr
# name_wordrank="cache_06-25-13-41-1719294111_pegasus_LLM_XSUM_NN_VB_JJ_RB_CD_model_cur1"
# # # # # # pegasus wr + adaptive
# name_wordrank="cache_06-25-13-51-1719294688_pegasus_LLM_XSUM_SCAN_NN_VB_JJ_RB_CD_model_cur1"

# # # # # tansformer 
# name_wordrank="cache_06-26-09-47-1719366452_transformer_xsum_gpt_SCAN_model_cur1"
# # # # # tansformer wr
# name_wordrank="cache_06-25-20-31-1719318699_Transformer_xsum_gpt_origin_c1_model_cur10"
# # # # # tansformer wr + adaptive
# name_wordrank="cache_06-25-21-26-1719321963_transformer-xsum-origin_SCAN_model_cur2"

# # # BRIO wr+adaptive
# name_wordrank="XSUM_Models_GPT_BRIO_pegasus_wr_model_generation"

# # # # cnndm origin
# # bart
# name_wordrank="cased_bart_other_model_cur1"
# # # # bart wr
# name_wordrank="CNN_Model_cased_bart_WR_model_cur3"
# # bart wr+adaptive
# name_wordrank="cache_07-03-16-04-1719993891_BART_origin_norm_cnndm_SCAN_0_new_again_model_cur2"

# # # # pegasus wr+adaptive
# name_wordrank="cache_06-27-17-12-1719479574_pegasus_origin_norm_cnndm_SCAN_model_cur3"

# # # # to be confirmed
# # # # transformer
# name_wordrank="cache_06-26-23-21-1719415302_transformer_cnn_origin_SCAN_model_cur3"
# # # transformer wr
# # name_wordrank="transformer_cnn_wordrank_model_cur7"
# # # # transformer wr+adaptive
# name_wordrank="cache_07-04-16-21-1720081270_transformer_cnn_origin_WR_c_model_cur1"

# # # # # xsum origin
# # # bart wr+adaptive
# # name_wordrank="cache_06-26-16-44-1719391478_BART_origin_xsum_SCAN_model_cur3"

# # # # pegasus origin
# # name_wordrank="cache_06-27-13-11-1719465067_pegaus_origin_xsum_SCAN_model_cur3"
# # # # pegasus wr
# # name_wordrank="cache_06-28-21-03-1719579831_pegasus_origin_norm_xsum_SCAN_again_model_cur12"
# # # # pegasus wr +adaptive
# name_wordrank="cache_06-28-21-03-1719579831_pegasus_origin_norm_xsum_SCAN_again_model_cur9"

# # #transformer
# name_wordrank="XSUM_Models_Transformer_epoch28_model_cur1"
# # #transformer wr
# name_wordrank="XSUM_Models_Transformer_WR_epoch24_model_cur3"
# # #transformer wr+adapative
# name_wordrank="cache_06-25-21-26-1719321963_transformer-xsum-origin_SCAN_model_cur2"
# 11490
# 11334
# Load candidate/reference pairs 0, 1, 2, ... of the selected run and stop at the
# first index whose files are missing (runs may hold fewer than 11490 samples).
for cnt in range(11490):
    dec_path = "./result/%s/candidate_ranking/%d.dec"%(name_wordrank, cnt)
    ref_path = "./result/%s/reference_ranking/%d.ref"%(name_wordrank, cnt)
    try:
        with open(dec_path, "r") as dec, open(ref_path, "r") as ref:
            x=process(ref.read().replace("\n"," "))
            y=process(dec.read().replace("\n"," "))
    except FileNotFoundError:
        # Only a missing file ends the loop. Any other failure (tokenizer error,
        # decoding error, KeyboardInterrupt) now surfaces instead of silently
        # truncating `predict`/`reference`, as the former bare `except` did.
        break
    predict.append(" ".join(y))
    reference.append(" ".join(x))


In [143]:
from tqdm import tqdm
import statistics

# Unigram (n=1) and bigram (n=2) novelty of every prediction against its document.
# zip() stops at the shorter input, so a short `predict` silently shrinks the sample.
x = SUMAttribute()
novelty_by_order = {1: [], 2: []}
for d, p in tqdm(zip(test['document'], predict)):
# for d,p in tqdm(zip(test['article'],predict)):
    for order, scores in novelty_by_order.items():
        scores.append(x.cal_novelty(text=d, summary=p, n=order))
res_1 = novelty_by_order[1]
res_2 = novelty_by_order[2]
statistics.mean(res_1), statistics.mean(res_2)

16it [00:00, 74.07it/s]

11490it [02:15, 85.09it/s]


(0.015378972854376474, 0.12655208269147622)

In [36]:
import json
# Parse the JSON file; the next cell iterates over `x` and reads the
# 'predict_summary' and 'human_summary' entries of each item.
with open("BRIO_all_generate/BRIO_bart_cnndm_origin_wr.json") as f:
    x = json.load(f)
# x

In [39]:
# Write the lower-cased predictions and references, one per line, and build the
# normalised `predict` / `reference` lists used by the metric cells.
# Both files are opened once in "w" mode (which truncates them) instead of being
# reopened in append mode for every record; the resulting files are identical.
predict=[]
reference=[]
with open("/mnt/nas4/m11115088/BRIO/output/my.out","w") as wm, \
        open("/mnt/nas4/m11115088/BRIO/output/ref.out","w") as wf:
    for a in x:
        wm.write(a['predict_summary'].lower() + '\n')
        wf.write(a['human_summary'].lower() + '\n')
        # The in-memory lists keep the original casing; only the files are lower-cased.
        predict.append(" ".join(process(a['predict_summary'])))
        reference.append(" ".join(process(a['human_summary'])))

In [42]:
# ROUGE over the current `predict` / `reference` lists; `rouge` is loaded in the
# metric-setup cell.
rouge_result = rouge.compute(predictions=predict, references=reference, use_stemmer=True)
rouge_result

{'rouge1': 0.4765624477431656,
 'rouge2': 0.23841115724938278,
 'rougeL': 0.32581749350054334,
 'rougeLsum': 0.325774837502808}

In [217]:
import json
import pandas as pd

# Other result files that can be loaded instead:
# with open("/mnt/nas4/m11115088/WordRank/BRIO_all_generate/BRIO_bart_cnndm_origin_wr.json") as f:
# with open("/work/u5516210/BRIO/BRIO_bart_cnndm_gpt_wr_again_b4.json") as f:
with open("/mnt/nas4/m11115088/WordRank/BRIO_all_generate/BRIO_bart_cnndm_origin.json") as f:
    generated_records = json.load(f)

# NOTE(review): the frame is named `wordrank`, yet the file loaded above is the
# one without the "_wr" suffix - confirm which run is intended.
wordrank = pd.DataFrame(generated_records)
predict = wordrank['summary'].to_list()
# reference = wordrank['human_summary'].to_list()

In [None]:
from datasets import load_from_disk,load_dataset
# Reference summaries: the 'highlights' column of the CNN/DailyMail 3.0.0 test split.
reference = load_dataset('cnn_dailymail',"3.0.0")["test"]['highlights']

: 

In [6]:
# Score the current `predict` / `reference` lists with BERTScore, BARTScore,
# METEOR and BLEU. `bertscore`, `bart_scorer`, `meteor`, `bleu`, `device` and
# `statistics` come from the metric-setup cell.
bertscore_result = bertscore.compute(predictions=predict, references=reference, lang="en",rescale_with_baseline=True,verbose=True,device=device)
# Printed in the order: F1, precision, recall.
print("bertscore: ",statistics.mean(bertscore_result['f1']),statistics.mean(bertscore_result['precision']),statistics.mean(bertscore_result['recall']))
# `result` (per-sample BARTScore values) is read again by the following cells.
result = bart_scorer.score(predict, reference, batch_size=8) # generation scores from the first list of texts to the second list of texts.
print("bartscore: ",statistics.mean(result))
# `results` is reused: first METEOR, then BLEU.
results = meteor.compute(predictions=predict, references=reference)
print("meteor: ",results)
results = bleu.compute(predictions=predict, references=reference)
print("bleu: ",results)


calculating scores...
computing bert embedding.


100%|██████████| 359/359 [01:38<00:00,  3.66it/s]


computing greedy matching.


100%|██████████| 180/180 [00:04<00:00, 41.59it/s]


done in 263788.66 seconds, 0.04 sentences/sec
bertscore:  0.3352359719982564 0.3168015085083122 0.35316672813246547
bartscore:  -3.9472507272521966
meteor:  {'meteor': 0.4104798733119624}
bleu:  {'bleu': 0.18521985958797088, 'precisions': [0.4633733369036317, 0.21925999802182586, 0.13099235959321884, 0.08843288451608876], 'brevity_penalty': 1.0, 'length_ratio': 1.0908262104899789, 'translation_length': 739434, 'reference_length': 677866}


In [191]:
# Keep the latest BARTScore list under a run-specific name. Which line is
# uncommented decides the name, so this cell is edited by hand between runs.
# origin_result = result
wordrank_result = result
# adaptive_wordrank_result = result



In [199]:
# Mean BARTScore of the two stored runs.
# NOTE(review): `origin_result` is only assigned by a commented-out line in the
# previous cell, so this depends on leftover kernel state.
statistics.mean(origin_result),statistics.mean(wordrank_result)

(-4.26135012779325, -4.362055405950723)

In [197]:
from scipy.stats import ttest_ind, ttest_rel

# Independent t-test
# NOTE(review): if both lists score the same test samples, a paired test
# (ttest_rel, already imported) is presumably the better fit - confirm.
t_stat, p_value = ttest_ind(origin_result,wordrank_result )
print("Independent t-test p-value:", p_value)

Independent t-test p-value: 7.058446791524808e-20


In [16]:
import json
import pandas as pd
from datasets import load_from_disk,load_dataset

# Generated predictions, with columns renamed so they cannot clash with the
# dataset's own columns.
with open("/mnt/nas4/m11115088/WordRank/result/BRIO_CNNDM_ORIGIN/generated_predictions.json") as f:
    generated_records = json.load(f)
x = pd.DataFrame(generated_records).rename(
    columns={"summary": "summary_predict", "document": "document_p"}
)

# CNN/DailyMail test split, placed side by side with the predictions.
# NOTE(review): concat(axis=1) pairs rows by index, i.e. by position here; this
# assumes both sources list the samples in the same order - confirm.
y = pd.DataFrame(load_dataset('cnn_dailymail', "3.0.0")['test'])
merge = pd.concat([x, y], axis=1)

predict, reference = (
    merge[column].to_list() for column in ('summary_predict', 'highlights')
)

In [121]:

# Read the file as one prediction per line and use it as the current `predict`.
with open("/mnt/nas4/m11115088/WordRank/brio_xsum_gpt.out") as f:
    brio_gpt_xsum = f.read().strip().split("\n")
predict = brio_gpt_xsum


In [15]:

# Replace each prediction and each highlight by the FIRST sentence returned by
# process(), then score with ROUGE.
# NOTE(review): the columns of `merge` are overwritten in place, so re-running
# this cell re-processes already processed text; `[0]` also raises IndexError if
# process() returns an empty list.
merge['summary_predict'] = merge['summary_predict'].apply(lambda x : process(x.replace("\n"," "))[0])
merge['highlights'] = merge['highlights'].apply(lambda x : process(x.replace("\n"," "))[0])
predict = merge['summary_predict'].to_list()
reference = merge['highlights'].to_list()
rouge_result = rouge.compute(predictions=predict, references=reference, use_stemmer=True)
rouge_result

INFO:absl:Using default tokenizer.


{'rouge1': 0.40833039347345157,
 'rouge2': 0.24010882766492758,
 'rougeL': 0.37143760772115275,
 'rougeLsum': 0.37141773628249275}

In [80]:
from scipy.stats import ttest_ind, ttest_rel

# Toy example 1: independent t-test on two unrelated groups.
data1 = [20, 22, 19, 20, 21]  # Group A
data2 = [30, 32, 29, 30, 31]  # Group B
independent_result = ttest_ind(data1, data2)
t_stat, p_value = independent_result.statistic, independent_result.pvalue
print("Independent t-test p-value:", p_value)

# Toy example 2: paired t-test on the same subjects measured twice.
before = [210, 200, 214, 198, 223]  # Before treatment
after = [180, 182, 177, 190, 185]   # After treatment
paired_result = ttest_rel(before, after)
t_stat, p_value = paired_result.statistic, paired_result.pvalue
print("Paired t-test p-value:", p_value)


Independent t-test p-value: 7.071086608820127e-07
Paired t-test p-value: 0.010571463889056906


In [54]:
import pandas as pd

# Model outputs read from the generated-predictions JSON file.
all_data_pd = pd.read_json(
    path_or_buf="/mnt/nas4/m11115088/WordRank/CNNDM_Models_generate/LLM_Teached_BART_CNNDM/generated_predictions.json"
)
# all_data_pd = pd.read_json(path_or_buf="cnndm_test_gpt4_turbo_modify.json", lines=True)

# Test split as a DataFrame with the model output attached as `model_summary`.
# The new column is aligned on the row index.
test = load_from_disk("/mnt/nas4/m11115088/WordRank/Dataset/cnndm_gpt_all")['test']
test_pd = pd.DataFrame(test)
test_pd = test_pd.assign(model_summary=all_data_pd['summary'])

In [55]:
import statistics

# BERTScore of the model summaries against the dataset's `summary` column.
# Unlike the other BERTScore cells, this call does not pass rescale_with_baseline.
bertscore_result = bertscore.compute(
    predictions=test_pd['model_summary'].to_list(),
    references=test_pd['summary'].to_list(),
    lang="en",
    verbose=True,
)
# Shown in the order: F1, precision, recall.
tuple(statistics.mean(bertscore_result[key]) for key in ('f1', 'precision', 'recall'))

calculating scores...
computing bert embedding.


100%|██████████| 360/360 [01:43<00:00,  3.49it/s]


computing greedy matching.


100%|██████████| 180/180 [00:03<00:00, 47.57it/s]


done in 15451561.83 seconds, 0.00 sentences/sec


(0.9044668506704485, 0.9033925881481254, 0.9058175513389528)