# Evaluate using ROUGE Score 

## Imports

In [1]:
%load_ext lab_black

In [2]:
import os
import rouge

from glob import glob

# from rouge_score import rouge_scorer, scoring
from tqdm import tqdm
from konlpy.tag import Mecab

## ROUGE Class

In [20]:
class RougeScorer:
    """Score paired summary files with ROUGE-1, ROUGE-2 and ROUGE-L.

    Reference and hypothesis summaries are read from two directories of
    ``*.txt`` files and paired by file name. The averaged scores are
    formatted as a text report, written to ``rouge_scores.txt`` in the
    current working directory, and returned.
    """

    def __init__(self, use_tokenizer=True):
        """Build the evaluator and, optionally, the morpheme tokenizer.

        Args:
            use_tokenizer: When true, each summary is split with
                ``Mecab().morphs`` and the pieces are re-joined with single
                spaces before scoring. When false, the text is handed to the
                evaluator as read from disk (newlines removed).
        """
        self.use_tokenizer = use_tokenizer
        if use_tokenizer:
            self.tokenizer = Mecab()

        self.rouge_evaluator = rouge.Rouge(
            metrics=["rouge-n", "rouge-l"],
            max_n=2,
            limit_length=True,
            length_limit=1000,
            length_limit_type="words",
            apply_avg=True,
            apply_best=False,
            alpha=0.5,  # Default F1_score
            weight_factor=1.2,
            stemming=True,
        )

    def _load_summary(self, fname):
        """Read one summary file and return the string passed to the evaluator."""
        with open(fname, "r", encoding="utf8") as f:
            # Lines are concatenated without a separator, exactly as the
            # scores reported in this notebook were produced.
            text = "".join(f.read().split("\n"))

        if self.use_tokenizer:
            return " ".join(self.tokenizer.morphs(text))

        # A plain string must not go through " ".join(): that would insert a
        # space between every character and turn the score character-level.
        return text

    def compute_rouge(self, ref_path, hyp_path):
        """Score every hypothesis file against the reference of the same name.

        Args:
            ref_path: Directory containing the reference ``*.txt`` files.
            hyp_path: Directory containing the hypothesis ``*.txt`` files.

        Returns:
            The formatted score report (see ``format_rouge_scores``).

        Raises:
            ValueError: If no ``*.txt`` files are found, or if the two
                directories do not contain the same set of file names.

        Side effects:
            Stores the prepared texts in ``self.reference_summaries`` and
            ``self.generated_summaries`` and writes the report to
            ``rouge_scores.txt``.
        """
        ref_fnames = sorted(glob(f"{ref_path}/*.txt"))
        hyp_fnames = sorted(glob(f"{hyp_path}/*.txt"))

        # Validate the pairing up front: zip() would silently drop unmatched
        # files, and an assert would be stripped when running with -O.
        ref_names = [os.path.basename(fname) for fname in ref_fnames]
        hyp_names = [os.path.basename(fname) for fname in hyp_fnames]
        if ref_names != hyp_names:
            only_ref = sorted(set(ref_names) - set(hyp_names))
            only_hyp = sorted(set(hyp_names) - set(ref_names))
            raise ValueError(
                "Reference and hypothesis files do not match: "
                f"{len(ref_names)} reference vs {len(hyp_names)} hypothesis "
                f"files; only in reference: {only_ref[:5]}; "
                f"only in hypothesis: {only_hyp[:5]}"
            )
        if not ref_fnames:
            raise ValueError(
                f"No .txt files found in {ref_path!r} and {hyp_path!r}"
            )

        self.reference_summaries = []
        self.generated_summaries = []

        for ref_fname, hyp_fname in tqdm(
            zip(ref_fnames, hyp_fnames), total=len(ref_fnames)
        ):
            self.reference_summaries.append(self._load_summary(ref_fname))
            self.generated_summaries.append(self._load_summary(hyp_fname))

        # Hypotheses are passed first, references second.
        scores = self.rouge_evaluator.get_scores(
            self.generated_summaries, self.reference_summaries
        )
        str_scores = self.format_rouge_scores(scores)
        self.save_rouge_scores(str_scores)
        return str_scores

    def save_rouge_scores(self, str_scores):
        """Write the formatted report to ``rouge_scores.txt``, overwriting it."""
        with open("rouge_scores.txt", "w", encoding="utf8") as output:
            output.write(str_scores)

    def format_rouge_scores(self, scores):
        """Render the score mapping as a human-readable report.

        Args:
            scores: Mapping with the keys ``"rouge-1"``, ``"rouge-2"`` and
                ``"rouge-l"``, each holding a mapping with the keys ``"f"``,
                ``"p"`` and ``"r"``.

        Returns:
            A multi-line string that starts with two newlines and lists F1,
            precision and recall (three decimals) for each metric.
        """
        # The leading "\n" element plus the join separator reproduces the two
        # blank lines that precede the banner.
        lines = ["\n", "    ****** ROUGE SCORES ******"]
        for title, key in (
            ("ROUGE 1", "rouge-1"),
            ("ROUGE 2", "rouge-2"),
            ("ROUGE L", "rouge-l"),
        ):
            metric = scores[key]
            lines.append(f"    ** {title}")
            lines.append(f"    F1        >> {metric['f']:.3f}")
            lines.append(f"    Precision >> {metric['p']:.3f}")
            lines.append(f"    Recall    >> {metric['r']:.3f}")
        return "\n".join(lines)

In [21]:
# class Rouge:
#     def __init__(
#         self,
#         rouge_types=["rouge1", "rouge2", "rougeL"],
#         use_tokenizer=False,
#         use_agregator=True,
#     ):

#         self.use_tokenizer = use_tokenizer
#         if use_tokenizer:
#             self.tokenizer = Mecab()

#         self.use_agregator = use_agregator
#         self.scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types)
#         if use_agregator:
#             self.agregator = scoring.BootstrapAggregator()
#         else:
#             self.scores = []

#     def rouge_score(self, ref_path, hyp_path):
#         ref_fnames = glob(f"{ref_path}/*.txt")
#         hyp_fnames = glob(f"{hyp_path}/*.txt")
#         ref_fnames.sort()
#         hyp_fnames.sort()

#         self.ref_list = []
#         self.hyp_list = []

#         for ref_fname, hyp_fname in tqdm(
#             zip(ref_fnames, hyp_fnames), total=len(ref_fnames)
#         ):
#             assert os.path.split(ref_fname)[1] == os.path.split(hyp_fname)[1]

#             with open(ref_fname, "r", encoding="utf8") as f:
#                 ref = f.read().split("\n")
#                 ref = "".join(ref)

#             with open(hyp_fname, "r", encoding="utf8") as f:
#                 hyp = f.read().split("\n")
#                 hyp = "".join(hyp)

#             if self.use_tokenizer:
#                 ref = self.tokenizer.morphs(ref)
#                 hyp = self.tokenizer.morphs(hyp)

#             ref = " ".join(ref)
#             hyp = " ".join(hyp)

#             self.ref_list.append(ref)
#             self.hyp_list.append(hyp)

#             score = self.scorer.score(ref, hyp)
#             if self.use_agregator:
#                 self.agregator.add_scores(score)
#             else:
#                 self.scores.append(score)

#         if self.use_agregator:
#             result = self.agregator.aggregate()
#         else:
#             result = {}
#             for key in self.scores[0]:
#                 result[key] = list(score[key] for score in self.scores)

#         return result

In [22]:
# Build the scorer with morpheme tokenization enabled; this constructs a
# Mecab tokenizer inside RougeScorer.__init__.
rouge_eval = RougeScorer(use_tokenizer=True)

In [29]:
# Directories holding the reference and hypothesis summaries as *.txt files;
# compute_rouge pairs them by file name.
ref_path = "../outputs/ext_ref"
hyp_path = "../outputs/hyp"

# Returns the formatted report string and also writes it to rouge_scores.txt
# in the current working directory.
result = rouge_eval.compute_rouge(ref_path, hyp_path)

100%|██████████| 5000/5000 [00:03<00:00, 1486.70it/s]


### hyp vs ext_ref

In [30]:
# Show the formatted ROUGE report returned by compute_rouge.
print(result)



    ****** ROUGE SCORES ******
    ** ROUGE 1
    F1        >> 0.651
    Precision >> 0.653
    Recall    >> 0.718
    ** ROUGE 2
    F1        >> 0.502
    Precision >> 0.507
    Recall    >> 0.551
    ** ROUGE L
    F1        >> 0.673
    Precision >> 0.671
    Recall    >> 0.729


### hyp vs abs_ref

In [28]:
# NOTE(review): this cell's execution count (28) is lower than that of the
# cell that assigns ref_path/hyp_path (29), so the output shown below was
# presumably produced by an earlier run with different paths — confirm
# before relying on it or re-running.
print(result)



    ****** ROUGE SCORES ******
    ** ROUGE 1
    F1        >> 0.421
    Precision >> 0.349
    Recall    >> 0.668
    ** ROUGE 2
    F1        >> 0.188
    Precision >> 0.156
    Recall    >> 0.302
    ** ROUGE L
    F1        >> 0.433
    Precision >> 0.363
    Recall    >> 0.638


### ext_ref vs abs_ref

In [24]:
# NOTE(review): this cell's execution count (24) is lower than that of the
# cell that assigns ref_path/hyp_path (29), so the output shown below was
# presumably produced by an earlier run with different paths — confirm
# before relying on it or re-running.
print(result)



    ****** ROUGE SCORES ******
    ** ROUGE 1
    F1        >> 0.556
    Precision >> 0.470
    Recall    >> 0.793
    ** ROUGE 2
    F1        >> 0.276
    Precision >> 0.234
    Recall    >> 0.395
    ** ROUGE L
    F1        >> 0.552
    Precision >> 0.472
    Recall    >> 0.746
