# Dataset

## Imports

In [1]:
# Load the `lab_black` IPython extension for this kernel session
# (presumably auto-formats cells with Black — confirm it is installed).
%load_ext lab_black

In [2]:
import sys

# Add the parent directory to the module search path.
# NOTE(review): presumably this is what lets the `textrank` and `utils`
# imports below resolve — confirm against the repository layout.
sys.path.append("..")

In [3]:
import os
import json

from tqdm import tqdm
from textrank import TextRank
from utils import RougeScorer

## Load Data

In [4]:
def get_data(path):
    """Load a JSON Lines summarization dataset as a list of tuples.

    Every non-blank line of the file is parsed as one JSON object that must
    contain the keys ``"id"``, ``"article_original"`` and ``"abstractive"``.

    Args:
        path: Path to a UTF-8 encoded JSON Lines file.

    Returns:
        A list of ``(doc_id, text, gold)`` tuples in file order, where
        ``doc_id`` is the record's ``"id"``, ``text`` its
        ``"article_original"`` value and ``gold`` its ``"abstractive"`` value.
        An empty file yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    data = []
    with open(path, "r", encoding="utf-8") as f:
        # Stream the file line by line instead of materialising every raw
        # line and every parsed record in separate intermediate lists.
        for line in f:
            # Blank lines (e.g. an extra newline at EOF) are not valid JSON
            # and would make json.loads raise, so skip them.
            if not line.strip():
                continue
            record = json.loads(line)
            data.append(
                (record["id"], record["article_original"], record["abstractive"])
            )

    return data

In [14]:
# Relative path to the magazine test split in JSON Lines format.
path = "../../../../datasets/kor_data/magazine/test.jsonl"

# List of (doc_id, text, gold) tuples as returned by get_data.
data = get_data(path)

## TextRank

In [15]:
# Output layout: the summarization loop writes generated summaries under
# `hyp_path` and the gold summaries under `abs_ref_path`.
output_path = "../outputs/magazine"
hyp_path = f"{output_path}/hyp"
abs_ref_path = f"{output_path}/abs_ref"

# os.makedirs creates any missing parents (including `output_path` and
# "../outputs"), which os.mkdir would fail on. exist_ok=True keeps the cell
# re-runnable and avoids the exists-then-create race.
for directory in (hyp_path, abs_ref_path):
    os.makedirs(directory, exist_ok=True)

In [16]:
# Build the project-local TextRank summarizer with method="algebraic".
# NOTE(review): the meaning of "algebraic" is defined in the `textrank`
# module, not visible here — confirm before changing it.
model = TextRank(method="algebraic")

In [17]:
# Summarize every loaded document and write the gold summary and the
# generated summary to per-document text files named after the document id.
for doc_id, sents, gold in tqdm(data):
    hyp = model.summarize(sents)

    ref_file = f"{abs_ref_path}/{doc_id}.txt"
    hyp_file = f"{hyp_path}/{doc_id}.txt"

    # Reference first, hypothesis second; the same file name is used in both
    # directories.
    with open(ref_file, "w", encoding="utf8") as ref_out:
        ref_out.write(gold)
    with open(hyp_file, "w", encoding="utf8") as hyp_out:
        hyp_out.write(hyp)

100%|██████████| 5000/5000 [01:14<00:00, 66.89it/s]


## ROUGE Score

In [15]:
# Directories holding the generated summaries (hyp) and the gold summaries
# (abs_ref). They are set as literals here, so this section does not depend
# on the variables defined in the TextRank section.
hyp_path = "../outputs/magazine/hyp"
ref_path = "../outputs/magazine/abs_ref"

In [16]:
# Score the files under `hyp_path` against those under `ref_path` with the
# project-local RougeScorer.
# NOTE(review): presumably files are paired by name across the two
# directories — confirm in `utils.RougeScorer.compute_rouge`.
rouge_eval = RougeScorer()
result = rouge_eval.compute_rouge(ref_path, hyp_path)

100%|██████████| 5000/5000 [00:00<00:00, 30289.03it/s]

--------------------------------------------------
# of Testset : 5000
--------------------------------------------------





In [11]:
# News dataset
# NOTE(review): this cell prints the same `result` variable as the other
# print cells in this section; its saved output presumably comes from a run
# with the dataset/output paths pointed at the news data — confirm.
print(result)



    ****** ROUGE SCORES ******
    ** ROUGE 1
    F1        >> 0.361
    Precision >> 0.271
    Recall    >> 0.589
    ** ROUGE 2
    F1        >> 0.191
    Precision >> 0.144
    Recall    >> 0.312
    ** ROUGE L
    F1        >> 0.255
    Precision >> 0.192
    Recall    >> 0.416


In [14]:
# Law dataset
# NOTE(review): this cell prints the same `result` variable as the other
# print cells in this section; its saved output presumably comes from a run
# with the dataset/output paths pointed at the law data — confirm.
print(result)



    ****** ROUGE SCORES ******
    ** ROUGE 1
    F1        >> 0.551
    Precision >> 0.498
    Recall    >> 0.676
    ** ROUGE 2
    F1        >> 0.379
    Precision >> 0.346
    Recall    >> 0.458
    ** ROUGE L
    F1        >> 0.440
    Precision >> 0.399
    Recall    >> 0.535


In [17]:
# Magazine dataset
# NOTE(review): this cell prints the same `result` variable as the other
# print cells in this section; the paths currently set in this notebook
# point at the magazine data.
print(result)



    ****** ROUGE SCORES ******
    ** ROUGE 1
    F1        >> 0.316
    Precision >> 0.266
    Recall    >> 0.427
    ** ROUGE 2
    F1        >> 0.112
    Precision >> 0.093
    Recall    >> 0.152
    ** ROUGE L
    F1        >> 0.210
    Precision >> 0.177
    Recall    >> 0.284
