## Text Summarization 

Transformer 기반 모델인 Text-to-Text Transfer Transformer(T5)를 사용한다. T5는 encoder-decoder 구조를 갖는 모델로, 요약·번역 등 다양한 task에 활용되고 있다.

데이터셋은 AI Hub에서 제공하는 '문서요약 텍스트'를 사용한다. 원문 데이터 40만 건(신문기사 30만 건, 기고문 6만 건, 잡지기사 1만 건, 법원 판결문 3만 건)을 활용하여 추출요약 40만 건과 생성요약 40만 건, 총 80만 건의 요약문이 구축되어 있다. 추출요약문은 원문에서 변형 없이 그대로 선택한 3개 문장으로 구성되고, 생성요약문은 원문의 내용을 바탕으로 재작성한 것이다.

In [None]:
!pip install transformers==4.20.0
!pip install nltk
!pip install -U nltk
!pip install rouge-score
!pip install keras_nlp==0.3.0
!pip install datasets
!pip install huggingface-hub

In [None]:
import os
import logging
from pprint import pprint
import json
import nltk
import numpy as np
import tensorflow as tf
from tensorflow import keras

tf.get_logger().setLevel(logging.ERROR)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Fraction of the examples held out as the test split; the rest is used for training.
TRAIN_TEST_SPLIT = 0.1

# Truncation limit (in tokens) applied to the input documents.
MAX_INPUT_LENGTH = 512
# Lower bound on summary length passed to the summarization pipeline.
MIN_TARGET_LENGTH = 5
# Truncation limit for target summaries; also used as max_length at inference.
MAX_TARGET_LENGTH = 512
# Batch size used when building the tf.data datasets.
BATCH_SIZE = 32
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 2e-5
# Number of epochs passed to model.fit.
MAX_EPOCHS = 5

# Hugging Face Hub id of the checkpoint loaded for both the tokenizer and the model.
MODEL_CHECKPOINT = "psyche/KoT5-summarization"

In [None]:
# Mount Google Drive so the files referenced under /content/drive become readable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from datasets import Dataset
import json
# Location of the AI Hub summarization corpus on the mounted Drive.
ds_path = "/content/drive/MyDrive/article_train_original.json"

# Parallel lists that become the columns of the Hugging Face Dataset.
_id, document, summary = [], [], []

# Decode explicitly as UTF-8: the corpus is Korean text, and relying on the
# platform default encoding can mis-decode it or fail outright.
with open(ds_path, "r", encoding="utf-8") as st_json:
    doc = json.load(st_json)

for article in doc['documents']:
    _id.append(article['id'])
    # `text` is a list of sentence lists (presumably one list per paragraph —
    # confirm against the dataset schema). Join the sentences of every list so
    # the model sees the whole article; indexing only `text[0]` would drop
    # everything after the first list.
    document.append(
        ' '.join(
            sent['sentence']
            for paragraph in article['text']
            for sent in paragraph
        )
    )
    # Use the first abstractive (rewritten) summary as the target.
    summary.append(article['abstractive'][0])

raw_datasets = Dataset.from_dict({"id": _id,
                                  "document": document,
                                  "summary": summary})

# Sanity check: show the first example.
print(raw_datasets[0])

In [None]:
# Split into train/test using the TRAIN_TEST_SPLIT fraction for the test set.
# NOTE(review): no seed is passed, so the split differs between runs.
raw_datasets = raw_datasets.train_test_split(
    train_size=1-TRAIN_TEST_SPLIT, test_size=TRAIN_TEST_SPLIT
)
# Bare expression so the notebook displays the resulting splits.
raw_datasets

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the chosen pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Checkpoints whose inputs are prepended with the task prefix below.
_PREFIXED_CHECKPOINTS = (
    "psyche/KoT5-summarization",
    "t5-small",
    "t5-base",
    "t5-large",
    "t5-3b",
    "t5-11b",
)

# Task prefix prepended to every input document; empty for other checkpoints.
prefix = "요약: " if MODEL_CHECKPOINT in _PREFIXED_CHECKPOINTS else ""

# Bare expression so the notebook displays the chosen prefix.
prefix

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of documents and their summaries.

    Args:
        examples: Batch mapping with a "document" and a "summary" column,
            each an iterable of strings.

    Returns:
        The tokenizer output for the prefixed documents, with the token ids
        of the summaries stored under the "labels" key.
    """
    prefixed_documents = [prefix + text for text in examples["document"]]
    model_inputs = tokenizer(
        prefixed_documents,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
    )

    # Summaries are encoded in target mode so target-side settings apply.
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(
            examples["summary"],
            max_length=MAX_TARGET_LENGTH,
            truncation=True,
        )

    model_inputs["labels"] = target_encodings["input_ids"]
    return model_inputs

In [None]:
# Tokenize every split; batched=True hands preprocess_function batches of examples.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

In [None]:
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# Load the seq2seq model as a TensorFlow model; from_pt=True requests
# conversion from the checkpoint's PyTorch weights.
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT, from_pt=True)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator used as collate_fn for the tf.data datasets; returns TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [None]:
# Columns handed to the model by every tf.data pipeline below.
_MODEL_COLUMNS = ("input_ids", "attention_mask", "labels")


def _as_tf_dataset(split, shuffle):
    """Convert one tokenized split into a batched tf.data.Dataset."""
    return split.to_tf_dataset(
        batch_size=BATCH_SIZE,
        columns=list(_MODEL_COLUMNS),
        shuffle=shuffle,
        collate_fn=data_collator,
    )


# Training batches are shuffled; evaluation batches keep their order.
train_dataset = _as_tf_dataset(tokenized_datasets["train"], shuffle=True)
test_dataset = _as_tf_dataset(tokenized_datasets["test"], shuffle=False)

# 200 randomly chosen test examples, used for generation-based evaluation.
_generation_subset = tokenized_datasets["test"].shuffle().select(list(range(200)))
generation_dataset = _as_tf_dataset(_generation_subset, shuffle=False)

In [None]:
# Adam optimizer with the configured learning rate.
optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# No loss is passed here — presumably the model's built-in loss is used; confirm.
model.compile(optimizer=optimizer)

In [None]:
import keras_nlp

# Shared ROUGE-L metric object; its state is cleared on every metric_fn call.
rouge_l = keras_nlp.metrics.RougeL()


def metric_fn(eval_predictions):
    """Compute the ROUGE-L F1 score between generated and reference summaries.

    Args:
        eval_predictions: A ``(predictions, labels)`` pair of token-id arrays.
            Rows of ``labels`` are modified in place: negative ids are
            replaced by the tokenizer's pad token id.

    Returns:
        A dict with a single key ``"RougeL"`` holding the F1 score.
    """
    predictions, labels = eval_predictions

    # Keras metrics accumulate state across calls. Without a reset, every
    # evaluation would report an average over all previous evaluations
    # instead of the score for the current one.
    rouge_l.reset_state()

    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Negative label ids (presumably loss-masking padding — confirm) cannot be
    # decoded, so map them to the pad token, which decoding then skips.
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge_l(decoded_labels, decoded_predictions)

    return {"RougeL": result["f1_score"]}

In [None]:
from transformers.keras_callbacks import KerasMetricCallback

# Callback that runs metric_fn on generation_dataset; predict_with_generate=True
# makes it score generated sequences.
metric_callback = KerasMetricCallback(
    metric_fn, eval_dataset=generation_dataset, predict_with_generate=True
)

callbacks = [metric_callback]

# Fine-tune for MAX_EPOCHS epochs, validating on the held-out test split.
model.fit(
    train_dataset, validation_data=test_dataset, epochs=MAX_EPOCHS, callbacks=callbacks
)

In [None]:
from transformers import pipeline

# Wrap the fine-tuned model in a summarization pipeline.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, framework="tf")

# Summarize the first test example and print it next to its reference summary.
# NOTE(review): training inputs were prepended with `prefix`, but the raw
# document is passed here — confirm the pipeline applies the prefix itself.
_sample = raw_datasets["test"][0]
summary = summarizer(
    _sample["document"],
    min_length=MIN_TARGET_LENGTH,
    max_length=MAX_TARGET_LENGTH,
)

print(f'document:{_sample["document"]}')
print(f'label summary:{_sample["summary"]}')
print(f'pred summary: {summary[0]["summary_text"]}')

In [None]:
# Display the reference summary as a list containing one whitespace-token list.
[raw_datasets["test"][0]["summary"].split()]

In [None]:
# Display the predicted summary as a list of whitespace-separated tokens.
summary[0]["summary_text"].split()

In [None]:
# import nltk.translate.bleu_score as bleu
# print('BLEU Score:',bleu.sentence_bleu([raw_datasets["test"][0]["summary"].split()],summary[0]["summary_text"].split()))

In [None]:
# Read a text from the user and print the model's summary of it.
sentence = input("input string:")
# The pipeline is rebuilt here so this cell does not depend on the earlier inference cell.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, framework="tf")
summary = summarizer(
            sentence,
            min_length=MIN_TARGET_LENGTH,
            max_length=MAX_TARGET_LENGTH,
        )
print(f'pred summary: {summary[0]["summary_text"]}')

## original 및 variant 요약 결과 저장 

In [None]:
import pandas as pd
# Both CSV files are header-less and CP949-encoded; name their single column 'contents'.
original = pd.read_csv("/content/drive/MyDrive/숨고/이인균님(인물 분류, 문서 요약)/original.csv",encoding='cp949',header=None)
original.columns = ['contents']
variant = pd.read_csv("/content/drive/MyDrive/숨고/이인균님(인물 분류, 문서 요약)/variant.csv",encoding='cp949',header=None)
variant.columns = ['contents']

In [None]:
# Summarize every row of the original and variant texts with the fine-tuned model.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, framework="tf")
# The pipeline receives the whole column as a list of strings.
summary_original = summarizer(
              original['contents'].values.tolist(),
              min_length=MIN_TARGET_LENGTH,
              max_length=MAX_TARGET_LENGTH,
          )
summary_variant = summarizer(
              variant['contents'].values.tolist(),
              min_length=MIN_TARGET_LENGTH,
              max_length=MAX_TARGET_LENGTH,
          )

In [None]:
# Keep only the generated text of each pipeline result and store it as a
# single-column ('contents') CSV next to the input files.
summary_original = pd.DataFrame(
    [result['summary_text'] for result in summary_original],
    columns=['contents'],
)
summary_original.to_csv("/content/drive/MyDrive/숨고/이인균님(인물 분류, 문서 요약)/summary_original.csv", index=False)

summary_variant = pd.DataFrame(
    [result['summary_text'] for result in summary_variant],
    columns=['contents'],
)
summary_variant.to_csv("/content/drive/MyDrive/숨고/이인균님(인물 분류, 문서 요약)/summary_variant.csv", index=False)

## Original / variant 차이 비교

In [None]:
# Reload the saved summaries so this cell can run without re-generating them.
summary_original = pd.read_csv("/content/drive/MyDrive/숨고/이인균님(인물 분류, 문서 요약)/summary_original.csv")
summary_variant = pd.read_csv("/content/drive/MyDrive/숨고/이인균님(인물 분류, 문서 요약)/summary_variant.csv")

# For each original/variant pair, collect the space-separated tokens that occur
# in exactly one of the two summaries (set symmetric difference). The tokens
# are sorted because set iteration order is not stable between runs, which
# would make difference.csv non-reproducible.
df_diff = [
    sorted(set(org_text.split(" ")) ^ set(var_text.split(" ")))
    for org_text, var_text in zip(
        summary_original['contents'], summary_variant['contents']
    )
]

df_diff = pd.DataFrame({'difference': df_diff})
df_diff.to_csv("/content/drive/MyDrive/숨고/이인균님(인물 분류, 문서 요약)/difference.csv", index=False)
# Bare expression so the notebook displays the table.
df_diff

## mysummary와 original summary 간의 ROUGE 계산

In [None]:
!pip install rouge py-rouge
!pip install rouge-score

In [None]:
from rouge import Rouge
from rouge_score import rouge_scorer
import numpy as np

# Header-less, CP949-encoded CSV of summaries; name its single column 'contents'.
mysummary = pd.read_csv("/content/drive/MyDrive/숨고/이인균님(인물 분류, 문서 요약)/mysummary.csv",encoding='cp949',header=None)
mysummary.columns = ['contents']


class _WhitespaceTokenizer:
    """Whitespace tokenizer for ``rouge_scorer.RougeScorer``.

    rouge_score's default tokenizer keeps only ASCII letters and digits, which
    removes Korean text entirely; splitting on whitespace keeps every word.
    """

    def tokenize(self, text):
        """Return the whitespace-separated tokens of *text*."""
        return text.split()


rouge = Rouge()
# The English stemmer is dropped: it does not apply to Korean, and it is only
# used by the default tokenizer anyway.
# Requires a rouge_score release that accepts the `tokenizer` argument.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeLsum'],
    tokenizer=_WhitespaceTokenizer(),
)

# Per-pair F-measures; averaged over all pairs at the end.
rouge_1 = []
rouge_2 = []
rouge_lsum = []
rouge_u = []
rouge_su = []
rdass = []
for i, j in zip(mysummary['contents'], summary_original['contents']):
    # NOTE(review): assumes the `rouge` package, whose get_scores returns a
    # list with one dict per pair — confirm it is not shadowed by py-rouge.
    pair_scores = rouge.get_scores(i, j)[0]
    # scorer.score returns a dict keyed by ROUGE type (not a list), each value
    # being a (precision, recall, fmeasure) tuple.
    rouge_u_scores = scorer.score(i, j)

    # Store the scalar F-measure, not the whole {'r', 'p', 'f'} dict, so that
    # np.mean can average the list.
    rouge_1.append(pair_scores['rouge-1']['f'])
    rouge_2.append(pair_scores['rouge-2']['f'])

    rouge1_f = rouge_u_scores['rouge1'].fmeasure
    rouge2_f = rouge_u_scores['rouge2'].fmeasure
    rouge_lsum.append(rouge_u_scores['rougeLsum'].fmeasure)
    # NOTE(review): the "rouge-u" / "rouge-su" / "rdass" labels are filled with
    # rouge_score's rouge1 / rouge2 F-measures and their mean.
    rouge_u.append(rouge1_f)
    rouge_su.append(rouge2_f)
    rdass.append((rouge1_f + rouge2_f) / 2)

# Print the results
print("rouge-1: ", np.mean(rouge_1))
print("rouge-2: ", np.mean(rouge_2))
print("rouge-lsum: ", np.mean(rouge_lsum))
print("rouge-u: ", np.mean(rouge_u))
print("rouge-su: ", np.mean(rouge_su))
print("rdass: ", np.mean(rdass))