In [None]:
#seq2seq model
from IPython.display import Image
Image(url="https://tutorials.pytorch.kr/_images/seq2seq.png", width=800, height=400)

In [None]:
!pip install transformers[sentencepiece] datasets evaluate rouge_score

In [None]:
import nltk
# Sentence-splitting data for nltk.sent_tokenize, which compute_metrics calls before ROUGE.
# NLTK 3.8.2+ loads the 'punkt_tab' resource instead of the pickled 'punkt' models,
# so fetch both to keep the notebook working with whichever NLTK version is installed.
nltk.download('punkt')
nltk.download('punkt_tab')
import datasets # https://huggingface.co/docs/datasets/nlp_process
import evaluate # https://huggingface.co/docs/evaluate/choosing_a_metric
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

### data loading & preprocessing

In [None]:
qg_data = datasets.load_dataset("lmqg/qg_koquad") #https://huggingface.co/datasets/lmqg/qg_koquad
qg_data

In [None]:
qg_data['train'][0]

In [None]:
Image(url="https://raw.githubusercontent.com/asahi417/lm-question-generation/master/assets/qg_diagram.png", width=600, height=400)

In [None]:
# Tokenizer and seq2seq model are loaded from one shared checkpoint name.
# https://github.com/AIRC-KETI/ke-t5
MODEL_CHECKPOINT = "KETI-AIR/ke-t5-small-ko"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html
Image(url="https://static.news.zumst.com/images/98/2020/02/25/40908d3b82274fe683755fcbba5fdea2.png", width=800, height=300)

In [None]:
# your code
def prefix_add(texts, prefix="", max_input_length=256, max_target_length=64):
    """Tokenize one batch of rows into model inputs and target labels.

    Written for ``Dataset.map(..., batched=True)``: each column in ``texts``
    is a list with one entry per row of the batch. Uses the module-level
    ``tokenizer``.

    Args:
        texts: Batch with a "paragraph_question" column (source strings) and
            an "answer" column (target strings).
        prefix: Trigger text prepended to every source string.
        max_input_length: Token limit for the sources; longer ones are truncated.
        max_target_length: Token limit for the targets; longer ones are truncated.

    Returns:
        The tokenizer output for the sources, with the target token ids
        stored under the additional key "labels".
    """
    sources = [prefix + doc for doc in texts["paragraph_question"]]
    model_inputs = tokenizer(sources, max_length=max_input_length, truncation=True)
    targets = tokenizer(texts["answer"], max_length=max_target_length, truncation=True)
    # The trainer reads the target ids from the "labels" key.
    model_inputs["labels"] = targets["input_ids"]
    return model_inputs

# Run prefix_add over the dataset in batches (batched=True), so each call receives a batch
# of rows rather than a single row; the columns it returns are added to the result.
tokenized_dataset = qg_data.map(prefix_add, batched=True,)
tokenized_dataset

In [None]:
# Keep only the leading fraction of every split.
SUBSET_FRACTION = 0.25  # share of each split to keep

for split in tokenized_dataset.keys():  # train, validation, test
    # Size the subset from the untouched qg_data split rather than the split being
    # overwritten, so re-running this cell does not shrink the data a second time.
    target_size = int(len(qg_data[split]) * SUBSET_FRACTION)
    # Never ask for more rows than the split currently holds.
    target_size = min(target_size, len(tokenized_dataset[split]))
    tokenized_dataset[split] = tokenized_dataset[split].select(range(target_size))

In [None]:
tokenized_dataset

### hyperparameter setups

In [None]:
metric = evaluate.load("rouge") # https://huggingface.co/spaces/evaluate-metric/rouge
metric

In [None]:
def compute_metrics(eval_pred, tokenizer=tokenizer, metric=metric):
    """Decode predicted and reference token ids and score them with the metric.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. Positions
            holding -100 are treated as padding in both arrays.
        tokenizer: Tokenizer used to turn ids back into text.
        metric: Object whose ``compute(predictions=..., references=...)``
            returns a mapping from score name to a number.

    Returns:
        Dict mapping each score name to the score multiplied by 100 and
        rounded to 4 decimal places.
    """
    predictions, labels = eval_pred
    # Predictions may arrive as a tuple whose first element holds the token ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding marker, not a real token id; replace it with the pad id so
    # batch_decode does not fail on it. Applied to predictions as well as labels.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Put one sentence per line before scoring.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # NOTE(review): use_stemmer and the metric's default tokenization presumably target
    # English; confirm they behave as intended on Korean text.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Report scores as percentages rounded to 4 decimals.
    return {key: round(value * 100, 4) for key, value in result.items()}

In [None]:
# https://huggingface.co/docs/transformers/v4.24.0/en/main_classes/trainer#transformers.TrainingArguments
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints are written
    output_dir="KoT5_for_QG",
    # Optimisation
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=32,
    # Logging, evaluation and checkpointing cadence
    logging_steps=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Evaluate on generated sequences
    predict_with_generate=True,
)

### model training

In [None]:
# https://huggingface.co/docs/transformers/main_classes/trainer
# Collator built from the same tokenizer and model that are handed to the trainer
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!unzip '/content/drive/MyDrive/연세대학교 2학년 2학기 (2023-1)/CSI4121 Big Data/HW2/checkpoint-47740.zip'

In [None]:
trainer.train()
# trainer.train(resume_from_checkpoint="/content/KoT5_for_QG/checkpoint-47740")

In [None]:
import locale
# Replace locale.getpreferredencoding with a stub that always reports "UTF-8".
# NOTE(review): presumably a workaround so the `!zip` shell command in the next cell
# runs on Colab — confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!zip -r /content/KoT5_for_QG/checkpoint-68200.zip /content/KoT5_for_QG/checkpoint-68200

### evaluation & prediction

In [None]:
import os
trainer.evaluate(tokenized_dataset["test"], 
        num_beams=5)

In [None]:
from google.colab import runtime
runtime.unassign()

여기까지만 돌렸음~

In [None]:
import os  # imported here so this cell does not depend on the evaluation cell having run

save_model_path = "./KoT5_QG"  # relative to the current working directory; change as needed
# Creates the folder only when it is missing; no separate existence check required.
os.makedirs(save_model_path, exist_ok=True)

model.save_pretrained(save_model_path)

In [None]:
from transformers import T5ForConditionalGeneration
model = T5ForConditionalGeneration.from_pretrained(save_model_path)

In [None]:
# Encode the first test example as PyTorch tensors, truncated to at most 256 tokens
first_example = qg_data["test"][0]["paragraph_question"]
encoded = tokenizer(first_example, return_tensors="pt", max_length=256, truncation=True)
inputs = dict(encoded.items())  # plain-dict copy of the tokenizer output
inputs

In [None]:
# Beam-search generation for the encoded test example
outputs = model.generate(inputs["input_ids"], num_beams=5)

sample = qg_data["test"][0]
print("Ground paragraph_question:", sample["paragraph_question"])
print("Ground truth-answer:", sample["answer"])
print('-'*30)
# The model output is an answer, so label it as one (the label used to say "docstring").
print("Generated answer:", tokenizer.decode(outputs[0], skip_special_tokens=True))

📌 실습

직접 sentence를 모델에 넣어 주고, 모델이 질문을 어떻게 생성하는지 살펴보세요.

In [None]:
# Hand-written sample in the same "question: ..., context: ..." layout
pq = "question: 박영한이 처음으로 출연한 작품은?, context: 박영한은 대한민국의 방송인이다. 그는 2022년 tvN 백패커에서 처음으로 데뷔했다."
print("Paragraph_question:", pq)

# Encode as PyTorch tensors, truncated to at most 256 tokens
inputs = tokenizer(pq, return_tensors="pt", max_length=256, truncation=True)
inputs = {k:v for k, v in inputs.items()}

# Beam-search generation
outputs = model.generate(inputs["input_ids"], num_beams=5)

print('-'*30)
# The model output is an answer, so label it as one (the label used to say "docstring").
print("Generated answer:", tokenizer.decode(outputs[0], skip_special_tokens=True))

## Assignment

- 오늘 실습에서는 passage가 주어졌을 때 question을 생성하는 task를 수행하였습니다.
- 실습 내용을 바탕으로, passage와 question이 주어졌을 때 answer를 생성하도록 모델을 학습하세요.
- ROUGE-1 score가 5.0이 넘도록 데이터 양 및 하이퍼파라미터를 조정하세요.
- 보고서의 내용과 ROUGE score를 기반으로 채점합니다.
- 보고서(전체 A4 1장 내외):
  1.   실험 과정 요약
  2.   최적화 하이퍼파라미터 정리
  3.   최종 성능 결과 보고



