In [1]:
# KorQuAD데이터셋을 활용해 Generation-based MRC를 이해하고 huggingface 라이브러리를 활용해 데이터 전치리와 모델 학습 및 평가

# 필요한 패키지 설치

In [2]:
!pip install datasets
!pip install transformers
!pip install sentencepiece
!pip install evaluate
!pip install nltk



In [3]:
from datasets import load_dataset
from evaluate import load
import nltk
nltk.download('punkt_tab')
nltk.download('punkt')

dataset = load_dataset("squad_kor_v1")
metrics = load("squad")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Pre-trained 모델 및 토크나이저 불러오기

eneration-based MRC에서 사용하게될 모델은 Extraction-based MRC와는 다르게 T5 모델을 활용

T5 모델은 Seq2Seq모델이고 Text를 입력으로 받아서 Text를 출력하는 모델

In [4]:
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer
)

In [45]:
model_name = "google/mt5-small"

In [46]:
config = AutoConfig.from_pretrained(
    model_name,
    cache_dir=None,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=None,
    use_fast=True,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    config=config,
    cache_dir=None,
)



In [47]:
max_source_length = 1024
max_target_length = 128
padding = False
preprocessing_num_workers = 12
num_beams = 2
max_train_samples = 16
max_val_samples = 16
num_train_epochs = 3

# 전처리하기

T5의 Text-to-Text format을 위한 전처리 함수

In [48]:
def preprocess_function(examples):
    inputs = [f'question: {q}  context: {c} </s>' for q, c in zip(examples['question'], examples['context'])]
    targets = [f'{a["text"][0]} </s>' for a in examples['answers']]
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

# Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    model_inputs["example_id"] = []
    for i in range(len(model_inputs["labels"])):
        model_inputs["example_id"].append(examples["id"][i])
    return model_inputs

In [49]:
column_names = dataset['train'].column_names

In [50]:
train_dataset = dataset["train"]
train_dataset = train_dataset.select(range(max_train_samples))
train_dataset = train_dataset.map(
            preprocess_function,
            batched=True,
            num_proc=preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=False,
        )

Map (num_proc=12):   0%|          | 0/16 [00:00<?, ? examples/s]



In [51]:
eval_dataset = dataset["validation"]
eval_dataset = eval_dataset.select(range(max_val_samples))
eval_dataset = eval_dataset.map(
            preprocess_function,
            batched=True,
            num_proc=preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=False,
        )

Map (num_proc=12):   0%|          | 0/16 [00:00<?, ? examples/s]



# Fine-tuning하기

pretrained T5 모델을 KorQuAD에 맞게 Fine-tuning을 진행

In [52]:
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)

In [53]:
label_pad_token_id = tokenizer.pad_token_id

data_collator = DataCollatorForSeq2Seq(
            tokenizer,
            model=model,
            label_pad_token_id=label_pad_token_id,
            pad_to_multiple_of=None,
        )

In [54]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

In [55]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
# decoded_labels is for rouge metric, not used for f1/em metric
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

# Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    formatted_predictions = [{"id": ex['id'], "prediction_text": decoded_preds[i]} for i, ex in enumerate(dataset["validation"].select(range(max_val_samples)))]
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in dataset["validation"].select(range(max_val_samples))]

    result = metrics.compute(predictions=formatted_predictions, references=references)
    return result

In [56]:
args = Seq2SeqTrainingArguments(
    output_dir='outputs',
    do_train=True,
    do_eval=True,
    predict_with_generate=True,
    num_train_epochs=num_train_epochs
)

In [57]:
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [58]:
train_result = trainer.train(resume_from_checkpoint=None)
train_result

Step,Training Loss


TrainOutput(global_step=6, training_loss=42.269439697265625, metrics={'train_runtime': 40.1555, 'train_samples_per_second': 1.195, 'train_steps_per_second': 0.149, 'total_flos': 24289438924800.0, 'train_loss': 42.269439697265625, 'epoch': 3.0})

In [60]:
## 평가하기
metrics = trainer.evaluate(
    max_length=max_target_length, num_beams=num_beams, metric_key_prefix="eval"
)
metrics

AttributeError: 'dict' object has no attribute 'compute'

In [None]:
# # 예제
# document = "세종대왕은 언제 태어났어?"
# input_ids = tokenizer(document, return_tensors='pt').input_ids
# outputs = model.generate(input_ids)
# tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

document = "질문: 세종대왕은 언제 태어났어? 답변:"
input_ids = tokenizer(document, return_tensors='pt').input_ids.to(device)
model = model.to(device)

outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))