In [None]:
from datasets import load_dataset
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering

def preprocess_data(example, tokenizer):
    """Tokenize one question/context pair and attach answer-span token labels.

    Args:
        example: Mapping with "question" and "context" strings and an
            "answers" mapping whose "answer_start" (character offset into the
            context) and "text" lists hold at least one entry. Only the first
            answer is used.
        tokenizer: Tokenizer callable that supports
            ``return_offsets_mapping=True`` and exposes ``sep_token_id``.

    Returns:
        The tokenizer output with two extra keys:

        * ``start_positions``: index of the first context token that overlaps
          the answer's character span.
        * ``end_positions``: index one past the last context token that
          overlaps the answer (exclusive end, so
          ``input_ids[start_positions:end_positions]`` is the answer).

        Both keys are set to ``len(input_ids)`` when the answer is not fully
        contained in the (possibly truncated) context window, so callers can
        filter such rows out.
    """
    tokenized = tokenizer(
        example["question"],
        example["context"],
        truncation="only_second",
        return_offsets_mapping=True
    )
    input_ids = tokenized["input_ids"]
    offsets = tokenized["offset_mapping"]

    answer_start = example["answers"]["answer_start"][0]
    answer_end = answer_start + len(example["answers"]["text"][0])

    # Sentinel meaning "answer not available in this window".
    not_found = len(input_ids)
    tokenized["start_positions"] = not_found
    tokenized["end_positions"] = not_found

    # Offsets are relative to each segment, so question tokens must not be
    # compared with the answer's context offsets: only look past the first
    # separator. Zero-width offsets belong to special tokens (separators,
    # end-of-sequence) and are skipped so that an answer at character 0 is not
    # matched against a separator's (0, 0) offset.
    first_sep = input_ids.index(tokenizer.sep_token_id)
    context_tokens = [
        (idx, span[0], span[1])
        for idx, span in enumerate(offsets)
        if idx > first_sep and span[0] != span[1]
    ]

    # If the visible context ends before the answer does, the answer was cut
    # off by truncation; keep the sentinel labels.
    if not context_tokens or context_tokens[-1][2] < answer_end:
        return tokenized

    # Tokens whose character span intersects [answer_start, answer_end).
    overlapping = [
        idx
        for idx, tok_start, tok_end in context_tokens
        if tok_end > answer_start and tok_start < answer_end
    ]
    if overlapping:
        tokenized["start_positions"] = overlapping[0]
        tokenized["end_positions"] = overlapping[-1] + 1

    return tokenized

# Checkpoint shared by the tokenizer and the question-answering model.
model_name = "klue/roberta-base"
model = RobertaForQuestionAnswering.from_pretrained(model_name)
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)

dataset = load_dataset("klue", "mrc")

# Pipeline: drop unanswerable rows, attach token-level answer labels, then keep
# only rows whose start and end labels both fall below the tokenizer's maximum
# sequence length.
max_positions = tokenizer.model_max_length
processed_dataset = (
    dataset
    .filter(lambda row: not row["is_impossible"])
    .map(lambda row: preprocess_data(row, tokenizer), batched=False)
    .filter(
        lambda row: row["start_positions"] < max_positions
        and row["end_positions"] < max_positions
    )
)

# Show the raw and processed splits side by side to see how many rows remain.
print(dataset)
print(processed_dataset)

In [None]:
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer

# Pad each batch only up to its longest sequence.
collator = DataCollatorWithPadding(tokenizer, padding="longest")

# `eval_steps` only has an effect under a step-based evaluation strategy; with
# the default strategy ("no") the eval dataset is never used during training.
# The keyword was renamed from `evaluation_strategy` to `eval_strategy` in
# newer transformers releases, so pick whichever the installed version defines.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_arguments = TrainingArguments(
    output_dir="question-answering",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    num_train_epochs=1,
    eval_steps=250,
    logging_steps=250,
    seed=42,
    **{eval_strategy_key: "steps"},
)

# Train on a 10,000-row subset and evaluate on 100 validation rows to keep the
# run short.
trainer = Trainer(
    model=model,
    args=training_arguments,
    data_collator=collator,
    train_dataset=processed_dataset["train"].select(range(10000)),
    eval_dataset=processed_dataset["validation"].select(range(100))
)

trainer.train()

In [None]:
import torch

# Use the GPU when one is available and switch the model to evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

question = "대한민국의 수도는 어디인가요?"
context = "서울은 대한민국의 수도다."
inputs = tokenizer(question, context, return_tensors="pt")
inputs = inputs.to(device)

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

# Take the highest-scoring start and end token independently.
start_index = outputs.start_logits.argmax(dim=-1).item()
end_index = outputs.end_logits.argmax(dim=-1).item()

# The end index is exclusive here, matching how preprocess_data builds
# `end_positions` (start + number of answer tokens).
predicted_ids = inputs["input_ids"][0, start_index:end_index]
predicted_text = tokenizer.decode(predicted_ids)
print(predicted_text)

In [None]:
from evaluate import evaluator

# Score the fine-tuned model on the first 100 validation rows.
# NOTE(review): preprocess_data labels `end_positions` as one past the last
# answer token, while the question-answering pipeline used by the evaluator
# presumably decodes an inclusive end index. Confirm the two agree, otherwise
# predictions may carry an extra trailing token.
metric = evaluator("question-answering")
eval_subset = processed_dataset["validation"].select(range(100))
column_mapping = {
    "id_column": "guid",
    "question_column": "question",
    "context_column": "context",
    "label_column": "answers",
}
results = metric.compute(
    model,
    tokenizer=tokenizer,
    data=eval_subset,
    **column_mapping,
)
print(results)