In [None]:
from datasets import load_dataset
from transformers import T5TokenizerFast, T5ForConditionalGeneration

def preprocess_data(example, tokenizer):
    """Tokenize a batch of English/Korean translation pairs.

    Args:
        example: Batch mapping whose "translation" entry is a sequence of
            dicts, each holding the "en" and "ko" text of one pair.
        tokenizer: Callable invoked as
            ``tokenizer(sources, text_target=targets, truncation=True)``.

    Returns:
        Whatever the tokenizer call returns.
    """
    sources, targets = [], []
    for pair in example["translation"]:
        # Each side is tagged with its language prefix before tokenization.
        sources.append("en: " + pair["en"])
        targets.append("ko: " + pair["ko"])
    return tokenizer(sources, text_target=targets, truncation=True)

# Single source of truth for the checkpoint: the tokenizer and the model are
# both loaded from this name so they cannot drift apart when it is changed.
model_name = "KETI-AIR/long-ke-t5-small"
tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# English-Korean configuration of the OPUS-100 corpus.
dataset = load_dataset("Helsinki-NLP/opus-100", "en-ko")

# Tokenize in batches and drop the raw columns so that only the tokenizer
# output remains in the processed dataset.
processed_dataset = dataset.map(
    lambda example: preprocess_data(example, tokenizer),
    batched=True,
    remove_columns=dataset["train"].column_names
)

# Sanity check: show the first processed test example and decode both sides
# back to text.
sample = processed_dataset["test"][0]
print(sample)
print("변환된 출발 언어 :", tokenizer.decode(sample["input_ids"]))
print("변환된 도착 언어 :", tokenizer.decode(sample["labels"]))

In [None]:
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Collator that pads every batch to its longest sequence and returns
# PyTorch tensors.
seq2seq_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding="longest",
    return_tensors="pt"
)

training_arguments = Seq2SeqTrainingArguments(
    output_dir="t5-translation",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    num_train_epochs=1,
    # `eval_steps` only takes effect with a step-based evaluation strategy.
    # The default strategy is "no", which would leave `eval_dataset` unused
    # during training.
    # NOTE: transformers < 4.41 spells this argument `evaluation_strategy`.
    eval_strategy="steps",
    eval_steps=2500,
    logging_steps=2500,
    seed=42
)

# Train on the first 100,000 training examples and evaluate on the first
# 1,000 validation examples.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_arguments,
    data_collator=seq2seq_collator,
    train_dataset=processed_dataset["train"].select(range(100000)),
    eval_dataset=processed_dataset["validation"].select(range(1000))
)

trainer.train()

In [None]:
import torch

# Use the GPU when one is available, then put the model in evaluation mode
# on that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.eval()
model.to(device)

# The source sentence carries the same "en: " prefix used in preprocessing.
data = "en: It's always great to acquire new knowledge."
inputs = tokenizer(data, return_tensors="pt").to(device)

# Beam-search settings for the demo translation.
generation_kwargs = {
    "max_length": 512,
    "num_beams": 4,
    "no_repeat_ngram_size": 2,
    "early_stopping": True,
}

with torch.no_grad():
    outputs = model.generate(**inputs, **generation_kwargs)

translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(translated_text)

In [None]:
import evaluate
from torch.utils.data import DataLoader

# Evaluate on the first 100 test examples. The subset is built once so the
# generated translations and the references come from the same object.
test_subset = processed_dataset["test"].select(range(100))

dataloader = DataLoader(
    test_subset,
    collate_fn=seq2seq_collator,
    batch_size=4,
    shuffle=False  # keep order aligned with the references below
)

generated_translated = []
true_translated_ids = test_subset["labels"]
true_translated = tokenizer.batch_decode(true_translated_ids, skip_special_tokens=True)

with torch.no_grad():
    for batch in dataloader:
        batch = batch.to(device)
        # Pass only the encoder inputs. The collated batch also contains the
        # padded `labels` tensor, which generation does not need.
        # NOTE(review): max_length here (1026) differs from the 512 used in
        # the single-sentence demo — confirm this is intended.
        output = model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=1026,
            num_beams=4,
            no_repeat_ngram_size=2,
            early_stopping=True
        )
        batch_translated = tokenizer.batch_decode(output, skip_special_tokens=True)
        generated_translated.extend(batch_translated)

# Corpus-level BLEU of the generated translations against the decoded labels.
metric = evaluate.load("bleu")
bleu_scores = metric.compute(
    predictions=generated_translated,
    references=true_translated
)
print(bleu_scores)