In [3]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset, load_dataset

# Load CommonsenseQA dataset
dataset = load_dataset("commonsense_qa", split="train")

# Preprocess for T5
def preprocess(example):
    # Combine question and choices for input, use correct answer as target
    choices = " ".join([f"{chr(65+i)}: {c}" for i, c in enumerate(example["choices"]["text"])])
    input_text = f"question: {example['question']} choices: {choices}"
    target_text = example["choices"]["text"][example["choices"]["label"].index(example["answerKey"])]
    return {
        "input_text": input_text,
        "target_text": target_text
    }

dataset = dataset.map(preprocess)

# Tokenizer and model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def tokenize(batch):
    inputs = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(batch["target_text"], padding="max_length", truncation=True, max_length=64)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_dataset = dataset.map(tokenize, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir=f"./{model_name}_cqa_finetuned",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_steps=500,
    logging_steps=100,
    learning_rate=3e-4,
    fp16=True,
    report_to="none"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Train
trainer.train()

# Save the model and tokenizer
model.save_pretrained(f"./{model_name}_cqa_finetuned")
tokenizer.save_pretrained(f"./{model_name}_cqa_finetuned")

Step,Training Loss
100,0.8951
200,0.0498
300,0.0476
400,0.0437
500,0.043
600,0.0427
700,0.0413
800,0.0389
900,0.0408
1000,0.0391


('./t5-small_cqa_finetuned/tokenizer_config.json',
 './t5-small_cqa_finetuned/special_tokens_map.json',
 './t5-small_cqa_finetuned/spiece.model',
 './t5-small_cqa_finetuned/added_tokens.json')