<a href="https://colab.research.google.com/github/trinhtin/generative-ai-learning-resources/blob/main/fine_tuning_t5_generate_w3schools_quiz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install transformers datasets

# Step 1: Prepare the Dataset

import pandas as pd

# Hand-written sample records. Each one pairs a short passage (`context`)
# with a quiz `question` about it and the expected `answer`.
data = [
    dict(context=passage, question=prompt, answer=reply)
    for passage, prompt, reply in (
        (
            "HTML stands for Hyper Text Markup Language. It is used to create web pages.",
            "What does HTML stand for?",
            "Hyper Text Markup Language",
        ),
        (
            "CSS stands for Cascading Style Sheets. It is used to style web pages.",
            "What does CSS stand for?",
            "Cascading Style Sheets",
        ),
    )
]

# Persist the records as a CSV (without the index column) so that Step 2
# can read them back from disk.
df = pd.DataFrame(data)
df.to_csv('web_programming_quiz.csv', index=False)

In [None]:
# Step 2: Load and Preprocess the Data

from datasets import load_dataset

# Read back the CSV written in Step 1; the rows are accessed below
# through the 'train' split.
dataset = load_dataset(
    path='csv',
    data_files='web_programming_quiz.csv',
)

def preprocess_function(examples):
    """Turn one raw record into a (source, target) text pair for seq2seq training.

    Args:
        examples: A single record (mapping) with string values under the keys
            'context', 'question' and 'answer'.

    Returns:
        dict with two keys:
            'input_text'  - the context passage, unchanged.
            'target_text' - the question and answer joined by " [SEP] ".
    """
    source = examples['context']
    # The literal " [SEP] " marker is what generate_quiz later splits on.
    target = examples['question'] + " [SEP] " + examples['answer']
    return dict(input_text=source, target_text=target)

# Swap the raw CSV columns for the input_text / target_text pair:
# every column present before mapping is dropped from the result.
original_columns = dataset['train'].column_names
dataset = dataset.map(preprocess_function, remove_columns=original_columns)

In [None]:
# Step 3: Fine-tune T5 Model

from transformers import (
    DataCollatorForSeq2Seq,
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)

# Checkpoint to fine-tune; the tokenizer and the model weights are both
# loaded from this same name.
model_name = "t5-small"
tokenizer, model = (
    T5Tokenizer.from_pretrained(model_name),
    T5ForConditionalGeneration.from_pretrained(model_name),
)

def tokenize_function(examples):
    """Tokenize source and target texts and attach the targets as labels.

    Uses the module-level `tokenizer`.

    Args:
        examples: Mapping with 'input_text' and 'target_text' entries
            (a batch of them when called through `map(..., batched=True)`).

    Returns:
        The tokenizer output for 'input_text' (truncated to 512 tokens) with
        an extra 'labels' entry holding the token ids of 'target_text'
        (truncated to 128 tokens).
    """
    encoded_inputs = tokenizer(examples['input_text'], truncation=True, max_length=512)
    encoded_targets = tokenizer(examples['target_text'], truncation=True, max_length=128)
    encoded_inputs['labels'] = encoded_targets['input_ids']
    return encoded_inputs

# Tokenize in batches; adds the model inputs and a `labels` column.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases. Use whichever keyword the installed version declares, so this
# cell runs on both old and new versions.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    **{eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
)

# Examples are tokenized without padding, so they have different lengths.
# This collator pads inputs per batch and pads labels with -100 so the
# padded positions are ignored by the loss. Without it the default collator
# cannot stack the ragged sequences into a tensor.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    # NOTE: evaluation reuses the training split (the demo CSV has no
    # held-out data), so the reported eval loss is not a generalization metric.
    eval_dataset=tokenized_datasets['train'],
    data_collator=data_collator,
)

trainer.train()

In [None]:
# Step 4: Generate Quiz Questions

def generate_quiz(context, max_length=128):
    """Generate a quiz question and its answer from a context passage.

    Uses the module-level `tokenizer` and `model`.

    Args:
        context: Passage of text to generate a question about.
        max_length: Maximum length passed to `model.generate`.

    Returns:
        A `(question, answer)` tuple of stripped strings. The generated text
        is split on the first "[SEP]" marker. If the model emits no marker,
        the whole output is returned as the question and the answer is "".
    """
    # Truncate to the same 512-token limit used when tokenizing training inputs.
    encoded = tokenizer(context, return_tensors='pt', max_length=512, truncation=True)
    # After training the model may live on a GPU; inputs must be on the same
    # device or `generate` fails with a device-mismatch error.
    encoded = encoded.to(model.device)
    outputs = model.generate(**encoded, max_length=max_length)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # `partition` never raises: it tolerates a missing marker and splits on
    # the first one if several are present, unlike tuple-unpacking `split`.
    question, _, answer = generated_text.partition("[SEP]")
    return question.strip(), answer.strip()

# Demo: generate one question/answer pair from a passage and print it.
context = "JavaScript is a programming language that can be run on the browser and server side."
question, answer = generate_quiz(context)
print(f"Question: {question}", f"Answer: {answer}", sep="\n")