<a href="https://colab.research.google.com/github/trinhtin/generative-ai-learning-resources/blob/main/webq_t5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install datasets transformers

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194

In [None]:
# Import libraries
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

In [None]:
# Build a tiny in-memory HTML quiz corpus as (question, answer) pairs
qa_pairs = [
    ("What does HTML stand for?", "Hyper Text Markup Language"),
    ("Which tag is used to create a hyperlink?", "<a>"),
    ("What attribute specifies the URL of the page the link goes to?", "href"),
    ("How do you create a numbered list?", "<ol>"),
    ("Which tag is used to create a table row?", "<tr>"),
]

# Split the pairs into two parallel columns keyed by name
data = {
    "question": [question_text for question_text, _ in qa_pairs],
    "answer": [answer_text for _, answer_text in qa_pairs],
}

# Wrap the columns in a DataFrame, then convert it into a Dataset
df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

In [None]:
# Load the pretrained tokenizer for the checkpoint used throughout this notebook
checkpoint_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(checkpoint_name)

def preprocess_data(examples):
    """Tokenize a batch of question/answer pairs for seq2seq fine-tuning.

    Args:
        examples: A batch mapping with a 'question' list and an 'answer' list
            of equal length (the form passed by ``Dataset.map(batched=True)``).

    Returns:
        The tokenizer output for the prefixed questions, with an extra
        "labels" entry holding the tokenized answers. Padding positions in
        the labels are replaced by -100.
    """
    inputs = ["generate quiz question: " + q for q in examples['question']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Tokenize the targets via `text_target`, which replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["answer"], max_length=512, truncation=True, padding="max_length")

    # Answers are padded out to max_length. Replace the pad ids with -100 so
    # the loss skips those positions instead of being dominated by padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Run the preprocessing function over the whole dataset, one batch at a time
tokenized_dataset = dataset.map(function=preprocess_data, batched=True)

In [None]:
# Load the pretrained t5-small weights with the conditional-generation head
model_checkpoint = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

In [None]:
# Fine-tune the model
# The training set holds five examples, so with a batch size of 4 and 3 epochs
# the whole run is only a handful of optimizer steps. The schedule is sized to
# match: a 500-step warmup would keep the learning rate near zero for the
# entire run, and logging every 10 steps would never report a training loss.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=0,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1,
)

# NOTE: eval_dataset reuses the training data; this notebook has no held-out split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset
)

trainer.train()

In [None]:
# Function to generate quiz questions
def generate_quiz(model, tokenizer, prompt, num_questions=5):
    """Generate candidate quiz strings for a prompt using beam search.

    Args:
        model: Object exposing ``generate(input_ids, **kwargs)``; if it has a
            ``device`` attribute the encoded prompt is moved there first.
        tokenizer: Object exposing ``encode(...)`` and ``decode(...)``.
        prompt: Topic text appended to the "generate quiz question: " prefix.
        num_questions: Number of sequences to return.

    Returns:
        A list of ``num_questions`` decoded strings (empty if
        ``num_questions`` is less than 1).
    """
    if num_questions < 1:
        return []

    inputs = tokenizer.encode("generate quiz question: " + prompt, return_tensors="pt", max_length=512, truncation=True)

    # After training the model may live on an accelerator; the encoded prompt
    # must be on the same device or generate() fails with a device mismatch.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = inputs.to(device)

    # Beam search cannot return more sequences than it has beams, so widen
    # the beam when more than four questions are requested.
    beam_count = max(4, num_questions)
    outputs = model.generate(
        inputs,
        max_length=512,
        num_return_sequences=num_questions,
        num_beams=beam_count,
        early_stopping=True,
    )

    questions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return questions

In [None]:
# Produce quiz output for the "HTML" topic and print it as a numbered list
quiz_questions = generate_quiz(model, tokenizer, "HTML")
for number, text in enumerate(quiz_questions, start=1):
    print(f"Question {number}: {text}")