In [8]:
from datasets import load_from_disk
from transformers import AutoTokenizer

# Checkpoint name; the matching tokenizer is used by the preprocessing cells below.
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Load the dataset previously stored on disk at this relative path.
dataset = load_from_disk("../data/raw_squad")

# Bare last expression so the notebook renders the dataset summary.
dataset


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [9]:
# Rows to keep per split after a fixed-seed shuffle.
subset_sizes = {"train": 10000, "validation": 2000}

# Note: this rebinds `dataset` from the loaded object to a plain dict that
# holds only the reduced splits.
dataset = {
    split: dataset[split].shuffle(seed=42).select(range(n_rows))
    for split, n_rows in subset_sizes.items()
}

# Standalone names for the reduced splits.
small_train = dataset["train"]
small_val = dataset["validation"]

print(f"Train size: {len(small_train)}")
print(f"Validation size: {len(small_val)}")


Train size: 10000
Validation size: 2000


In [10]:
max_input_length = 512
max_target_length = 64

# Label value skipped by PyTorch's cross-entropy loss (its default ignore_index).
LABEL_IGNORE_INDEX = -100


def preprocess(example, tok=None):
    """Turn one QA record into fixed-length model inputs and labels.

    Args:
        example: Mapping with "question" and "context" strings and an
            "answers" mapping whose "text" entry is a list of answer strings.
            Only the first answer is used as the target. An empty list yields
            an empty target string instead of raising IndexError.
        tok: Optional tokenizer used instead of the module-level ``tokenizer``.
            It must accept the keyword arguments passed below and expose
            ``pad_token_id``.

    Returns:
        The tokenizer output for the prompt, padded/truncated to
        ``max_input_length``, with an added "labels" list of length
        ``max_target_length`` in which every padding position is -100.
    """
    # Resolve lazily so the module-level tokenizer is only needed when no
    # explicit one is supplied.
    tok = tokenizer if tok is None else tok

    question = example["question"]
    context = example["context"]

    # Guard against records with no answers (would otherwise raise IndexError).
    answer_texts = example["answers"]["text"]
    answer = answer_texts[0] if len(answer_texts) > 0 else ""

    input_text = f"question: {question}  context: {context}"

    model_inputs = tok(
        input_text,
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )

    labels = tok(
        answer,
        max_length=max_target_length,
        truncation=True,
        padding="max_length"
    )

    # Labels are padded to a fixed length; without masking, the loss would also
    # be computed on the padding positions. Replace pad ids with the ignore index.
    pad_id = tok.pad_token_id
    model_inputs["labels"] = [
        LABEL_IGNORE_INDEX if token_id == pad_id else token_id
        for token_id in labels["input_ids"]
    ]
    return model_inputs


In [11]:
# Tokenize both splits with the same function, dropping the original columns
# so that only the fields returned by `preprocess` remain.
tokenized = {
    split: dataset[split].map(
        preprocess,
        remove_columns=dataset[split].column_names,
    )
    for split in ("train", "validation")
}


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [12]:
from datasets import DatasetDict

# Wrap the plain dict of splits so both can be written with a single call.
tokenized = DatasetDict(tokenized)

tokenized_output_dir = "../data/tokenized_squad_small"
tokenized.save_to_disk(tokenized_output_dir)

print("Tokenized dataset saved.")


Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenized dataset saved.


In [13]:
# Sanity check: inspect the sequence lengths of the first tokenized training row.
sample = tokenized["train"][0]
input_len = len(sample["input_ids"])
target_len = len(sample["labels"])
print(f"Input length: {input_len}")
print(f"Target length: {target_len}")


Input length: 512
Target length: 64
