Preprocessing the GLUE SST-2 dataset, in which every example is a single
sentence (columns: `sentence`, `label`, `idx`) rather than a sentence pair.

In [2]:
from datasets import load_dataset

# Fetch the "sst2" configuration of the GLUE benchmark.
raw_datasets = load_dataset(path="glue", name="sst2")
# Keep the object as the cell's last expression so its summary is displayed.
raw_datasets

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [3]:
from transformers import AutoTokenizer

# Checkpoint whose matching tokenizer is loaded.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize every training sentence in a single call (no truncation or
# padding is requested here). The column is materialized into a plain list
# first: newer `datasets` releases may return a lazy column object for
# `dataset["column"]`, whereas the tokenizer accepts a str or a list of str.
tokenized_sentences = tokenizer(list(raw_datasets["train"]["sentence"]))

In [4]:
def tokenize_function(example):
    """Tokenize the ``"sentence"`` entry of ``example``.

    Relies on the module-level ``tokenizer`` and forwards ``truncation=True``
    to it. No padding is requested here.

    Args:
        example: Mapping with a ``"sentence"`` key. Whatever is stored under
            that key is handed to the tokenizer unchanged, so it may be one
            string or a batch of strings (as with ``map(..., batched=True)``).

    Returns:
        Whatever ``tokenizer`` returns for that input.
    """
    sentences = example["sentence"]
    return tokenizer(sentences, truncation=True)

In [5]:
# Run the tokenizer over every split. With batched=True the function is
# called with many examples at a time instead of one by one.
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)
# Display the resulting splits and their columns.
tokenized_datasets

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [6]:
from transformers import DataCollatorWithPadding

# Collator built from the same tokenizer that produced the input ids.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
# First 100 training examples, minus the "idx" and "sentence" columns,
# which are filtered out before the samples are handed to the collator.
samples = {
    name: values
    for name, values in tokenized_datasets["train"][:100].items()
    if name not in ["idx", "sentence"]
}
# Longest tokenized sequence among those samples.
max(map(len, samples["input_ids"]))

51

In [11]:
# Collate the selected samples into one batch.
batch = data_collator(samples)
# Show the shape of every field in the collated batch.
{name: tensor.shape for name, tensor in batch.items()}

{'input_ids': torch.Size([100, 51]),
 'token_type_ids': torch.Size([100, 51]),
 'attention_mask': torch.Size([100, 51]),
 'labels': torch.Size([100])}

In [13]:
# Dynamic preprocessing: one function that tokenizes any supported GLUE task.

# GLUE config name -> text column(s) to tokenize, in the order they are passed
# to the tokenizer (first text, then the optional paired text).
_GLUE_TEXT_COLUMNS = {
    "sst2": ("sentence",),
    "cola": ("sentence",),
    "qqp": ("question1", "question2"),
    "qnli": ("question", "sentence"),
    "mnli": ("premise", "hypothesis"),
    # The matched / mismatched MNLI configs use the same columns as "mnli".
    "mnli_matched": ("premise", "hypothesis"),
    "mnli_mismatched": ("premise", "hypothesis"),
    "ax": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "rte": ("sentence1", "sentence2"),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}


def preprocess_glue(example, task_name):
    """Tokenize the text column(s) of a GLUE example for the given task.

    Relies on the module-level ``tokenizer`` and always passes
    ``truncation=True``. With ``Dataset.map`` the task name can be supplied
    through ``fn_kwargs``, e.g.
    ``raw.map(preprocess_glue, batched=True, fn_kwargs={"task_name": "mrpc"})``.

    Args:
        example: Mapping from column name to value(s); it must contain the
            text column(s) of the requested task.
        task_name: GLUE config name, one of the keys of
            ``_GLUE_TEXT_COLUMNS``.

    Returns:
        Whatever ``tokenizer`` returns for the selected column(s).

    Raises:
        ValueError: If ``task_name`` is not a supported task.
        KeyError: If ``example`` lacks a column required by the task.
    """
    try:
        text_columns = _GLUE_TEXT_COLUMNS[task_name]
    except (KeyError, TypeError):
        # TypeError covers unhashable task names, which are equally unsupported.
        raise ValueError(f"Unsupported task: {task_name}") from None

    # One column -> single-sentence call; two columns -> sentence-pair call.
    texts = [example[column] for column in text_columns]
    return tokenizer(*texts, truncation=True)


Key Takeaways:

- Use `batched=True` with `Dataset.map()` for significantly faster preprocessing.
- Dynamic padding with `DataCollatorWithPadding` is more efficient than fixed-length padding.
- Always preprocess your data to match what your model expects (numerical tensors, correct column names).
- The 🤗 Datasets library provides powerful tools for efficient data processing at scale.