#### Suppose we need to pad sentences of different lengths to make batches
<div><img src="image/dynamic_padding1.png" width=1000></div>

#### The first way is to pad all the sentences in the whole dataset to the maximum length in the dataset
<div><img src="image/dynamic_padding2.png" width=1000></div>

#### Another way is to pad the sentences at batch creation time, only up to the length of the longest sentence in that batch; this is called dynamic padding
<div><img src="image/dynamic_padding3.png" width=1000></div>

- pros: all the batches will have the smallest possible size

- cons: dynamic shapes don't work well on all accelerators

#### The data loading process without using dynamic padding in batches:

In [13]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Tokenizer that matches the checkpoint named below.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the "mrpc" configuration of the "glue" dataset and show the
# column names of every split before any preprocessing.
raw_datasets = load_dataset("glue", "mrpc")
print(raw_datasets.column_names)

def tokenize_function(examples):
    """Tokenize a batch of sentence pairs with fixed-length padding.

    Args:
        examples: Mapping that provides the "sentence1" and "sentence2"
            columns of a batch.

    Returns:
        Whatever the module-level ``tokenizer`` returns when asked to
        truncate and pad each pair to ``max_length=128``.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    # Padding happens here, during preprocessing, so every example ends up
    # with the same length regardless of which batch it later lands in.
    return tokenizer(
        first_sentences,
        second_sentences,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

# Tokenize every split in batches, drop the columns that are no longer
# needed, rename "label" to "labels", and request the "torch" format.
tokenized_datasets = (
    raw_datasets.map(tokenize_function, batched=True)
    .remove_columns(["idx", "sentence1", "sentence2"])
    .rename_column("label", "labels")
    .with_format("torch")
)
print(tokenized_datasets.column_names)

{'train': ['sentence1', 'sentence2', 'label', 'idx'], 'validation': ['sentence1', 'sentence2', 'label', 'idx'], 'test': ['sentence1', 'sentence2', 'label', 'idx']}
3668
{'train': ['labels', 'input_ids', 'token_type_ids', 'attention_mask'], 'validation': ['labels', 'input_ids', 'token_type_ids', 'attention_mask'], 'test': ['labels', 'input_ids', 'token_type_ids', 'attention_mask']}




In [11]:
from torch.utils.data import DataLoader
# Batches of 16 shuffled training examples; no collate_fn is given, so the
# padding applied during tokenization is what determines the batch width.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=16,
    shuffle=True,
)

# Show the input_ids shape of the first seven batches (steps 0 through 6).
# zip stops as soon as range(7) is exhausted, so no eighth batch is fetched.
for step, batch in zip(range(7), train_dataloader):
    print(batch["input_ids"].shape)

torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])


#### The DataCollatorXXX classes are responsible for applying all the final processing needed before forming a batch.
#### To apply dynamic padding, we leave padding out of the preprocessing function and instead pass a ***DataCollatorWithPadding*** instance as the ***collate_fn*** parameter of the ***DataLoader***

In [15]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Tokenizer that matches the checkpoint named below.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# The "mrpc" configuration of the "glue" dataset, reloaded untokenized.
raw_datasets = load_dataset("glue", "mrpc")

def tokenize_function(examples):
    """Tokenize a batch of sentence pairs without padding them.

    Args:
        examples: Mapping that provides the "sentence1" and "sentence2"
            columns of a batch.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pairs with
        ``truncation=True``; no ``padding`` argument is passed here, so
        padding is left to a later step.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

# Same post-processing as a single chain: tokenize in batches, drop the raw
# text and index columns, rename "label" to "labels", request "torch" format.
tokenized_datasets = (
    raw_datasets.map(tokenize_function, batched=True)
    .remove_columns(["idx", "sentence1", "sentence2"])
    .rename_column("label", "labels")
    .with_format("torch")
)



Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [None]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# The collator is built from the tokenizer and handed to the DataLoader as
# collate_fn, so it is invoked on each list of examples to form a batch.
data_collator = DataCollatorWithPadding(tokenizer)

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)