In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

##### Example 1

In [None]:
data = {
    'input_ids': [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    'labels': [0, 1, 0]
}

In [None]:
data

{'input_ids': [[1, 2, 3], [4, 5, 6], [7, 8, 9]], 'labels': [0, 1, 0]}

Create a dataset from `data`

In [None]:
from datasets import Dataset

In [None]:
dataset = Dataset.from_dict(data)

In [None]:
dataset

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 3
})

##### Example 2

In [None]:
from datasets import load_dataset

In [None]:
dataset = load_dataset("glue", "mrpc", split="train")

Found cached dataset glue (/Users/education/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [None]:
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [None]:
small_dataset = dataset.select(range(100))

In [None]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [None]:
tokenizer.pad_token = tokenizer.eos_token

In [None]:
small_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 100
})

In [None]:
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

Tokenize the column `sentence1` in the `small_dataset`

In [None]:
def tokenize_func(x):
    """Run the notebook-level ``tokenizer`` on the ``sentence1`` field of ``x``.

    Returns whatever the tokenizer returns for that field.
    """
    sentence = x["sentence1"]
    return tokenizer(sentence)

In [None]:
tokenized_dataset = small_dataset.map(tokenize_func)

  0%|          | 0/100 [00:00<?, ?ex/s]

In [None]:
tokenized_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
    num_rows: 100
})

##### Example 3

In [None]:
small_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 100
})

Keep only the rows of `small_dataset` whose `sentence1` starts with `F`

**Hint**: `x.startswith("F")`

In [None]:
def filter_func(x):
    """Return True when the ``sentence1`` field of ``x`` begins with ``"F"``."""
    sentence = x["sentence1"]
    return sentence.startswith("F")

In [None]:
new_dataset = small_dataset.filter(filter_func)

Loading cached processed dataset at /Users/education/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-e4ca0a64df9634ee.arrow


In [None]:
new_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 2
})

In [None]:
new_dataset["sentence1"][:3]

['FBI agents arrested a former partner of Big Four accounting firm Ernst & Young ERNY.UL on criminal charges of obstructing federal investigations , U.S. officials said on Thursday .',
 'Fewer than a dozen FBI agents were dispatched to secure and analyze evidence .']

##### Example 4

In [None]:
def tokenize(element):
    """Tokenize the ``content`` field and keep only full-length chunks.

    The notebook-level ``tokenizer`` is called with truncation at
    ``context_length`` and with overflowing tokens and lengths returned.
    Only the returned sequences whose reported length equals
    ``context_length`` are kept; every other sequence is dropped.

    Relies on the notebook-level names ``tokenizer`` and ``context_length``.

    Returns:
        A dict with the single key ``"input_ids"`` mapping to the list of
        kept token-id sequences.
    """
    encoded = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    full_chunks = [
        ids
        for n_tokens, ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == context_length
    ]
    return {"input_ids": full_chunks}

### Data Collator

In [None]:
# Load a dataset from Hugging Face
raw_datasets = load_dataset("onestop_english")

Found cached dataset onestop_english (/Users/education/.cache/huggingface/datasets/onestop_english/default/1.1.0/6b19eec5680862ad1cf1990e98b06a98d1fa4c85f3585dc4dfab93f52b89d9cf)


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

In [None]:
context_length = 128

In [None]:
def tokenize(element):
    """Tokenize the ``text`` field and keep only chunks of ``context_length`` tokens.

    Uses the notebook-level ``tokenizer`` with truncation at
    ``context_length``, asking it to also return overflowing tokens and the
    length of every returned sequence. Sequences whose length differs from
    ``context_length`` are discarded.

    Returns:
        A dict with the single key ``"input_ids"`` holding the kept sequences.
    """
    tokenizer_kwargs = dict(
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    outputs = tokenizer(element["text"], **tokenizer_kwargs)

    lengths, chunks = outputs["length"], outputs["input_ids"]
    kept = [chunk for size, chunk in zip(lengths, chunks) if size == context_length]
    return {"input_ids": kept}

In [None]:
tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 3614
    })
})

In [None]:
# Import in this cell so it runs on a fresh kernel: this is the first use of
# the collator class, and the notebook's own import of it comes further down.
from transformers import DataCollatorForLanguageModeling

# mlm=False turns off masked-language-model masking (causal LM collation).
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
tokenized_text = tokenized_datasets["train"][1]

In [None]:
tokenizer

GPT2TokenizerFast(name_or_path='distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [None]:
tokenized_text.keys()

dict_keys(['input_ids'])

In [None]:
tokenized_text["input_ids"][:5]

[11, 17673, 2489, 290, 584]

Generate `attention_mask` and `labels` for training a causal (autoregressive) language model using the data collator

**Hints**
- `[tokenized_text]`
- `mlm=False`

In [None]:
from transformers import DataCollatorForLanguageModeling

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
output = data_collator([tokenized_text])

In [None]:
output.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [None]:
output["labels"][0][:5]

tensor([   11, 17673,  2489,   290,   584])

##### Example 5

In [None]:
from torch.utils.data import IterableDataset

In [None]:
class ConstantLengthDataset(IterableDataset):
    def __init__(
        self,
        tokenizer, dataset, seq_len,
        num_of_sequences, chars_per_token
    ):
        """Store the tokenizer, the source dataset and the sizing parameters.

        Args:
            tokenizer: Tokenizer object; stored as-is, and its
                ``eos_token_id`` is cached on the instance.
            dataset: Source data; stored as-is.
            seq_len: Stored as ``self.seq_len``; presumably the number of
                tokens per emitted sequence -- TODO confirm against ``__iter__``.
            num_of_sequences: Factor in the character budget below.
            chars_per_token: Factor in the character budget below; presumably
                an average characters-per-token estimate -- TODO confirm.
        """
        self.tokenizer = tokenizer
        self.eos_token_id = tokenizer.eos_token_id
        self.dataset = dataset
        self.seq_len = seq_len
        # Character budget: seq_len * chars_per_token * num_of_sequences.
        self.input_characters = seq_len * chars_per_token * num_of_sequences
    
    def __iter__(self):
        