In [6]:
import itertools

from datasets import load_dataset
from transformers import AutoTokenizer

In [7]:

# Load the fast 'gpt2' tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    'gpt2',
    use_fast=True,
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token



def pile_transform(tokenizer, max_length, seed=None):
    """Build a batched transform that tokenizes text and packs it into fixed-size chunks.

    Args:
        tokenizer: Callable applied to ``batch['text']``. It must return a mapping
            from column name to a list of per-document lists.
        max_length: Number of items in every emitted chunk. Must be positive.
        seed: Currently unused; accepted so existing call sites keep working.

    Returns:
        A function ``transform(batch)`` that maps a batch with a ``'text'`` column
        to a dict of columns, each holding a list of chunks of exactly
        ``max_length`` items.

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be a positive integer, got {max_length!r}")

    def transform(batch):
        # Tokenize every document in the batch.
        examples = tokenizer(batch['text'])

        # Concatenate the per-document lists of each column into one flat list.
        examples = {
            k: list(itertools.chain.from_iterable(examples[k])) for k in examples.keys()
        }

        # Length of the first column; 0 when the tokenizer returned no columns.
        total_length = len(next(iter(examples.values()), ()))

        # Always round down to a multiple of max_length so that every emitted chunk
        # has exactly max_length items. A batch shorter than max_length therefore
        # yields no chunks at all instead of a single short chunk.
        total_length = (total_length // max_length) * max_length

        # Split each column into consecutive chunks of max_length.
        return {
            k: [t[i : i + max_length] for i in range(0, total_length, max_length)]
            for k, t in examples.items()
        }

    return transform
# Build the batch transform: chunks of 1024 tokens, with 111 passed as the seed.
transform = pile_transform(tokenizer, 1024, 111)

# Open the 'train' split of the 'all' configuration in streaming mode.
ds = load_dataset('EleutherAI/pile', 'all', streaming=True)['train']
ds

<datasets.iterable_dataset.IterableDataset at 0x7ff70ea302e0>

In [8]:
# Apply the transform to batches of 30 rows, dropping the original 'text' and
# 'meta' columns, and request the "torch" output format.
mapped_ds = ds.map(
    transform,
    batched=True,
    batch_size=30,
    remove_columns=['text', 'meta'],
).with_format("torch")

# Materialize the first three examples for inspection.
o = list(mapped_ds.take(3))
o

Token indices sequence length is longer than the specified maximum sequence length for this model (3180 > 1024). Running this sequence through the model will result in indexing errors


[{'input_ids': tensor([1026,  318, 1760,  ..., 1088,  838, 4201]),
  'attention_mask': tensor([1, 1, 1,  ..., 1, 1, 1])},
 {'input_ids': tensor([ 329,  257, 7480,  ..., 6626,  656, 1811]),
  'attention_mask': tensor([1, 1, 1,  ..., 1, 1, 1])},
 {'input_ids': tensor([1180, 2628,   11,  ...,  284,  564,  250]),
  'attention_mask': tensor([1, 1, 1,  ..., 1, 1, 1])}]