In [None]:
!pip install datasets

Using the TinyStories dataset by Ronen Eldan (`roneneldan/TinyStories`), which has about 2M rows

In [None]:
from datasets import load_dataset

# Load TinyStories from the Hugging Face Hub. No split is requested here, so the
# result is keyed by split name; later cells map over it and iterate its splits.
ds = load_dataset("roneneldan/TinyStories")

Using the GPT-2 tokenizer from `tiktoken` to convert each story into token IDs

In [None]:
!pip install tiktoken  #tiktoken installation
import tiktoken
import os
import numpy as np
from tqdm.auto import tqdm

# GPT-2 encoding from tiktoken; created once at module level and reused by the tokenization code below.
enc = tiktoken.get_encoding("gpt2")    # GPT2 tokenizer

def tokenizeID(sampleTextObj):
    """Turn one dataset example into GPT-2 token IDs.

    Args:
        sampleTextObj: mapping with the raw story under the 'text' key.

    Returns:
        dict with the token IDs under 'ids' and how many there are under 'len'.
    """
    # encode_ordinary does not interpret special-token markers in the text
    token_ids = enc.encode_ordinary(sampleTextObj['text'])
    return {'ids': token_ids, 'len': len(token_ids)}

# Build one flat uint16 token file per split (e.g. train.bin).
# Each split is checked individually, so a missing file for any split is rebuilt
# (previously only train.bin was checked and other splits could be skipped forever).
missing_splits = [split for split in ds if not os.path.exists(f'{split}.bin')]
if missing_splits:
    tokenized = ds.map(
        tokenizeID,
        remove_columns=['text'],
        desc="tokenizing each split",
        num_proc=8,
        )
    # Concatenate the ids of each split into a single file
    for split in missing_splits:
        dataset = tokenized[split]
        # Plain Python int so the memmap shape is an ordinary integer
        arr_len = int(np.sum(dataset['len'], dtype=np.uint64))
        filename = f'{split}.bin'
        # np.memmap(mode='w+') creates the full-size file up front. Writing under a
        # temporary name and renaming at the end means an interrupted run never
        # leaves a half-filled file that the existence check would accept as done.
        tmp_filename = f'{filename}.tmp'
        dtype = np.uint16 # can do since enc.max_token_value == 50256 is < 2**16
        arr = np.memmap(tmp_filename, dtype=dtype, mode='w+', shape=(arr_len,))
        # Never request more shards than rows: an empty shard would make
        # np.concatenate fail with "need at least one array to concatenate".
        total_batches = min(1024, len(dataset))

        idx = 0
        for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
            # Take one contiguous slice of the split at a time
            batch = dataset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
            arr_batch = np.concatenate(batch['ids'])
            # Write into mmap
            arr[idx : idx + len(arr_batch)] = arr_batch
            idx += len(arr_batch)
        assert idx == arr_len, f'{filename}: wrote {idx} tokens, expected {arr_len}'
        arr.flush()
        # Drop the mapping before renaming (an open mapping blocks the rename on Windows)
        del arr
        os.replace(tmp_filename, filename)