In [None]:
!pip install datasets

Using the `roneneldan/TinyStories` dataset (about 2M rows)

In [None]:
from datasets import load_dataset

# Load the TinyStories dataset; `ds` is later tokenized and iterated split by split.
ds = load_dataset("roneneldan/TinyStories")

Tokenizing with the GPT-2 tokenizer from the tiktoken library (byte-pair encoding)

In [None]:
!pip install tiktoken  #tiktoken installation
import tiktoken
import os
import numpy as np
from tqdm.auto import tqdm

# Shared tokenizer instance; tokenizeID() below reads this module-level name.
enc = tiktoken.get_encoding("gpt2")    # GPT2 tokenizer

def tokenizeID(sampleTextObj):
    """Tokenize a single dataset record.

    Reads ``sampleTextObj['text']`` and encodes it with the module-level
    ``enc`` tokenizer through ``encode_ordinary`` (which ignores special
    tokens, if any).

    Returns:
        dict with the token ids under ``'ids'`` and their count under ``'len'``.
    """
    token_ids = enc.encode_ordinary(sampleTextObj['text'])
    return {'ids': token_ids, 'len': len(token_ids)}

# Build one flat binary token file per split (train.bin and validation.bin).
# Every split is checked individually, so a missing validation.bin is regenerated
# even when train.bin already exists.
missing_splits = [split for split in ds.keys() if not os.path.exists(f'{split}.bin')]

if missing_splits:
    tokenized = ds.map(
        tokenizeID,
        remove_columns=['text'],
        desc="tokenizing each split",
        num_proc=8,
        )
    # concatenating the ids of each dataset in a single file
    for split in missing_splits:
        dataset = tokenized[split]
        arr_len = int(np.sum(dataset['len'], dtype=np.uint64))
        filename = f'{split}.bin'   # Here we will have 2 files: train.bin and validation.bin
        # Write to a temporary file first: np.memmap(mode='w+') creates the file at
        # full size immediately, so an interrupted run would otherwise leave a
        # zero-filled <split>.bin that the existence check above accepts as complete.
        tmp_filename = f'{filename}.tmp'
        dtype = np.uint16 # can do since enc.max_token_value == 50256 and is less than 2**16
        arr = np.memmap(tmp_filename, dtype=dtype, mode='w+', shape=(arr_len,))
        # Never request more shards than rows; an empty shard would make
        # np.concatenate fail. Shards are contiguous, so the file content is
        # the same whatever the shard count.
        total_batches = min(1024, len(dataset))

        idx = 0
        for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
            # Batching the samples together
            batch = dataset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
            arr_batch = np.concatenate(batch['ids'])
            # Write into mmap
            arr[idx : idx + len(arr_batch)] = arr_batch
            idx += len(arr_batch)
        arr.flush()
        assert idx == arr_len, f'wrote {idx} tokens to {filename}, expected {arr_len}'
        del arr  # release the memory map before renaming the file
        os.replace(tmp_filename, filename)  # publish the finished file in one step

Creating input-output pairs from the token stream (the data is not already split into inputs and targets)

In [None]:
def get_batch(split):
    """Sample a random batch of input/target token windows from a .bin file.

    Args:
        split: 'train' reads train.bin; any other value reads validation.bin.

    Returns:
        (x, y): two int64 tensors built by stacking ``batch_size`` windows of
        ``block_size`` tokens each; ``y`` is the window starting one token
        after the corresponding ``x`` window. Both are moved to ``device``.

    ``block_size``, ``batch_size``, ``device`` and ``device_type`` are read
    from module-level names defined outside this function.
    """
    # Re-create the np.memmap on every call to avoid a memory leak, as per
    # https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122
    bin_path = 'train.bin' if split == 'train' else 'validation.bin'
    tokens = np.memmap(bin_path, dtype=np.uint16, mode='r')

    # Random start offsets; the upper bound leaves room for the shifted target window.
    starts = torch.randint(len(tokens) - block_size, (batch_size,))

    def window(begin):
        # One block_size-long slice, widened from uint16 to int64.
        return torch.from_numpy((tokens[begin:begin + block_size]).astype(np.int64))

    x = torch.stack([window(start) for start in starts])      # Input matrix
    y = torch.stack([window(start + 1) for start in starts])  # Targets: inputs shifted by one token

    if device_type != 'cuda':
        return x.to(device), y.to(device)

    # Pinning x and y allows moving them to the GPU asynchronously
    # (non_blocking=True), so the CPU is not blocked by the copy.
    x = x.pin_memory().to(device, non_blocking=True)
    y = y.pin_memory().to(device, non_blocking=True)
    return x, y


Building the Model