In [None]:
!pip install datasets
!pip install tiktoken
!pip install torch
import torch

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datase

In [None]:
from tqdm import tqdm
import numpy as np
from datasets import load_dataset, DatasetDict
import tiktoken
import os

# Number of worker processes handed to the Hugging Face `datasets` calls below
# (`load_dataset(..., num_proc=...)` and `.map(..., num_proc=...)`).
num_proc = 16 # rule of thumb from the original note (attributed to Andrej): about half the number of CPU cores

# tiktoken's "gpt2" encoding; `process` uses it to turn raw text into token ids.
enc = tiktoken.get_encoding("gpt2")

In [None]:
# `num_proc` lets Hugging Face `datasets` use multiple worker processes (CPU cores)
# while preparing/loading the dataset.
# The cache path is an ordinary Python string, so the space in "Colab Notebooks" must
# NOT be shell-escaped: a backslash here becomes part of the directory name (and is an
# invalid escape sequence in Python). Anything cached by earlier runs under the
# backslash-named directory will not be found at this corrected location.
dataset = load_dataset(
    "openwebtext",
    cache_dir="/content/drive/MyDrive/Colab Notebooks/openwebtext",
    num_proc=num_proc,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.35k [00:00<?, ?B/s]

Loading dataset shards:   0%|          | 0/82 [00:00<?, ?it/s]

In [None]:
# Build a smaller train/validation subset out of the full "train" split.
# Shuffling with a fixed seed keeps the selection reproducible across runs.
shuffled_dataset = dataset["train"].shuffle(seed=2357)

# First 1,000,000 shuffled rows -> training set; the next 1,000 rows -> validation set.
# The two index ranges do not overlap, so no row appears in both splits.
train_dataset = shuffled_dataset.select(range(1_000_000))
val_dataset = shuffled_dataset.select(range(1_000_000, 1_001_000))

# Bundle both subsets so they can be tokenized with a single `.map()` call later.
split_dataset = DatasetDict({
    "train": train_dataset,
    "val": val_dataset
})

# Last expression of the cell: displays the split names, features and row counts.
split_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1000000
    })
    val: Dataset({
        features: ['text'],
        num_rows: 1000
    })
})

In [None]:
# Function to tokenize the dataset
def process(example):
  """Tokenize one dataset row.

  Encodes ``example['text']`` with the module-level ``enc`` tokenizer
  (special tokens in the text are treated as ordinary text), appends the
  tokenizer's end-of-text token, and returns the ids together with their count.

  Args:
    example: mapping with a ``'text'`` entry holding the raw document string.

  Returns:
    dict with ``'ids'`` (list of token ids, end-of-text token last) and
    ``'len'`` (number of ids, including the end-of-text token).
  """
  token_ids = enc.encode_ordinary(example['text'])
  # Mark the document boundary so concatenated documents stay separable.
  token_ids.append(enc.eot_token)
  return {'ids': token_ids, 'len': len(token_ids)}

# Tokenize every split with the Hugging Face `.map()` function.
# `process` is applied to each example; the raw 'text' column is removed afterwards,
# leaving the 'ids' and 'len' fields that `process` returns.
# `num_proc` runs the mapping in that many worker processes.
tokenized = split_dataset.map(
    process,
    remove_columns=['text'],
    desc="tokenizing the splits",
    num_proc=num_proc
)

tokenizing the splits (num_proc=16):   0%|          | 0/1000000 [00:00<?, ? examples/s]

tokenizing the splits (num_proc=16):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Sanity check: switch to the project folder on Drive and list its contents
# to confirm the expected files and notebooks are there.
%cd /content/drive/MyDrive/Colab Notebooks/Transformers
!ls

/content/drive/MyDrive/Colab Notebooks/Transformers
'Dataloader Playground'		 gpt_checkpoint_1epoch.pth   input.txt.1    val10.bin
 gpt-2-mini			 gpt_checkpoint.pth	     karpathy-gpt   val.bin
 gpt-2-training-pipeline.ipynb	 gpt_solo		     train10.bin
 gpt_checkpoint_1.53loss.pth	 input.txt		     train.bin


In [None]:
import numpy as np
import os

# Directory that receives the `<split>_new.bin` token files.
output_dir = '/content/drive/MyDrive/Colab Notebooks/Transformers'

# Concatenate all the token ids of each split into one large flat binary file
# that can be memory-mapped during training.
for split, dset in tokenized.items():
    print(f"Processing split: {split}")
    # Total number of tokens in this split = sum of the per-example lengths.
    arr_len = np.sum(dset['len'], dtype=np.uint64)
    filename = os.path.join(output_dir, f'{split}_new.bin')
    dtype = np.uint16 # assumes every token id is < 2**16 -- TODO confirm against enc.max_token_value
    # mode='w+' creates (or overwrites) the file on disk with room for `arr_len` tokens.
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
    # Number of contiguous shards the split is written in.
    # NOTE(review): presumably needs to be <= the number of rows so no shard is empty -- verify.
    total_batches = 20

    idx = 0  # write cursor: position in `arr` where the next shard starts
    for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
        # Batch together samples for faster write
        batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
        arr_batch = np.concatenate(batch['ids'])
        # Write the concatenated data into memmap
        arr[idx:idx+len(arr_batch)] = arr_batch
        idx += len(arr_batch)
    # Push any pending writes out to the file on disk.
    arr.flush()

# To read the bin files later, e.g. with numpy:
# m = np.memmap(os.path.join(output_dir, 'train_new.bin'), dtype=np.uint16, mode='r')

Processing split: train


writing /content/drive/MyDrive/Colab Notebooks/Transformers/train_new.bin: 100%|██████████| 20/20 [00:25<00:00,  1.28s/it]


Processing split: val


writing /content/drive/MyDrive/Colab Notebooks/Transformers/val_new.bin: 100%|██████████| 20/20 [00:00<00:00, 296.62it/s]


In [1]:
# Custom Dataset Class used to iterate through the memmap dataset
from torch.utils.data import Dataset
import numpy as np
import torch

class MemmapDataset(Dataset):
    """Map-style dataset of fixed-length next-token-prediction samples.

    The token file is opened as a read-only ``uint16`` memmap and split into
    non-overlapping chunks of ``seq_len + 1`` tokens. Sample ``i`` is the pair
    ``(chunk[:-1], chunk[1:])``: the input sequence and the same sequence
    shifted one token to the left (the prediction targets).

    Attributes:
        dataset: the ``np.memmap`` view over the token file.
        seq_len: number of tokens in each input/target sequence.
        total_length: number of complete samples available in the file.
    """

    def __init__(self, data_dir, seq_len):
        """Open the token file and compute how many samples it holds.

        Args:
            data_dir: path to the binary token file (despite the name, this is
                a file path, not a directory).
            seq_len: number of tokens per input sequence; must be >= 1.

        Raises:
            ValueError: if ``seq_len`` is smaller than 1.
        """
        if seq_len < 1:
            raise ValueError(f"seq_len must be a positive integer, got {seq_len}")
        self.dataset = np.memmap(data_dir, dtype=np.uint16, mode='r')
        self.seq_len = seq_len
        # Every sample consumes exactly seq_len + 1 tokens (seq_len inputs plus the
        # final target), so the number of complete samples is a plain floor division.
        # Subtracting 1 first would drop the last sample whenever the file length is
        # an exact multiple of seq_len + 1.
        self.total_length = len(self.dataset) // (seq_len + 1)

    def __len__(self):
        """Return the number of complete samples in the file."""
        return self.total_length

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair for sample ``idx``.

        Args:
            idx: sample index in ``[-len(self), len(self))``; negative values
                count from the end, as with sequences.

        Returns:
            Tuple of two ``torch.long`` tensors of shape ``(seq_len,)``: the
            input tokens and the targets (input shifted by one token).

        Raises:
            IndexError: if ``idx`` is out of range. Without this, numpy slicing
                would silently return short/empty sequences, and iterating the
                dataset directly (``for x, y in dataset``) would never stop.
        """
        if idx < 0:
            idx += self.total_length
        if not 0 <= idx < self.total_length:
            raise IndexError(
                f"index out of range for MemmapDataset with {self.total_length} samples"
            )
        start_idx = idx * (self.seq_len + 1)
        end_idx = start_idx + self.seq_len
        # astype() copies the slice out of the read-only memmap into a fresh int64
        # array; from_numpy then wraps that copy without copying a second time.
        # Input and target are copied separately so the tensors do not share memory.
        input_sequence = self.dataset[start_idx:end_idx].astype(np.int64)
        target_sequence = self.dataset[start_idx + 1:end_idx + 1].astype(np.int64)
        return torch.from_numpy(input_sequence), torch.from_numpy(target_sequence)

# Example usage
# dataset = MemmapDataset(data_dir='/path/to/data', seq_len=128)
# input_seq, target_seq = dataset[0]

In [2]:
from torch.utils.data import DataLoader

# Paths to the flat token files written earlier as `<split>_new.bin`.
# Despite the `_dir` suffix these are file paths, not directories.
train_dir = '/content/drive/MyDrive/Colab Notebooks/Transformers/train_new.bin'
val_dir = '/content/drive/MyDrive/Colab Notebooks/Transformers/val_new.bin'

In [3]:
# How many seq_len-token sequences are there in total?
# This count divided by the batch size gives the number of iterations per epoch.
train_dataset = MemmapDataset(train_dir, seq_len=128)
val_dataset = MemmapDataset(val_dir, seq_len=128)

print(len(train_dataset))

8737803


In [4]:
# Initializing training and validation dataloaders.
# NOTE: the two datasets are re-created here with the same arguments as in the
# previous cell, so this cell can also be run on its own.
train_dataset = MemmapDataset(train_dir, seq_len=128)
val_dataset = MemmapDataset(val_dir, seq_len=128)

# Training batches are drawn in shuffled order; validation keeps the file order.
# num_workers=0 means no worker subprocesses are used for loading.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=0)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=0)

In [5]:
# Number of batches (iterations per epoch) in each dataloader:
# training first, then validation.
print(len(train_dataloader))
print(len(val_dataloader))

136529
143


In [None]:
# Inspect one specific batch of the validation dataloader.
from torch.utils.data import DataLoader

dataloader_iterator = iter(val_dataloader)

# `batch_number` is 1-based: next() is called `batch_number` times and the last
# batch fetched is kept. With batch_number = 0 the loop body never runs, `batch`
# stays None, and the unpacking below fails.
batch_number = 2
batch = None
for _ in range(batch_number):
    batch = next(dataloader_iterator)

# Each batch is an (inputs, targets) pair as returned by MemmapDataset.__getitem__.
input_sequences, target_sequences = batch

print("Input Sequences:", input_sequences)
print("Target Sequences:", target_sequences)

Input Sequences: tensor([[ 656,  616, 2607,  ...,  247,   82, 1107],
        [1107, 2041,  290,  ...,   82,  407, 1744],
        [ 345,  284, 4724,  ...,  318,  326,  345],
        ...,
        [1639,  447,  247,  ...,   11,  475,  339],
        [ 247,   82, 2282,  ...,   11, 7744, 7867],
        [ 632,  447,  247,  ...,  262, 1353,  286]])
Target Sequences: tensor([[ 616, 2607,   11,  ...,   82, 1107,   11],
        [2041,  290,  345,  ...,  407, 1744,  329],
        [ 284, 4724,  644,  ...,  326,  345,  910],
        ...,
        [ 447,  247,   67,  ...,  475,  339,  447],
        [  82, 2282,   11,  ..., 7744, 7867,   13],
        [ 447,  247,   82,  ..., 1353,  286,  262]])


In [7]:
# Measuring optimal num_worker time with one A100 gpu on Google Colab
import time

def time_dataloader(data_loader):
    """Return the seconds needed to iterate once over ``data_loader``.

    Every batch is fetched and immediately discarded, so the result measures
    data-loading cost only (no model work).

    Args:
        data_loader: any iterable of batches, e.g. a ``DataLoader``.

    Returns:
        Elapsed time in seconds as a float.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the system clock and can jump (even backwards)
    # if the clock is adjusted during the measurement.
    start_time = time.perf_counter()
    for _ in data_loader:
        pass
    return time.perf_counter() - start_time

# Benchmark: for each candidate worker count, build a fresh shuffled DataLoader over
# the training set and time one complete pass over all of its batches.
for num_worker in [0, 2, 4, 8, 12, 16]:
    dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=num_worker)
    loading_time = time_dataloader(dataloader)
    print(f'Num workers: {num_worker}, Loading time: {loading_time:.2f} seconds')

Num workers: 0, Loading time: 353.36 seconds
Num workers: 2, Loading time: 221.89 seconds
Num workers: 4, Loading time: 149.10 seconds
Num workers: 8, Loading time: 143.53 seconds
Num workers: 12, Loading time: 144.14 seconds




Num workers: 16, Loading time: 147.27 seconds
