In [1]:
!pip install datasets
!pip install tiktoken
!pip install torch
import torch

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━[0m [32m368.6/542.0 kB[0m [31m11.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-

In [2]:
from tqdm import tqdm
import numpy as np
from datasets import load_dataset
import tiktoken
import os

# Number of worker processes handed to the Hugging Face load/map calls.
# Andrej's recommendation is about half the CPU cores, so derive it from the
# machine instead of hard-coding 16 (which oversubscribes small runtimes).
# os.cpu_count() can return None, hence the fallback; never go below 1 worker.
num_proc = max(1, (os.cpu_count() or 2) // 2)

# Tokenizer using the "gpt2" encoding shipped with tiktoken.
enc = tiktoken.get_encoding("gpt2")

In [3]:
# num_proc makes the Hugging Face dataset loader use several CPU cores in parallel.
# cache_dir is a plain Python string, not a shell command: the space in
# "Colab Notebooks" must not be backslash-escaped, otherwise the backslash
# becomes part of the directory name and the cache lands in a different folder
# than the "Colab Notebooks" directory used by the rest of this notebook.
dataset = load_dataset("openwebtext", cache_dir="/content/drive/MyDrive/Colab Notebooks/openwebtext", num_proc=num_proc)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.33k [00:00<?, ?B/s]

Loading dataset shards:   0%|          | 0/82 [00:00<?, ?it/s]

In [4]:
# Hold out a tiny, reproducibly shuffled fraction of the corpus for validation.
split_dataset = dataset["train"].train_test_split(
    test_size=0.0005,
    seed=2357,
    shuffle=True,
)
# train_test_split() names the held-out part "test"; this notebook calls it "val".
split_dataset["val"] = split_dataset.pop('test')

# Keep only the first rows of each split so the experiment stays small.
for split_name, n_rows in (("train", 10000), ("val", 100)):
    split_dataset[split_name] = split_dataset[split_name].select(range(n_rows))

split_dataset

# With this configuration, it takes around 3500 mins to train, 60 hours
# DatasetDict({
#     train: Dataset({
#         features: ['text'],
#         num_rows: 100000
#     })
#     val: Dataset({
#         features: ['text'],
#         num_rows: 1000
#     })
# })

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 10000
    })
    val: Dataset({
        features: ['text'],
        num_rows: 100
    })
})

In [5]:
# Function to tokenize the dataset

def process(example):
  """Tokenize a single dataset row.

  Args:
    example: mapping whose 'text' entry holds the raw document string.

  Returns:
    A dict with 'ids' (the encoded token ids followed by the encoder's
    end-of-text token) and 'len' (the number of ids, end-of-text included).
  """
  # encode_ordinary() output plus the end-of-text token as document separator.
  token_ids = [*enc.encode_ordinary(example['text']), enc.eot_token]
  return {'ids': token_ids, 'len': len(token_ids)}

# Run process() over every row of every split with Hugging Face's .map();
# the raw 'text' column is dropped so only 'ids' and 'len' remain.
tokenized = split_dataset.map(
    process,
    num_proc=num_proc,
    desc="tokenizing the splits",
    remove_columns=['text'],
)

tokenizing the splits (num_proc=16):   0%|          | 0/10000 [00:00<?, ? examples/s]

tokenizing the splits (num_proc=16):   0%|          | 0/100 [00:00<?, ? examples/s]

In [9]:
# Change the working directory to the Transformers folder on the mounted Drive
# and list its contents.
%cd /content/drive/MyDrive/Colab Notebooks/Transformers
!ls

/content/drive/MyDrive/Colab Notebooks/Transformers
'Dataloader Playground'		 gpt_checkpoint_1epoch.pth   input.txt.1    val10.bin
 gpt-2-mini			 gpt_checkpoint.pth	     karpathy-gpt   val.bin
 gpt-2-training-pipeline.ipynb	 gpt_solo		     train10.bin
 gpt_checkpoint_1.53loss.pth	 input.txt		     train.bin


In [8]:
import numpy as np
import os

output_dir = '/content/drive/MyDrive/Colab Notebooks/Transformers'

# Concatenate the token ids of every example in a split into one flat binary
# file per split ('<split>10.bin') that can be memory-mapped during training.
for split, dset in tokenized.items():
    print(f"Processing split: {split}")
    # Total number of tokens in the split. Converted to a plain int so the
    # memmap shape and the final consistency check use ordinary Python ints.
    arr_len = int(np.sum(dset['len'], dtype=np.uint64))
    filename = os.path.join(output_dir, f'{split}10.bin')
    dtype = np.uint16 # 2**16 is greater than the largest token_id value
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
    # Never request more shards than there are rows: the surplus shards would
    # be empty and np.concatenate() fails when it has nothing to concatenate.
    total_batches = min(20, len(dset))

    idx = 0
    for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
        # Batch together samples for faster write
        batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
        arr_batch = np.concatenate(batch['ids'])
        # Write the concatenated data into memmap
        arr[idx:idx+len(arr_batch)] = arr_batch
        idx += len(arr_batch)
    arr.flush()
    # Every pre-counted token must have been written; a mismatch would leave
    # part of the file unwritten.
    assert idx == arr_len, f"wrote {idx} tokens to {filename}, expected {arr_len}"

# To read the bin files later, e.g. with numpy
# m = np.memmap(os.path.join(output_dir, 'train10.bin'), dtype=np.uint16, mode='r')

Processing split: train


writing /content/drive/MyDrive/Colab Notebooks/Transformers/train10.bin: 100%|██████████| 20/20 [00:00<00:00, 109.55it/s]

Processing split: val



writing /content/drive/MyDrive/Colab Notebooks/Transformers/val10.bin: 100%|██████████| 20/20 [00:00<00:00, 400.09it/s]


In [10]:
from torch.utils.data import Dataset
import numpy as np

class MemmapDataset(Dataset):
    """Sliding-window next-token dataset over a flat binary file of uint16 token ids.

    Item ``i`` is the pair ``(tokens[i : i + seq_len], tokens[i + 1 : i + seq_len + 1])``,
    i.e. an input window and the same window shifted one token to the right.
    Consecutive items overlap: each one starts a single token after the previous one.

    Args:
        data_dir: path of the binary token file (despite the name, a file, not a directory).
        seq_len: number of tokens in every input/target window.
    """

    def __init__(self, data_dir, seq_len):
        # Read-only memory map: the file is paged in lazily instead of loaded whole.
        self.dataset = np.memmap(data_dir, dtype=np.uint16, mode='r')
        self.seq_len = seq_len
        # A window needs seq_len tokens plus one more for the shifted target, so
        # the last valid start is len - seq_len - 1. Clamp at zero so a file that
        # is too short yields an empty dataset rather than a negative length
        # (which makes len() raise).
        self.total_length = max(0, len(self.dataset) - seq_len)

    def __len__(self):
        """Return the number of available (input, target) windows."""
        return self.total_length

    def __getitem__(self, idx):
        """Return the ``idx``-th (input, target) pair as two int64 tensors.

        Negative indices count from the end, like a list.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``. Without
                this check, slicing past the end would silently return truncated
                windows, and negative indices would return mismatched slices.
        """
        if idx < 0:
            idx += self.total_length
        if not 0 <= idx < self.total_length:
            raise IndexError(f"index {idx} out of range for dataset of length {self.total_length}")
        # astype() already copies the slice out of the memmap into a fresh int64
        # array, so from_numpy() can wrap it without a second copy.
        input_sequence = self.dataset[idx:idx+self.seq_len].astype(np.int64)
        target_sequence = self.dataset[idx+1:idx+self.seq_len+1].astype(np.int64)
        return torch.from_numpy(input_sequence), torch.from_numpy(target_sequence)

In [11]:
from torch.utils.data import DataLoader

# Token files for the two splits (file paths, despite the *_dir names).
train_dir = '/content/drive/MyDrive/Colab Notebooks/Transformers/train10.bin'
val_dir = '/content/drive/MyDrive/Colab Notebooks/Transformers/val10.bin'

# Both splits are cut into windows of 128 tokens.
train_dataset, val_dataset = (
    MemmapDataset(bin_path, seq_len=128) for bin_path in (train_dir, val_dir)
)

# Settings shared by both loaders; only the training loader shuffles.
loader_options = dict(batch_size=64, num_workers=4)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_options)

In [14]:
# Number of batches per pass over each loader (train first, then val).
print(len(train_dataloader), len(val_dataloader), sep="\n")

176615
2155
