In [1]:
!pip install datasets
!pip install tiktoken

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-

In [2]:
from tqdm import tqdm
import numpy as np
from datasets import load_dataset
import tiktoken
import os

# Worker processes for the Hugging Face `datasets` calls below.
# Half of the available CPU cores (the rule of thumb recommended by Andrej),
# computed from the machine instead of hard-coded, and never less than 1.
# `os.cpu_count()` may return None, hence the `or 1` fallback.
num_proc = max(1, (os.cpu_count() or 1) // 2)

# GPT-2 tokenizer from tiktoken; `process` below uses it to turn text into token ids.
enc = tiktoken.get_encoding("gpt2")

In [3]:
# Download (or reuse the cached copy of) OpenWebText through Hugging Face `datasets`.
# `num_proc` spreads the loading work over several CPU processes.
# The cache path is an ordinary Python string, so the space needs no shell-style
# escape: "Colab\ Notebooks" is an invalid escape sequence that leaves a literal
# backslash in the directory name.
# NOTE: a cache created earlier under the backslash path is not reused by this call.
dataset = load_dataset(
    "openwebtext",
    cache_dir="/content/drive/MyDrive/Colab Notebooks/openwebtext",
    num_proc=num_proc,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.33k [00:00<?, ?B/s]

Loading dataset shards:   0%|          | 0/82 [00:00<?, ?it/s]

In [24]:
# Hold out 0.05% of the documents (shuffled with a fixed seed) and expose them
# under the key "val" instead of the default "test".
split_dataset = dataset["train"].train_test_split(shuffle=True, seed=2357, test_size=0.0005)
split_dataset["val"] = split_dataset.pop('test')

# Keep only the first 100000 training rows and the first 1000 validation rows
# so the rest of the notebook works on a small subset.
split_dataset["train"], split_dataset["val"] = (
    split_dataset["train"].select(range(100000)),
    split_dataset["val"].select(range(1000)),
)

split_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 100000
    })
    val: Dataset({
        features: ['text'],
        num_rows: 1000
    })
})

In [25]:
# Function to tokenize the dataset

def process(example):
    """Tokenize one dataset row.

    Encodes ``example['text']`` with the module-level tokenizer ``enc``
    (``encode_ordinary``) and appends the tokenizer's end-of-text token, so
    documents stay separated once they are concatenated.

    Returns:
        dict with ``'ids'`` (list of token ids, end-of-text token included)
        and ``'len'`` (the length of that list).
    """
    token_ids = enc.encode_ordinary(example['text']) + [enc.eot_token]
    return {'ids': token_ids, 'len': len(token_ids)}

# Apply `process` to every row of both splits in `num_proc` worker processes.
# The raw 'text' column is dropped, leaving only the 'ids' and 'len' columns.
tokenized = split_dataset.map(
    process,
    num_proc=num_proc,
    desc="tokenizing the splits",
    remove_columns=['text'],
)

tokenizing the splits (num_proc=16):   0%|          | 0/100000 [00:00<?, ? examples/s]

tokenizing the splits (num_proc=16):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [33]:
# Make the project folder on Drive the working directory and list what is in it.
# Relative paths used later (e.g. checkpoint file names) resolve against this folder.
%cd /content/drive/MyDrive/Colab Notebooks/Transformers
!ls

/content/drive/MyDrive/Colab Notebooks/Transformers
'Dataloader Playground'		 gpt_checkpoint_1.53loss.pth   gpt_solo      karpathy-gpt
 gpt-2-mini			 gpt_checkpoint_1epoch.pth     input.txt     train.bin
 gpt-2-training-pipeline.ipynb	 gpt_checkpoint.pth	       input.txt.1   val.bin


In [30]:
import numpy as np
import os

output_dir = '/content/drive/MyDrive/Colab Notebooks/Transformers'

# Concatenate all token ids of each split into one flat binary file
# (<output_dir>/<split>.bin) that training can memory-map later.
for split, dset in tokenized.items():
    print(f"Processing split: {split}")
    # Total token count of the split; accumulated as uint64, then turned into a
    # plain int so it can be used as a shape and compared with `idx` below.
    arr_len = int(np.sum(dset['len'], dtype=np.uint64))
    filename = os.path.join(output_dir, f'{split}.bin')
    dtype = np.uint16 # token ids are stored as uint16, so every id must be below 2**16
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
    # Never request more shards than there are rows: an empty shard would leave
    # np.concatenate with nothing to join.
    total_batches = min(200, len(dset))

    idx = 0
    for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
        # Batch together samples for faster write
        batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
        arr_batch = np.concatenate(batch['ids'])
        # Write the concatenated data into memmap
        arr[idx:idx+len(arr_batch)] = arr_batch
        idx += len(arr_batch)
    # Every token counted in 'len' must have been written exactly once.
    assert idx == arr_len, f"wrote {idx} tokens to {filename}, expected {arr_len}"
    arr.flush()

# To read the bin files later, e.g. with numpy
# m = np.memmap('train.bin', dtype=np.uint16, mode='r')

Processing split: train


writing /content/drive/MyDrive/Colab Notebooks/Transformers/train.bin: 100%|██████████| 200/200 [00:01<00:00, 118.88it/s]


Processing split: val


writing /content/drive/MyDrive/Colab Notebooks/Transformers/val.bin: 100%|██████████| 200/200 [00:00<00:00, 329.94it/s]


In [44]:
from torch.utils.data import Dataset
import numpy as np
import torch

class MemmapDataset(Dataset):
    """Next-token-prediction windows over a flat binary file of uint16 token ids.

    Item ``i`` is the pair ``(tokens[s:s+seq_len], tokens[s+1:s+seq_len+1])``
    with ``s = i * stride``: the target is the input shifted one token ahead.

    Args:
        data_dir: Path of the binary token file (despite the name, this is the
            file itself, not a directory).
        seq_len: Number of tokens in every input and target sequence.
        stride: Distance, in tokens, between the starts of consecutive windows.
            The default of 1 makes every token offset a sample;
            ``stride=seq_len`` yields non-overlapping windows.

    Raises:
        ValueError: if ``seq_len`` or ``stride`` is smaller than 1.
    """

    def __init__(self, data_dir, seq_len, stride=1):
        if seq_len < 1 or stride < 1:
            raise ValueError("seq_len and stride must both be >= 1")
        self.dataset = np.memmap(data_dir, dtype=np.uint16, mode='r')
        self.seq_len = seq_len
        self.stride = stride
        # A full sample needs seq_len + 1 tokens (input plus the shifted target).
        last_start = len(self.dataset) - seq_len - 1
        # Files too short for a single window give an empty dataset, not a negative length.
        self.total_length = last_start // stride + 1 if last_start >= 0 else 0

    def __len__(self):
        return self.total_length

    def __getitem__(self, idx):
        """Return ``(input, target)`` as two int64 tensors of length ``seq_len``.

        Negative indices count from the end. Out-of-range indices raise
        IndexError, which also lets plain ``for`` iteration over the dataset stop.
        """
        if idx < 0:
            idx += self.total_length
        if not 0 <= idx < self.total_length:
            raise IndexError(f"index {idx} out of range for dataset of length {self.total_length}")
        start = idx * self.stride
        # astype() copies out of the read-only memmap into a fresh writable array,
        # so from_numpy can wrap it without copying again.
        input_sequence = self.dataset[start:start+self.seq_len].astype(np.int64)
        target_sequence = self.dataset[start+1:start+self.seq_len+1].astype(np.int64)
        return torch.from_numpy(input_sequence), torch.from_numpy(target_sequence)


In [47]:
from torch.utils.data import DataLoader

# Token files of the two splits.
train_dir = '/content/drive/MyDrive/Colab Notebooks/Transformers/train.bin'
val_dir = '/content/drive/MyDrive/Colab Notebooks/Transformers/val.bin'

# Both splits are cut into windows of 256 tokens.
train_dataset, val_dataset = (
    MemmapDataset(token_file, seq_len=256) for token_file in (train_dir, val_dir)
)

# Batches of 256 windows loaded by 4 worker processes; only the training split is shuffled.
train_dataloader, val_dataloader = (
    DataLoader(split_dataset_, batch_size=256, shuffle=do_shuffle, num_workers=4)
    for split_dataset_, do_shuffle in ((train_dataset, True), (val_dataset, False))
)

In [48]:
# Pull a single batch from the training DataLoader as a smoke test.
sample_batch = next(iter(train_dataloader))
input_sequences, target_sequences = sample_batch

# Show, in order: both shapes, both Python types, then the raw token ids.
print(input_sequences.shape, target_sequences.shape, sep="\n")
print(f"Input type: {type(input_sequences)}")
print(f"Target type: {type(target_sequences)}")
print(input_sequences, target_sequences, sep="\n")

torch.Size([256, 256])
torch.Size([256, 256])
Input type: <class 'torch.Tensor'>
Target type: <class 'torch.Tensor'>
tensor([[  606,    11,   318,  ...,   262, 11241,  2569],
        [   13,   198,   198,  ...,    82,  2003,    13],
        [  247,  1776,  3469,  ...,   262,   781,  2283],
        ...,
        [23746,   326,   318,  ...,   470,   772,  1969],
        [  714,   307,   366,  ...,  3421,   262,  3173],
        [   13,   279,   463,  ...,   416,   257,  4618]])
tensor([[   11,   318,   625,  ..., 11241,  2569,  3241],
        [  198,   198, 34831,  ...,  2003,    13,   198],
        [ 1776,  3469,   357,  ...,   781,  2283,    13],
        ...,
        [  326,   318, 22368,  ...,   772,  1969,   284],
        [  307,   366,   259,  ...,   262,  3173,   329],
        [  279,   463,  3970,  ...,   257,  4618,  3693]])


In [56]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters (module-level globals read directly by the model classes below)
batch_size = 256 # how many independent sequences will be processed in parallel
# NOTE(review): the DataLoaders hard-code batch_size=256 instead of reading this value.
# With 256 x 256 tokens the float32 logits alone take 256*256*50258*4 bytes ~= 12.27 GiB,
# which matches the allocation reported in the recorded CUDA OutOfMemoryError.
seq_len = 256 # the maximum context length for predictions
# NOTE(review): sizes the token embedding and the output layer; tiktoken's "gpt2"
# encoding is expected to have 50257 tokens, so confirm why this is 50258.
vocab_size = 50258
learning_rate = 3e-4 # step size passed to the AdamW optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu' # use the GPU when one is available
n_embd = 384 # same as d_model
n_head = 8 # attention heads per block (each head has size n_embd // n_head)
n_layer = 8 # number of Block modules stacked in the model
dropout = 0.2 # dropout probability used in the attention and feed-forward modules
EPOCHS = 20 # number of passes over train_dataloader in the training loop

In [57]:
class Head(nn.Module):
    """A single head of causal (masked) self-attention.

    Reads the module-level hyperparameters ``n_embd``, ``seq_len`` and ``dropout``.

    Args:
        head_size: Width of the key/query/value projections of this head.
    """

    def __init__(self, head_size):
        super().__init__()
        # Bias-free projections from the embedding width down to the head width.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones: position t may only look at positions <= t.
        lower_triangle = torch.ones(seq_len, seq_len).tril()
        self.register_buffer('tril', lower_triangle)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, C) and return a (B, T, head_size) tensor."""
        _, n_tokens, _ = x.shape
        queries = self.query(x)  # (B, T, hs)
        keys = self.key(x)       # (B, T, hs)
        values = self.value(x)   # (B, T, hs)

        # Scaled dot product: (B, T, hs) @ (B, hs, T) -> (B, T, T), scaled by hs ** -0.5.
        scale = keys.shape[-1] ** -0.5
        weights = torch.matmul(queries, keys.transpose(-2, -1)) * scale
        # Future positions get -inf so softmax assigns them zero weight.
        is_future = self.tril[:n_tokens, :n_tokens] == 0
        weights = weights.masked_fill(is_future, float('-inf'))
        weights = self.dropout(F.softmax(weights, dim=-1))

        # Weighted sum of the values: (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return torch.matmul(weights, values)

class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, followed by an output projection.

    Reads the module-level hyperparameters ``n_embd`` and ``dropout``.

    Args:
        num_heads: Number of ``Head`` modules to create.
        head_size: Output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_modules = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_modules)
        # Projects the concatenated head outputs (d_k * h) back to d_model.
        concat_width = head_size * num_heads
        self.w_o = nn.Linear(concat_width, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate every head's output on the last axis, project it, apply dropout."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.w_o(merged)
        return self.dropout(projected)

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the width, ReLU, project back, dropout.

    Reads the module-level hyperparameter ``dropout``.

    Args:
        n_embd: Input and output width of the MLP.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        return self.net(x)

class Block(nn.Module):
    """One Transformer block: pre-norm self-attention, then a pre-norm MLP, each with a residual.

    Args:
        n_embd: Embedding width of the block's input and output.
        n_head: Number of attention heads; each head has width ``n_embd // n_head``.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        # Stock LayerNorm for now (a hand-written version is planned).
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        after_attention = x + self.sa(self.ln1(x))
        return after_attention + self.ffwd(self.ln2(after_attention))

class GPTLanguageModel(nn.Module):
    """Decoder-only Transformer language model.

    Token and learned position embeddings are summed, passed through ``n_layer``
    ``Block`` modules and a final LayerNorm, then projected to vocabulary logits.
    Sizes come from the module-level hyperparameters ``vocab_size``, ``n_embd``,
    ``seq_len``, ``n_head`` and ``n_layer``.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(seq_len, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and Linear biases to zero."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token ids, with T at most the context length.
            targets: Optional (B, T) tensor of target token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits have shape
            (B, T, vocab_size) and loss is None. With targets, logits are
            flattened to (B*T, vocab_size) and loss is a scalar tensor.

        Raises:
            ValueError: if T exceeds the number of learned positions.
        """
        B, T = idx.shape
        max_positions = self.position_embedding_table.num_embeddings
        if T > max_positions:
            raise ValueError(
                f"sequence length {T} exceeds the model's context length {max_positions}"
            )

        tok_emb = self.token_embedding_table(idx) # (B, T, C) == (Batchsize, seqlen, dmodel)
        # Build the positions on the same device as the input instead of the
        # global `device`, so the model works wherever it and its inputs live.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x) # (B, T, C)
        x = self.ln_f(x) # (B, T, C)
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, C) logits and (N,) targets.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens autoregressively and append them to ``idx``.

        Runs without gradient tracking and with the model in eval mode, so
        dropout does not perturb sampling. The previous train/eval mode is
        restored on exit.

        Args:
            idx: (B, T) tensor of token ids used as the prompt.
            max_new_tokens: Number of tokens to sample.

        Returns:
            (B, T + max_new_tokens) tensor of token ids.
        """
        # Use the model's own context length rather than the global `seq_len`.
        max_positions = self.position_embedding_table.num_embeddings
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # Crop the prompt to the last `max_positions` tokens the model can attend to.
                idx_cond = idx[:, -max_positions:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :] # keep only the prediction for the last position
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx

In [None]:
# Install/upgrade Weights & Biases quietly, import it, and log this session in
# so the training run can be tracked.
!pip install wandb -qU
import wandb
wandb.login()

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.3/277.3 kB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Start a Weights & Biases run and record the run configuration.
# The values are read from the notebook's own variables so the logged config
# cannot drift from what is trained; the loader is `train_dataloader`
# (no `train_loader` exists in this notebook).
wandb.init(
    project="gpt-2-training-project",

    config={
    "learning_rate": learning_rate,
    "architecture": "Decoder-only Transformer",
    "dataset": "OpenWebText",
    "EPOCHS": EPOCHS,
    "Iterations per Epoch": len(train_dataloader),
    }
)

In [59]:
# Build the model, move it to the selected device, and report how many
# parameters will be trained.
model = GPTLanguageModel().to(device)
print(sum(param.numel() for param in model.parameters() if param.requires_grad))

52933970


In [60]:
# AdamW over all model parameters, using the `learning_rate` hyperparameter.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [63]:
# alexLLM dimensions: (batch, seqlen, d_model) (2240, 1024, 784)
total_batches = len(train_dataloader) # number of batches in one epoch
log_interval = 10000 # print the running training loss every this many batches

for epoch in range(EPOCHS):
    running_loss = 0.0
    # Number of batches accumulated in running_loss since the last log line.
    # The first log fires at i == 0 after a single batch, so dividing by
    # log_interval there would understate the loss by a factor of 10000.
    batches_since_log = 0

    for i, batch in enumerate(train_dataloader):
        input_sequences, target_sequences = batch
        input_sequences = input_sequences.to(device)
        target_sequences = target_sequences.to(device)

        logits, loss = model(input_sequences, target_sequences)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        batches_since_log += 1

        if i % log_interval == 0:
            avg_loss = running_loss / batches_since_log
            print(f"Epoch: {epoch+1}/{EPOCHS}, Batch: {i+1}/{total_batches}, Loss: {avg_loss}")
            # wandb.log({"Training Loss": avg_loss})
            running_loss = 0.0
            batches_since_log = 0

    # Evaluate on the validation split once per epoch.
    val_loss = estimate_loss()
    print(f"Epoch: {epoch+1}/{EPOCHS}, Validation Loss: {val_loss}")
    # wandb.log({"Validation Loss": val_loss})

    # Save the model weights (bare state_dict) after every epoch.
    torch.save(model.state_dict(), f"model_epoch_{epoch+1}.pth")

wandb.finish()

  self.pid = os.fork()


OutOfMemoryError: CUDA out of memory. Tried to allocate 12.27 GiB. GPU 0 has a total capacity of 22.17 GiB of which 2.70 GiB is free. Process 4888 has 19.46 GiB memory in use. Of the allocated memory 18.89 GiB is allocated by PyTorch, and 354.44 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [61]:
@torch.no_grad()
def estimate_loss():
    """Return the mean per-batch loss of the global ``model`` over ``val_dataloader``.

    Runs in eval mode without gradient tracking. The model's previous
    train/eval mode is restored afterwards, even if evaluation raises.

    Returns:
        float: Average of the per-batch losses, or NaN when the loader yields
        no batches (instead of raising ZeroDivisionError).
    """
    was_training = model.training
    model.eval()
    running_loss = 0.0
    num_batches = 0
    try:
        for val_input_sequences, val_target_sequences in val_dataloader:
            val_input_sequences = val_input_sequences.to(device)
            val_target_sequences = val_target_sequences.to(device)

            _, loss = model(val_input_sequences, val_target_sequences)
            running_loss += loss.item()
            num_batches += 1
    finally:
        model.train(was_training)

    if num_batches == 0:
        return float('nan')
    return running_loss / num_batches

In [None]:
def load_checkpoint(model, optimizer, filename="gpt_checkpoint_1.53loss.pth"):
    """Restore model (and, when present, optimizer) state from ``filename``.

    Two layouts are accepted:
      * a dict with 'model_state_dict' and 'optimizer_state_dict' keys, as
        written by ``save_checkpoint``;
      * a bare model state_dict, as written by
        ``torch.save(model.state_dict(), ...)``. The optimizer is then left untouched.

    Tensors are mapped onto the device of the model's first parameter, so a
    checkpoint saved on a GPU can also be loaded on a CPU-only runtime.

    Args:
        model: Module whose parameters are overwritten.
        optimizer: Optimizer whose state is overwritten (full checkpoints only).
        filename: Path of the checkpoint file.
    """
    first_param = next(model.parameters(), None)
    map_location = first_param.device if first_param is not None else None
    # SECURITY: torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(filename, map_location=map_location)
    if 'model_state_dict' in checkpoint:
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        # Bare state_dict: only the model weights are available.
        model.load_state_dict(checkpoint)
    print(f"Checkpoint loaded from {filename}")

# Restore `model` and `optimizer` from the default checkpoint file, resolved
# relative to the current working directory.
load_checkpoint(model, optimizer)

In [None]:
def save_checkpoint(model, optimizer, filename="gpt_checkpoint_1.53loss.pth"):
    """Write the model and optimizer state dicts to ``filename`` in one file.

    The file holds a dict with the keys 'model_state_dict' and
    'optimizer_state_dict', which is the layout ``load_checkpoint`` reads.

    Args:
        model: Module whose ``state_dict()`` is saved.
        optimizer: Optimizer whose ``state_dict()`` is saved.
        filename: Destination path of the checkpoint file.
    """
    state = {}
    state['model_state_dict'] = model.state_dict()
    state['optimizer_state_dict'] = optimizer.state_dict()
    torch.save(state, filename)
    print(f"Checkpoint saved to {filename}")

# Save the current `model` and `optimizer` state to the default checkpoint
# file, resolved relative to the current working directory.
save_checkpoint(model, optimizer)

In [None]:
# Start generation from a single token with id 0 and sample 500 more tokens.
# The ids are turned back into text with the tiktoken encoder `enc`; the
# notebook never defines a bare `decode` function, so calling one raises NameError.
# NOTE(review): `vocab_size` is 50258; if the model ever samples an id the
# tokenizer does not know, `enc.decode` may fail. Confirm the vocabulary size.
context = torch.zeros((1,1), dtype=torch.long, device=device)
print(enc.decode(model.generate(context, max_new_tokens=500)[0].tolist()))