# Learning how Large Lanugage Models work - GPT1 From Scratch

## Introduction

With the rise of Large Language Models (LLMs) in recent years, I wanted to dive deeper into how they work. This project is a personal exploration of the GPT architecture, not focused on evaluating model performance but rather on breaking down the key components and understanding the process. The goal is to implement a GPT-1 model from scratch using the PyTorch library to grasp the fundamental concepts behind its architecture.

## Importing Libraries

In [1]:
# Pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F

#Other libraries
import re
from datasets import load_dataset
from collections import Counter
import itertools

## Parameters

The parameters used here are to be used to adjust the tokenization parameters, or top adjust the model hyperparameters.

In [2]:
#Tokenization Parameters
tokenize_setting = 'chars' # tokenize either by character or words
token_length = 2 # currently only 1 or 2
iterations = 1000000 # early stopping iterations of the streaming dataset to limit the amount of memory usage

# Model Hyperparameters
blocksize = 8 # how many blocks that the model will be processing each iteration
batchsize = 8 # how many batches that the model will be processing each iteration
max_iters = 10000 # iterations of the number of 
learning_rate = 3e-4 # incremental rate for gradient descent
eval_iters = 200 # used to show the training iterations for evaluating the val loss
device = 'cpu'
n_embd = 784 # dimensions of the embeddings for each token
n_layer = 4 # number of neural network layers that the encoded data will be passed through
n_head = 8 # number of multi-attention heads that will be looking at each query/key
dropout = 0.2 # percentage that will be zero-ed out to regularize the tensors  

## Text Corpus

For the training dataset, I'm using the streaming dataset openwebtext from HuggingFace as the training corpus. 

The dataset is stored as an iterable dataset for as it is very large in size, and iterating over the data will ensure that the text won't fully crash my RAM as a result.

In [3]:
# Loading the dataset from huggingface
dataset = load_dataset('Skylion007/openwebtext', split = 'train', streaming = True)



In [4]:
dataset

IterableDataset({
    features: ['text'],
    n_shards: 21
})

In [5]:
# text = iter(dataset)

# for i, sample in zip(itertools.count(), dataset):
#     sample = next(text)
#     print(sample['text'])

## Tokenization of Corpus

In this project, I chose to create a custom tokenizer and encoding/decoding functions rather than using the built-in auto-tokenizer from the HuggingFace library, as part of my effort to gain a deeper understanding of how these components function.

In [6]:
# Tokenizer function
def tokenize(text, token_length=token_length, tokenize_by=tokenize_setting):
    tokens = []
            
    if tokenize_by == 'chars':
        # Tokenize by characters
        count = 0
        while count < len(text):
            if text[count].isspace() or not text[count].isalnum():  # Treat spaces and punctuation as individual tokens
                tokens.append(text[count])  # Correctly append the token from the sample
                count += 1
            else:
                if count + token_length <= len(text):
                    tokens.append(text[count:count + token_length])
                    count += token_length
                else:
                    tokens.append(text[count])
                    count += 1
    
    elif tokenize_by == 'words':
        # Tokenize by words
        words = text.split()
        if token_length == 1:
            tokens.extend(words)  # Append word tokens to the list
        elif token_length == 2:
            # Combine every two words into a token
            for j in range(0, len(words), 2):
                if j + 1 < len(words):
                    tokens.append(words[j] + ' ' + words[j + 1])
                else:
                    tokens.append(words[j])
    
    else:
        raise ValueError("Invalid tokenize_by parameter. Use 'chars' or 'words'.")
    
    return tokens



# Build vocabulary

def build_vocab(dataset, token_length = token_length, tokenize_by=tokenize_setting):
    vocab_counter = Counter()
    text = iter(dataset)  # Streaming dataset as an iterator

    # Add special tokens
    special_tokens = ['<SOS>', '<UNK>']
    vocab_counter.update(special_tokens)
    
    # Process each sample in the streaming dataset

    for i, sample in zip(itertools.count(), dataset):
        sample = next(text)['text']  # Get the actual text data from the sample for that iteration

        tokens =tokenize(sample, token_length, tokenize_by)
    
        
        # Update the vocab with token counts
        vocab_counter.update(tokens)
        
        # Limit the number of samples for building the vocab to avoid processing the entire dataset (optional)
        if i > iterations: 
            break
    
    # Create vocab mapping (token to integer)
    word_to_id = {word: idx for idx, (word, _) in enumerate(vocab_counter.items())}
    id_to_word = {idx: word for word, idx in word_to_id.items()}
    
    return word_to_id, id_to_word


def encode(text, word_to_id, token_length=token_length, tokenize_by=tokenize_setting):
 
    if tokenize_by == 'chars':
        tokens = tokenize(text, token_length=token_length, tokenize_by='chars')
    elif tokenize_by == 'words':
        tokens = tokenize(text, token_length=token_length, tokenize_by='words')
    else:
        raise ValueError("Invalid tokenize_by parameter. Use 'chars' or 'words'.")
    
    # Encode tokens into IDs, using <UNK> for unknown tokens
    return [word_to_id.get(token, word_to_id['<UNK>']) for token in tokens]  # Use <UNK> for unknown tokens


# Decode token IDs back to text
def decode(encoded_tokens, id_to_word, tokenize_by=tokenize_setting):

    if tokenize_by == 'chars':
        # Decode character-based tokens
        tokens = [id_to_word.get(token_id, '<UNK>') for token_id in encoded_tokens]
        text = ''.join(tokens)  # Join characters without spaces
    elif tokenize_by == 'words':
        # Decode word-based tokens
        tokens = [id_to_word.get(token_id, '<UNK>') for token_id in encoded_tokens]
        text = ' '.join(tokens)  # Join words with a space
    else:
        raise ValueError("Invalid tokenize_by parameter. Use 'chars' or 'words'.")
    
    return text


After which, we start to build our own dictionary based on the text corpus provided from our dataset. This will allow us to decode the outputs of the model back into plain english

In [7]:
# Building our dictionary of tokens and IDs that will be used as reference
word_to_id, id_to_word = build_vocab(dataset, token_length=token_length, tokenize_by=tokenize_setting)

In [None]:
# Setting the vocab_size
vocab_size = len(word_to_id)

In [None]:
vocab_size

445624

## Encoding our training dataset 

We now have our dictionary of tokens, and now we will start encoding our data. Seeing as it is a streaming dataset, we will need to loop through to obtain the required samples needed.

In [None]:
text = iter(dataset)
encoded_data_list = []
for i, sample in zip(itertools.count(), dataset):
    sample = next(text)['text']
    encoded_sample = torch.tensor(encode(sample, word_to_id), dtype = torch.long)

    encoded_data_list.append(encoded_sample)

    if i > iterations:
        break

: 

In [None]:
# Assuming encoded_data_list contains your encoded tensors
total_samples = len(encoded_data_list)

# Calculate the index for 99% of the data
split_idx = int(0.99 * total_samples)
train_data = encoded_data_list[:split_idx]
val_data = encoded_data_list[split_idx:]

# def get_batch(split):
#     data = train_data if split =='train' else val_data
#     ix = torch.randint(len(data) - blocksize, (batchsize,))
#     x = torch.stack([data[i:i+blocksize] for i in ix])
#     y = torch.stack([data[i+1:i+blocksize+1] for i in ix])
#     x, y = x.to(device), y.to(device)
#     return x, y

def get_batch(split):
    # Select the appropriate dataset (train or validation)
    data = train_data if split == 'train' else val_data
    
    # Randomly select batchsize samples from the data
    ix = torch.randint(0, len(data), (batchsize,))
    
    # Initialize lists to collect batches
    x_batch = []
    y_batch = []
    
    for i in ix:
        sample = data[i]
        
        # Ensure the sample has enough length for the blocksize
        if len(sample) > blocksize:
            # Randomly select a start point within the sample
            start_idx = torch.randint(0, len(sample) - blocksize, (1,)).item()
            
            # Extract the input (x) and the target (y) sequences
            x_sample = sample[start_idx:start_idx + blocksize]
            y_sample = sample[start_idx + 1:start_idx + blocksize + 1]
        else:
            # If the sample is shorter than blocksize, use the whole sample
            x_sample = sample[:-1]  # Exclude last token
            y_sample = sample[1:]   # Shift by one
            
        # Add to batch lists
        x_batch.append(x_sample)
        y_batch.append(y_sample)
    
    # Stack the tensors to form a batch
    x = torch.stack(x_batch)
    y = torch.stack(y_batch)
    
    # Move tensors to the appropriate device (GPU/CPU)
    x, y = x.to(device), y.to(device)
    
    return x, y

x, y = get_batch('train')

print(x)
print(y)

In [None]:
# Function to estimate the loss of the model
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y  = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [None]:
class Head(nn.Module):

    def __init__(self,head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias = False)
        self.query = nn.Linear(n_embd, head_size, bias = False)
        self.value = nn.Linear(n_embd, head_size, bias = False)

        self.register_buffer('tril', torch.tril(torch.ones(blocksize, blocksize)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):

        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2,-1) * k.shape[-1] ** -0.5
        wei = wei.masked_fill(self.tril[:T, :T]==0, float('-inf'))
        wei = F.softmax(wei, dim = -1)
        wei = self.dropout(wei)

        v = self.value(x)
        out = wei @ v
        return out


In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim = -1)
        out = self.dropout(self.proj(out))
        return out

In [None]:
class FeedForward(nn.Module):

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(n_embd, 4 * n_embd), nn.ReLU(), nn.Linear(4 * n_embd, n_embd), nn.Dropout(dropout),)

    def forward(self, x):
        return self.net(x)

In [None]:
class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        y = self.sa(x)
        x = self.ln1(x+y)
        y = self.ffwd(x)
        x = self.ln2(x+y)
        return x

In [None]:
class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd) #create embedding table
        self.position_embedding_table = nn.Embedding(blocksize, n_embd) #create position embedding table
        self.blocks = nn.Sequential(*[Block(n_embd, n_head = n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) #final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size) #Linearlize at the end

        self.apply(self._init_weights)

    def _init_weights (self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)

    def forward(self, index, targets=None):
        B, T = index.shape
        
        
        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(index) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            index_cond = index[:, -blocksize:]
            # get the predictions
            logits, loss = self.forward(index_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index



In [None]:
model = GPTLanguageModel(vocab_size)
m = model.to(device)

In [None]:
# Initialize parameters for early stopping
early_stopping_patience = 3
best_val_loss = float('inf')  # Initialize with infinity
no_improve_count = 0  # Counter for consecutive steps without improvement

optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        train_loss = losses['train']
        val_loss = losses['val']
        print(f"step: {iter}, train loss: {train_loss:.3f}, val loss: {val_loss:.3f}")
        
        # Check for improvement
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            no_improve_count = 0  # Reset counter if improvement
        else:
            no_improve_count += 1  # Increment counter if no improvement
        
        # Check early stopping condition
        if no_improve_count >= early_stopping_patience:
            print(f"Early stopping at step {iter} due to no improvement in validation loss.")
            break  # Exit the training loop if early stopping criteria met

    xb, yb = get_batch('train')
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(f"Final loss: {loss.item()}")

In [None]:
prompt = "There has been a fire."
context = torch.tensor(encode(prompt, word_to_id), dtype = torch.long, device = device)
generated_chars = decode(m.generate(context.unsqueeze(0) , max_new_tokens = 500)[0].tolist(), id_to_word)
print(generated_chars)