# Tutorial: building our own GPT, by Andrej Karpathy

https://www.youtube.com/watch?v=kCc8FmEb1nY&t=2581s

In [1]:
# Read the whole training corpus into memory as one string
with open("input.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

In [7]:
# Report the corpus size, using "_" as the thousands separator
dataset_length = len(text)
print(f"Length of dataset in characters: {dataset_length:_}")

Length of dataset in characters: 1_115_394


In [None]:
# Let's look at the first 1000 characters
preview = text[:1000]
print(preview)

In [9]:
# The vocabulary is every distinct character that occurs in the dataset, sorted
distinct_characters = set(text)
unique_characters = sorted(distinct_characters)
vocab_size = len(unique_characters)

print(f"Unique characters in dataset: {vocab_size:_}")
print("".join(unique_characters))

Unique characters in dataset: 65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [10]:
# Tokenise text
# For our usecase we are simply going to use a single character as a token

# Build the two lookup tables: a character's integer id is its position in the
# sorted vocabulary, and the reverse table maps that id back to the character.
int_to_str = dict(enumerate(unique_characters))
str_to_int = {char: i for i, char in int_to_str.items()}

# Define our encoding and decoding functions
def encode(text):
    """Translate a string into a list of integer token ids.

    Each character is looked up in the ``str_to_int`` mapping; a character that
    is not in the mapping raises ``KeyError``.
    """
    token_ids = []
    for char in text:
        token_ids.append(str_to_int[char])
    return token_ids

def decode(text):
    """Translate a sequence of integer token ids back into a string.

    Despite its name, ``text`` is the sequence of integers to decode; each one
    is looked up in the ``int_to_str`` mapping and the characters are joined.
    """
    characters = (int_to_str[token_id] for token_id in text)
    return ''.join(characters)

# Round-trip example: encode a string, then decode the ids back to the string
example_tokens = encode("hii there")
print(example_tokens)
print(decode(example_tokens))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [16]:
# Encode our dataset and store it in a torch.Tensor
# A torch.Tensor is a multi-dimensional matrix containing elements of a single data type.
import torch

# Encode the full text and hold it as one long 1-D tensor of integer token ids
encoded_text = encode(text)
data = torch.tensor(encoded_text, dtype=torch.long)

print(data.shape, data.dtype)
print(f"First 30 characters in dataset:\n\n{text[:30]}\n\n")
print(f"First 30 characters encoded:\n{data[:30]}")


torch.Size([1115394]) torch.int64
First 30 characters in dataset:

First Citizen:
Before we proce


First 30 characters encoded:
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43])


In [17]:
# Split the data: the first 90% is for training, the remaining 10% for validation
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

In [25]:
# We now want to feed in our training data into the transformer model.
# It's too computationally expensive to load the entire dataset into the model at once.
# Instead, we will take 'random' chunks (of block_size) of the dataset and feed them into the model.

block_size = 8

# One extra token is taken so the last context position still has a target
sample = train_data[:block_size + 1]

print("Take a single sample from the dataset + 1")
print("Note that this has multiple examples packed into it (in a chunk of 9 chars, there are 8 examples):")
print(f"Total sample: {sample}\n")

# Hardcoded printing out the examples
# print(f"After {sample[:1]} follows {sample[1:2]}")
# print(f"After {sample[:2]} follows {sample[2:3]}")
# print(f"After {sample[:3]} follows {sample[3:4]}")
# print("etc...")

# Dynamically run through a sample
def print_examples(sample, block_size=None):
    """Print every (context, target) example packed into a sample.

    :param sample: Indexable sequence of tokens; position ``i + 1`` is the
        target for the context ``sample[:i + 1]``.
    :param block_size: Number of examples to print. Defaults to
        ``len(sample) - 1``, i.e. every example the sample contains.
    """
    num_examples = len(sample) - 1 if block_size is None else block_size

    # target_pos is the index of the token being predicted; everything before
    # it is the context the prediction is based on.
    for target_pos in range(1, num_examples + 1):
        context, target = sample[:target_pos], sample[target_pos]
        print(f"When input is {context} the target is: {target}")

# block_size is left at its default, so every example in the sample is printed
print_examples(sample, block_size=None)

Take a single sample from the dataset + 1
Note that this has multiple examples packed into it (in a chunk of 9 chars, there are 8 examples):
Total sample: tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

When input is tensor([18]) the target is: 47
When input is tensor([18, 47]) the target is: 56
When input is tensor([18, 47, 56]) the target is: 57
When input is tensor([18, 47, 56, 57]) the target is: 58
When input is tensor([18, 47, 56, 57, 58]) the target is: 1
When input is tensor([18, 47, 56, 57, 58,  1]) the target is: 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is: 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is: 58


In [32]:
# To efficiently train our model, we will load in the samples in batches.
# This is because GPUs are good at parallel processing and can process multiple samples at once.
# Note that there is no dependency between samples in a batch, so they can easily be processed in parallel.

# Set the seed for reproducibility. This will ensure the torch.randint
# will always generate the same random numbers and we can compare our
# output to the video.
torch.manual_seed(1337)

# batch_size: how many independent sequences to process in parallel
# block_size: the maximum context length (size of a single sample) for predictions
batch_size, block_size = 4, 8

def get_batch(data: torch.Tensor, batch_size: int, block_size: int):
    """Draw one random batch of (input, target) sequences from ``data``.

    Each target sequence is its input sequence shifted one position to the
    right, so ``target[b, t]`` is the token that follows ``input[b, t]``.

    :param data: The data to sample from.
    :param batch_size: The number of independent sequences to process in parallel.
    :param block_size: The maximum context length (size of a single sample) for predictions.

    :return: A tuple ``(input_batch, target_batch)``, each stacked from
        ``batch_size`` slices of length ``block_size``.
    """
    # The upper bound of randint is exclusive, so the largest start offset is
    # len(data) - block_size - 1, which leaves room for the shifted target slice.
    num_valid_starts = len(data) - block_size
    start_offsets = torch.randint(num_valid_starts, (batch_size,))

    contexts = []
    next_tokens = []
    for start in start_offsets:
        stop = start + block_size
        contexts.append(data[start:stop])
        next_tokens.append(data[start + 1:stop + 1])

    return torch.stack(contexts), torch.stack(next_tokens)

input_batch, target_batch = get_batch(train_data, batch_size, block_size)
print(f"Batch input shape: {input_batch.shape}\n{input_batch}\n\n")
print(f"Batch target shape: {target_batch.shape}\n{target_batch}\n\n")

print("------")

# Walk every sequence in the batch and spell out each (context -> target) pair
for input_row, target_row in zip(input_batch, target_batch):
    for position in range(block_size):
        context = input_row[:position + 1]
        target = target_row[position]
        print(f"When input is {context.tolist()} the next likely token (target) is {target}")


input_start_idx=tensor([ 76049, 234249, 934904, 560986])
Batch input shape: torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


Batch target shape: torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


------
When input is [24] the next likely token (target) is 43
When input is [24, 43] the next likely token (target) is 58
When input is [24, 43, 58] the next likely token (target) is 5
When input is [24, 43, 58, 5] the next likely token (target) is 57
When input is [24, 43, 58, 5, 57] the next likely token (target) is 1
When input is [24, 43, 58, 5, 57, 1] the next likely token (target) is 46
When input is [24, 43, 58, 5, 57, 1, 46] the next likely token (target) is 43
When input is [24, 43, 58, 5, 57, 1, 46, 43] the 

In [45]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Seed before the model is created so its initial weights are reproducible
torch.manual_seed(1337)

# The batch prepared earlier is what gets fed into the model.
input_batch_report = f"Input batch:\n{input_batch}\n\n"
print(input_batch_report)

# Start with the simplest model (according to Andrej), a bigram model.
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()

        # Create token embedding table.
        # It will basically be a matrix of size (vocab_size, vocab_size)
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx: torch.Tensor, targets: torch.Tensor | None = None):
        # idx and targets are both (B, T) tensors of integers

        # We retrieve the token embeddings by index.
        # An embedding is 'Batch by Time by Channel'
        # In this case batch size is 4, time is 8 and channel is vocal_size=65
        # Logits are basically the scores for the next character in the sequence.
        logits: torch.Tensor = self.token_embedding_table(idx) # (Batch, Time, Channel)

        # We want to calculate the loss for the model.
        # Basically we are asking the question: How well did the model predict the
        # targets, based on the logits.
        if targets is None:
            loss = None
        else:
            # NOTE: cross_entropy expects Batch*Time by Channel
            batch, time, channel = logits.shape
            logits = logits.view(batch*time, channel)
            targets = targets.view(batch*time)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, idx: torch.Tensor, max_new_tokens: int):
        """Generate model predictions for a given input sequence.
        
        :param idx: The current index for a batch (Batch by Time)
        :param max_new_tokens: The maximum number of new tokens to generate
        """

        for _ in range(max_new_tokens):
            # Get the predictions
            # NOTE: self(idx) will end up calling the self.forward method
            logits, _ = self(idx)

            # Focus only on the last time step. 
            logits = logits[:, -1, :] # becomes (Batch, Channel)

            # Apply softmax to get probabilities
            probabilities = F.softmax(logits, dim=-1) # (Batch, Channel)

            # Sample from the distribution (probabilities)
            # We will get a batch by 1 tensor of integers where each batch will
            # have a single prediction for the next token
            idx_next = torch.multinomial(probabilities, num_samples=1) # (Batch, 1)

            # Append sampled index to the running sequence
            idx = torch.cat([idx, idx_next], dim=1) # (Batch, Time+1)
        
        return idx
    

model = BigramLanguageModel(vocab_size)

# Run the batch through the untrained model to see the logits shape and initial loss
logits, loss = model(input_batch, target_batch)
print(f"Output shape: {logits.shape}")
print(f"Loss: {loss}")

# Kick off generation from a (1, 1) tensor holding the single token id 0
idx = torch.zeros((1, 1), dtype=torch.long)

# Generate 100 new tokens, then take the only sequence in the batch as a plain
# python list so it can be passed to our decode function.
# The model is untrained at this point, so the output will be random.
generated = model.generate(idx, max_new_tokens=100)
new_tokens = generated[0].tolist()
print(f"Generated new tokens:\n{decode(new_tokens)}")


Input batch:
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


Output shape: torch.Size([32, 65])
Loss: 4.878634929656982
Generated new tokens:

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [49]:
# Let's train the model

# Define the optimizer.
# A typical learning rate, as suggested in the video, is 3e-4 (3 times 10 to
# the power of -4), but since this is a very small network we can use a
# higher learning rate.
learning_rate = 1e-3
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)
print(optimizer)

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0.01
)


In [51]:
# Run through steps of training
# Note that the model class instance will be updated every time you run this cell.

batch_size = 32
training_steps = 10_000

for step in range(training_steps):
    # Clear the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    # Sample a fresh random batch of data and evaluate the loss on it
    input_batch, target_batch = get_batch(train_data, batch_size, block_size)
    logits, loss = model(input_batch, target_batch)

    # Backpropagate and update the model parameters
    loss.backward()
    optimizer.step()

# Loss of the final training step only
print(loss.item())


input_start_idx=tensor([137792, 214152, 848625,  30643,  30298,  67982, 199748,  80731,  27473,
         51899, 529598, 500284, 193394, 770456, 708670, 145570, 370351, 980757,
        773155, 181893, 758929, 161876,   8548, 912856, 181962, 517749, 569949,
        718933, 360846, 938909, 926620, 444214])
input_start_idx=tensor([ 975369,  416293,  333760,  696196,  619234,  871899,  398106,   61981,
         491652,   90964,  317771,  135554,  320146,   53682,   12146,  671038,
         711547,  339431,   97088,  421241,  170311,  264502,   64217,  137060,
         637587,  617922,  680025, 1003297,  558987,  186512,  935401,  703186])
input_start_idx=tensor([  6625, 564559, 505874,  32958, 337818, 632596, 678822, 288075, 297793,
        325242, 996241,  92067, 458510, 134853, 946627, 926964, 241163, 736501,
        954914, 580087,    810, 508181, 206606, 482121, 293367, 453614, 305920,
        928539, 638353, 839953, 219038, 186070])
input_start_idx=tensor([885955, 948045, 200154, 99062

In [60]:
# Generate some new tokens with the trained model. Since it's a simple bigram model the
# result will still be pretty bad, but it should already be a lot better than the random
# output we got before training.
# The bigram model only looks at the last token to predict the next one. What we want
# is to look at the entire context (multiple tokens strung together) to predict the next token.
generated = model.generate(idx, max_new_tokens=300)
new_tokens = generated[0].tolist()
print(f"Generated new tokens:\n{decode(new_tokens)}")


Generated new tokens:


y, s t sthitathy hf womiay netinpreremarer t,
FReymar t, wifolin.

Thevio wous har hilonto thomand an w h witis, me
Dyoverdoiso
JD per onondwhen;
Ans KI andatald notRTuctarvefre me for eitthay verl fat,


A: ot bele er; Buco,
L:
NUE sq sethaittthy yayit tifod rer; y e ined guratosoulyequg.
BUEEd ta


# At this point I have transferred the above code to bigram.py. Video timestamp: approximately 38:00

https://youtu.be/kCc8FmEb1nY?t=2328