# Build GPT style transformer from scratch
1. This time just using the Karpathy codebase as a guide, not following it step by step

In [1]:
%load_ext autoreload
%autoreload 2

In [10]:
import pandas as pd
import numpy as np
import polars as pl
import os
from pathlib import Path

from trav_gpt import ROOT_DIR


In [4]:
from hydra import compose, initialize
from omegaconf import DictConfig, OmegaConf

In [7]:
with initialize(config_path="../conf", version_base=None):
    cfg = compose(config_name="config")
cfg.paths.root = ROOT_DIR

In [8]:
cfg.paths

{'root': '/Users/traviswhitfield/Documents/github/trav_gpt', 'data': '${paths.root}/data', 'external': '${paths.data}/external', 'interim': '${paths.data}/interim', 'processed': '${paths.data}/processed', 'raw': '${paths.data}/raw'}

In [9]:
cfg.paths.external

'/Users/traviswhitfield/Documents/github/trav_gpt/data/external'

In [11]:
text_path = Path(cfg.paths.external) / 'input.txt'

with open(text_path, 'r') as f:
    text = f.read()

In [13]:
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


# Tokenizer

In [None]:
from trav_gpt.gpt2 import CharTokenizer, get_batch



In [18]:
text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [67]:
tokenizer = CharTokenizer()
tokenizer.fit(text)

In [68]:
tokenizer.vocab_size

65

In [31]:
tokenizer.encode(text[:100])

[18,
 47,
 56,
 57,
 58,
 1,
 15,
 47,
 58,
 47,
 64,
 43,
 52,
 10,
 0,
 14,
 43,
 44,
 53,
 56,
 43,
 1,
 61,
 43,
 1,
 54,
 56,
 53,
 41,
 43,
 43,
 42,
 1,
 39,
 52,
 63,
 1,
 44,
 59,
 56,
 58,
 46,
 43,
 56,
 6,
 1,
 46,
 43,
 39,
 56,
 1,
 51,
 43,
 1,
 57,
 54,
 43,
 39,
 49,
 8,
 0,
 0,
 13,
 50,
 50,
 10,
 0,
 31,
 54,
 43,
 39,
 49,
 6,
 1,
 57,
 54,
 43,
 39,
 49,
 8,
 0,
 0,
 18,
 47,
 56,
 57,
 58,
 1,
 15,
 47,
 58,
 47,
 64,
 43,
 52,
 10,
 0,
 37,
 53,
 59]

# Load the data into a tensor

In [32]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:
data = torch.tensor(tokenizer.encode(text), dtype=torch.long)

TRAIN_RATIO = 0.9
n = int(TRAIN_RATIO * len(data))

# Split the data first into the train and test datasets
# There's certainly a better way of doing this with textual data, but we'll do it like this for now.
train = data[:n]
test = data[n:]

In [40]:
len(data), len(train), len(test)

(1115394, 1003854, 111540)

In [None]:
# So the batches of data that I need should be in a single tensor object. They should just be <batch_size> different sets of text of <context_size> length
# So I'll just randomly sample starting points in my giant, tokenized dataset and then grab the appropriate length vector from each of those locations
# and stack them together. 

In [61]:
context_size = 8
batch_size = 4


x, y = get_batch('train')

# Initial network
1. Let's start with just a simple multilayer perceptron (i.e. fully connected feedforward network)
    - Can I just passed the tokenized inputs into this? It seems like that should work right?
    - I can do that as long as I only pass in one input at a time I guess. 
    - 

In [107]:
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size, embed_size = 10):
        super().__init__()

        # The embedding dim needs to be the same size as the vocab, because that's the
        # output of this step. It should output the logit associated with each possible
        # character. 

        # If I wanted to use a different embedding dimension, then I'd need to first
        # embed the characters to that dimension, then have an additional step which
        # generates the output logits associated with each character.
        self.token_embedding_table = nn.Embedding(num_embeddings = vocab_size,
                                                  embedding_dim = embed_size)
        
        self.fc1 = nn.Linear(embed_size, vocab_size)

    def forward(self, x, targets = None):

        logits = self.token_embedding_table(x) # (B,T,E)
        logits = self.fc1(logits) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            
            # To calculate the loss across the whole batch, we just reshape the 
            # logits such that the batches are basically combined. Then we calculate the
            # loss on each of the individual token predictions. 
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    
    def generate(self, idx, max_new_tokens = 50):

        for _ in range(max_new_tokens):
            logits, loss = self(idx) # (B,T,C) where B = batch size, T = context size, C = vocabulary size
            
            logits = logits[:, -1, :] # becomes (B, C)

            probs = F.softmax(logits, dim=-1) # Perform softmax on the C dimension

            idx_next = torch.multinomial(probs, num_samples=1) # (B,1)

            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)

        return idx


In [None]:
EVAL_ITERS = 200

@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(EVAL_ITERS)
        for k in range(EVAL_ITERS):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [None]:
EVAL_ITERS = 200
LEARNING_RATE = 1e-2
MAX_ITERS = 3000
EMBED_SIZE = 10
EVAL_INTERVAL = 300

model = BigramLanguageModel(vocab_size=tokenizer.vocab_size, embed_size=EMBED_SIZE)

optimizer = torch.optim.AdamW(model.parameters(), lr = LEARNING_RATE)



for iter in range(MAX_ITERS):

    if iter % EVAL_ITERS == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, test loss {losses['val']:.4f}")
    
    xb, yb = get_batch('train')

    logits, loss = model(xb, yb)
    optimizer.zero_grad() # zero out the previous gradients
    loss.backward() # Backpropagate the loss through the NN
    optimizer.step() # Update the model parameters using those gradients

step 0: train loss 4.3124, test loss 4.2986
step 200: train loss 2.8225, test loss 2.8457
step 400: train loss 2.6570, test loss 2.6771
step 600: train loss 2.6349, test loss 2.6022
step 800: train loss 2.5878, test loss 2.6145
step 1000: train loss 2.6187, test loss 2.6179
step 1200: train loss 2.5668, test loss 2.6191
step 1400: train loss 2.5807, test loss 2.5988
step 1600: train loss 2.5414, test loss 2.5583
step 1800: train loss 2.5630, test loss 2.5593
step 2000: train loss 2.5758, test loss 2.5694
step 2200: train loss 2.5320, test loss 2.5770
step 2400: train loss 2.5686, test loss 2.5289
step 2600: train loss 2.5492, test loss 2.5800
step 2800: train loss 2.5807, test loss 2.5579


In [114]:
output = model.generate(torch.zeros((1,1), dtype=torch.long))[0].tolist()

print(tokenizer.decode(output))


I litak g, titovyolofad, s!? nowanthamere; spiarar


In [115]:
model.register_buffer?

[0;31mSignature:[0m
[0mmodel[0m[0;34m.[0m[0mregister_buffer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mname[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtensor[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpersistent[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Add a buffer to the module.

This is typically used to register a buffer that should not to be
considered a model parameter. For example, BatchNorm's ``running_mean``
is not a parameter, but is part of the module's state. Buffers, by
default, are persistent and will be saved alongside parameters. This
behavior can be changed by setting :attr:`persistent` to ``False``. The
only difference between a persistent buffer and a non-persistent buffer
is that the latter will no

In [116]:
nn.Transformer?

[0;31mInit signature:[0m
[0mnn[0m[0;34m.[0m[0mTransformer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0md_model[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m512[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnhead[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m8[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnum_encoder_layers[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m6[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnum_decoder_layers[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m6[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdim_feedforward[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m2048[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdropout[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m0.1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mactivation[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mCallable[0m[0;34m[[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m[0;34m,[0m [0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m]

In [117]:
nn.TransformerDecoder?

[0;31mInit signature:[0m
[0mnn[0m[0;34m.[0m[0mTransformerDecoder[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdecoder_layer[0m[0;34m:[0m [0;34m'TransformerDecoderLayer'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnum_layers[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnorm[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mnn[0m[0;34m.[0m[0mmodules[0m[0;34m.[0m[0mmodule[0m[0;34m.[0m[0mModule[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
TransformerDecoder is a stack of N decoder layers.

.. note::
    See `this tutorial <https://pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`_
    for an in depth discussion of the performant building blocks PyTorch offers for building your own
    transformer layers.

Args:
    decoder_layer: an instance of the TransformerDecoderLay

In [118]:
nn.TransformerDecoderLayer?

[0;31mInit signature:[0m
[0mnn[0m[0;34m.[0m[0mTransformerDecoderLayer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0md_model[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnhead[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdim_feedforward[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m2048[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdropout[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m0.1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mactivation[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mCallable[0m[0;34m[[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m[0;34m,[0m [0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;34m<[0m[0mfunction[0m [0mrelu[0m [0mat[0m [0;36m0x166c514e0[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlayer_norm_eps[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m1e-05[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    

In [119]:
nn.ModuleList?

[0;31mInit signature:[0m
[0mnn[0m[0;34m.[0m[0mModuleList[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mmodules[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mIterable[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mnn[0m[0;34m.[0m[0mmodules[0m[0;34m.[0m[0mmodule[0m[0;34m.[0m[0mModule[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Holds submodules in a list.

:class:`~torch.nn.ModuleList` can be indexed like a regular Python list, but
modules it contains are properly registered, and will be visible by all
:class:`~torch.nn.Module` methods.

Args:
    modules (iterable, optional): an iterable of modules to add

Example::

    class MyModule(nn.Module):
        def __init__(self) -> None:
            super().__init__()
            self.linears = nn.ModuleList([nn.Linear(10, 10) for i in range(10)])

        def forward(self, x):
      

In [120]:
nn.Linear?


[0;31mInit signature:[0m
[0mnn[0m[0;34m.[0m[0mLinear[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0min_features[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mout_features[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbias[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdevice[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Applies an affine linear transformation to the incoming data: :math:`y = xA^T + b`.

This module supports :ref:`TensorFloat32<tf32_on_ampere>`.

On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

Args:
    in_features: size of each input sample
    out_features: size of each output sample
    bias:

In [7]:
import torch
import torch.nn as nn

In [1]:
from trav_gpt.gpt2 import GPTLanguageModel

In [3]:
model = GPTLanguageModel(65)

In [4]:
print(model)

GPTLanguageModel(
  (token_embedding_table): Embedding(65, 32)
  (position_embedding_table): Embedding(8, 32)
  (blocks): Sequential(
    (0): Block(
      (mha): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x Head(
            (key): Linear(in_features=32, out_features=8, bias=False)
            (query): Linear(in_features=32, out_features=8, bias=False)
            (value): Linear(in_features=32, out_features=8, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=32, out_features=32, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffwd): FeedForward(
        (ffwd): Sequential(
          (0): Linear(in_features=32, out_features=128, bias=True)
          (1): ReLU()
          (2): Linear(in_features=128, out_features=32, bias=True)
        )
      )
    )
    (1): Block(
      (mha): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x Head(
        

In [None]:
class GPTFromPytorch(nn.Module):
    
    def __init__(self):
        super().__init__()

        

In [9]:
nn.TransformerDecoderLayer?

[0;31mInit signature:[0m
[0mnn[0m[0;34m.[0m[0mTransformerDecoderLayer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0md_model[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnhead[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdim_feedforward[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m2048[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdropout[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m0.1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mactivation[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mCallable[0m[0;34m[[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m[0;34m,[0m [0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;34m<[0m[0mfunction[0m [0mrelu[0m [0mat[0m [0;36m0x3104b2d40[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlayer_norm_eps[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m1e-05[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    

In [10]:
torch.chunk?

[0;31mDocstring:[0m
chunk(input: Tensor, chunks: int, dim: int = 0) -> Tuple[Tensor, ...]

Attempts to split a tensor into the specified number of chunks. Each chunk is a view of
the input tensor.


.. note::

    This function may return fewer than the specified number of chunks!

.. seealso::

    :func:`torch.tensor_split` a function that always returns exactly the specified number of chunks

If the tensor size along the given dimension :attr:`dim` is divisible by :attr:`chunks`,
all returned chunks will be the same size.
If the tensor size along the given dimension :attr:`dim` is not divisible by :attr:`chunks`,
all returned chunks will be the same size, except the last one.
If such division is not possible, this function may return fewer
than the specified number of chunks.

Arguments:
    input (Tensor): the tensor to split
    chunks (int): number of chunks to return
    dim (int): dimension along which to split the tensor

Example:
    >>> torch.arange(11).chunk(6)
    (tenso

In [20]:
import torch
import torch.nn as nn
import time

embed_dim = 768
batch_size = 16
seq_len = 128

x = torch.randn(batch_size, seq_len, embed_dim)

# Single projection (efficient way)
class SingleProjection(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.qkv_proj = nn.Linear(embed_dim, 3 * embed_dim)
    
    def forward(self, x):
        qkv = self.qkv_proj(x)  # One matrix multiplication
        return torch.chunk(qkv, 3, dim=-1)  # Split into Q, K, V

# Separate projections (inefficient way)
class SeparateProjections(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
    
    def forward(self, x):
        q = self.q_proj(x)  # Three separate matrix multiplications
        k = self.k_proj(x)
        v = self.v_proj(x)
        return q, k, v

single_proj_model = SingleProjection(embed_dim)
separate_proj_model = SeparateProjections(embed_dim)

# Measure execution time
start_time = time.time()
q, k, v = single_proj_model(x)
print(f"Single Projection Time: {time.time() - start_time:.6f} sec")

start_time = time.time()
q, k, v = separate_proj_model(x)
print(f"Separate Projections Time: {time.time() - start_time:.6f} sec")

Single Projection Time: 0.008170 sec
Separate Projections Time: 0.006761 sec
