# Build GPT style transformer from scratch
1. This time just using the Karpathy codebase as a guide, not following it step by step

In [1]:
%load_ext autoreload
%autoreload 2

In [10]:
import pandas as pd
import numpy as np
import polars as pl
import os
from pathlib import Path

from trav_gpt import ROOT_DIR


In [4]:
from hydra import compose, initialize
from omegaconf import DictConfig, OmegaConf

In [7]:
with initialize(config_path="../conf", version_base=None):
    cfg = compose(config_name="config")
cfg.paths.root = ROOT_DIR

In [8]:
cfg.paths

{'root': '/Users/traviswhitfield/Documents/github/trav_gpt', 'data': '${paths.root}/data', 'external': '${paths.data}/external', 'interim': '${paths.data}/interim', 'processed': '${paths.data}/processed', 'raw': '${paths.data}/raw'}

In [9]:
cfg.paths.external

'/Users/traviswhitfield/Documents/github/trav_gpt/data/external'

In [11]:
text_path = Path(cfg.paths.external) / 'input.txt'

with open(text_path, 'r') as f:
    text = f.read()

In [13]:
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


# Tokenizer

In [None]:
from 



In [18]:
text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [67]:
tokenizer = CharTokenizer()
tokenizer.fit(text)

In [68]:
tokenizer.vocab_size

65

In [31]:
tokenizer.encode(text[:100])

[18,
 47,
 56,
 57,
 58,
 1,
 15,
 47,
 58,
 47,
 64,
 43,
 52,
 10,
 0,
 14,
 43,
 44,
 53,
 56,
 43,
 1,
 61,
 43,
 1,
 54,
 56,
 53,
 41,
 43,
 43,
 42,
 1,
 39,
 52,
 63,
 1,
 44,
 59,
 56,
 58,
 46,
 43,
 56,
 6,
 1,
 46,
 43,
 39,
 56,
 1,
 51,
 43,
 1,
 57,
 54,
 43,
 39,
 49,
 8,
 0,
 0,
 13,
 50,
 50,
 10,
 0,
 31,
 54,
 43,
 39,
 49,
 6,
 1,
 57,
 54,
 43,
 39,
 49,
 8,
 0,
 0,
 18,
 47,
 56,
 57,
 58,
 1,
 15,
 47,
 58,
 47,
 64,
 43,
 52,
 10,
 0,
 37,
 53,
 59]

# Load the data into a tensor

In [32]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:
data = torch.tensor(tokenizer.encode(text), dtype=torch.long)

TRAIN_RATIO = 0.9
n = int(TRAIN_RATIO * len(data))

# Split the data first into the train and test datasets
# There's certainly a better way of doing this with textual data, but we'll do it like this for now.
train = data[:n]
test = data[n:]

In [40]:
len(data), len(train), len(test)

(1115394, 1003854, 111540)

In [None]:
# So the batches of data that I need should be in a single tensor object. They should just be <batch_size> different sets of text of <context_size> length
# So I'll just randomly sample starting points in my giant, tokenized dataset and then grab the appropriate length vector from each of those locations
# and stack them together. 

In [60]:
def get_batch(split, context_size = 8, batch_size = 4):
    """This will convert the input tensor (of the whole text) into the appropriate 
    inputs and target labels (x and y)
    """
    
    data = train if split == 'train' else test

    # Grab the starting points. This returns a tensor of shape (batch_size,)
    ix = torch.randint(0, len(data) - context_size - 1, (batch_size, ))

    # Once I've grabbed those starting points, then I need to just grab the contexts associated with
    # each one (and also the targets, which will be shifted over by 1)
    x = torch.stack([data[ix[i]: ix[i] + context_size] for i in range(batch_size)])
    y = torch.stack([data[ix[i] + 1: ix[i] + context_size + 1] for i in range(batch_size)])

    return x, y

In [61]:
context_size = 8
batch_size = 4


x, y = get_batch('train')

# Initial network
1. Let's start with just a simple multilayer perceptron (i.e. fully connected feedforward network)
    - Can I just passed the tokenized inputs into this? It seems like that should work right?
    - I can do that as long as I only pass in one input at a time I guess. 
    - 

In [107]:
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size, embed_size = 10):
        super().__init__()

        # The embedding dim needs to be the same size as the vocab, because that's the
        # output of this step. It should output the logit associated with each possible
        # character. 

        # If I wanted to use a different embedding dimension, then I'd need to first
        # embed the characters to that dimension, then have an additional step which
        # generates the output logits associated with each character.
        self.token_embedding_table = nn.Embedding(num_embeddings = vocab_size,
                                                  embedding_dim = embed_size)
        
        self.fc1 = nn.Linear(embed_size, vocab_size)

    def forward(self, x, targets = None):

        logits = self.token_embedding_table(x) # (B,T,E)
        logits = self.fc1(logits) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            
            # To calculate the loss across the whole batch, we just reshape the 
            # logits such that the batches are basically combined. Then we calculate the
            # loss on each of the individual token predictions. 
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    
    def generate(self, idx, max_new_tokens = 50):

        for _ in range(max_new_tokens):
            logits, loss = self(idx) # (B,T,C) where B = batch size, T = context size, C = vocabulary size
            
            logits = logits[:, -1, :] # becomes (B, C)

            probs = F.softmax(logits, dim=-1) # Perform softmax on the C dimension

            idx_next = torch.multinomial(probs, num_samples=1) # (B,1)

            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)

        return idx


In [None]:
EVAL_ITERS = 200

@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(EVAL_ITERS)
        for k in range(EVAL_ITERS):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [None]:
EVAL_ITERS = 200
LEARNING_RATE = 1e-2
MAX_ITERS = 3000
EMBED_SIZE = 10
EVAL_INTERVAL = 300

model = BigramLanguageModel(vocab_size=tokenizer.vocab_size, embed_size=EMBED_SIZE)

optimizer = torch.optim.AdamW(model.parameters(), lr = LEARNING_RATE)



for iter in range(MAX_ITERS):

    if iter % EVAL_ITERS == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, test loss {losses['val']:.4f}")
    
    xb, yb = get_batch('train')

    logits, loss = model(xb, yb)
    optimizer.zero_grad() # zero out the previous gradients
    loss.backward() # Backpropagate the loss through the NN
    optimizer.step() # Update the model parameters using those gradients

step 0: train loss 4.3124, test loss 4.2986
step 200: train loss 2.8225, test loss 2.8457
step 400: train loss 2.6570, test loss 2.6771
step 600: train loss 2.6349, test loss 2.6022
step 800: train loss 2.5878, test loss 2.6145
step 1000: train loss 2.6187, test loss 2.6179
step 1200: train loss 2.5668, test loss 2.6191
step 1400: train loss 2.5807, test loss 2.5988
step 1600: train loss 2.5414, test loss 2.5583
step 1800: train loss 2.5630, test loss 2.5593
step 2000: train loss 2.5758, test loss 2.5694
step 2200: train loss 2.5320, test loss 2.5770
step 2400: train loss 2.5686, test loss 2.5289
step 2600: train loss 2.5492, test loss 2.5800
step 2800: train loss 2.5807, test loss 2.5579


In [114]:
output = model.generate(torch.zeros((1,1), dtype=torch.long))[0].tolist()

print(tokenizer.decode(output))


I litak g, titovyolofad, s!? nowanthamere; spiarar
