# GPT playground

Implementation of a decoder-only transformer with self-attention as a basic language model.

Following the tutorial Let's build GPT, from scratch: https://youtu.be/kCc8FmEb1nY?si=0wP9Tk7SHDYwhlqh

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import torch
import torch.nn as nn
from torch.nn import functional as F
import logging

In [2]:
# Send INFO-and-above records to the root handler, fix the RNG seed so
# sampling is reproducible, and create this notebook's logger.
logging.basicConfig(level=logging.INFO)
torch.manual_seed(1337)
logger = logging.getLogger(__name__)

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cpu'

In [4]:
# Hyperparameters
batch_size = 4         # number of sequences processed in parallel
block_size = 8         # maximum context length
max_iters = 10000
eval_interval = 500    # the training loop reports losses every this many steps
eval_iters = 500       # number of batches averaged per loss estimate
lr = 0.001

In [5]:
imput_path = './data/input.txt'
if not os.path.exists('./data/input.txt'):
    !wget -P ./data https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [6]:
# Read the whole corpus into memory as one string.
with open('./data/input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [7]:
# Report the corpus size in characters and preview its first 100 characters.
print(f"length of dataset in characters: \n{len(text)}\n")
print(f"Sample text: \n{text[:100]}")

length of dataset in characters: 
1115394

Sample text: 
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [8]:
# Character-level vocabulary: every distinct character in the text, in sorted
# order so that the character -> index assignment is deterministic.
chars = sorted(set(text))  # sorted() accepts any iterable; no intermediate list needed
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [9]:
# simple tokenization: a map from characters to integers and back
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of *s*.

    Raises KeyError if *s* contains a character that is not in ``chars``.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids *l*.

    Raises KeyError if an id is not a key of ``itos``.
    """
    return ''.join(itos[i] for i in l)


# round-trip sanity check
print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [10]:
# Tokenize the full corpus into a 1-D tensor of 64-bit integer ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
# Show the first 15 ids next to the text they decode to.
print(data[:15], decode(data[:15].tolist()))

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0]) First Citizen:



In [11]:
# Train on the first 90% of the tokens; hold out the remaining 10% for validation.
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

In [12]:
# One chunk of block_size + 1 tokens: block_size inputs plus the extra token
# that serves as the target for the last position.
print(train_data[:block_size+1])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])


In [13]:
# A single chunk yields block_size training examples: for every position t the
# model sees x[:t+1] as context and must predict y[t], the token that follows.
x, y = train_data[:block_size], train_data[1:block_size+1]
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f"context = {context}, target = {target}")

context = tensor([18]), target = 47
context = tensor([18, 47]), target = 56
context = tensor([18, 47, 56]), target = 57
context = tensor([18, 47, 56, 57]), target = 58
context = tensor([18, 47, 56, 57, 58]), target = 1
context = tensor([18, 47, 56, 57, 58,  1]), target = 15
context = tensor([18, 47, 56, 57, 58,  1, 15]), target = 47
context = tensor([18, 47, 56, 57, 58,  1, 15, 47]), target = 58


In [14]:
def get_batch(split):
    """Sample a random batch of (input, target) sequences.

    ``split == "train"`` draws from ``train_data``; any other value draws from
    ``val_data``. Returns two ``(batch_size, block_size)`` tensors moved to
    ``device``; the targets are the inputs shifted one position to the right.
    """
    source = val_data if split != "train" else train_data
    # random start offsets, chosen so that the shifted target window still fits
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

In [15]:
# Draw one training batch and inspect it: yb holds the same windows as xb,
# shifted one token to the right.
xb , yb = get_batch("train")
print(f"input shape = {xb.shape}")
print(f"input = {xb}")
print(f"target shape = {yb.shape}")
print(f"target = {yb}")

input shape = torch.Size([4, 8])
input = tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
target shape = torch.Size([4, 8])
target = tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [16]:
# Log every (context, target) training example contained in the batch.
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        # Only format the tensors when DEBUG records would actually be emitted;
        # an f-string argument is otherwise built and then thrown away.
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f"Context = {context}, target = {target}")

In [17]:
class BigramLanguageModel(nn.Module):
    """
    Bigram language model: the logits for the next token are read from a
    lookup table indexed by the current token only.

    Shape notation used in the comments:
    B: batch_size
    T: time: block_size
    C: channel or number of classification classes: vocab_size
    """
    def __init__(self, vocab_size):
        super().__init__()
        # each token reads the logits for the next token
        # from a torch lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """
        Compute next-token logits and, when targets are given, the loss.

        idx and targets are both (B, T) shape integer tensors.

        Returns (logits, loss):
            targets is None: logits has shape (B, T, C) and loss is None.
            otherwise: logits is flattened to (B*T, C) and loss is the
                cross-entropy between those logits and the flattened targets.
        """
        logits = self.token_embedding_table(idx) # (B, T, C)

        if targets is None:
            return logits, None

        # F.cross_entropy expects the class dimension second, so flatten
        # batch and time together: logits (B*T, C), targets (B*T,)
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph
    def generate(self, idx, max_tokens):
        """
        Generates (B, T+1), (B, T+2), ..., (B, T+max_tokens)
            with the next characters generated given the context

        To be able to generate text using the model
        idx: the current context array of characters in a batch of size (B, T)

        Returns the (B, T+max_tokens) tensor of the original context followed
        by the sampled tokens.
        """
        for _ in range(max_tokens):
            # get the predictions (no targets, so the loss is None and ignored)
            logits, _ = self(idx) # (B, T, C)

            # last time step is the predictor of the next character
            # since this is a bigram model
            logits = logits[:, -1, :] # (B, C)

            # calculate the probabilities along the channel
            probs = F.softmax(logits, dim=-1) # (B, C)

            # sample from the probability distribution
            id_next = torch.multinomial(probs, num_samples=1) # (B, 1)

            # append sampled index to the running sequence
            # along the time dimension
            idx = torch.cat((idx, id_next), dim=1) # (B, T+1)

        return idx

In [18]:
# Instantiate the model and move it to the selected device.
# nn.Module.to() returns the module itself, so `m` and `model` name the same object.
model = BigramLanguageModel(vocab_size=vocab_size)
m = model.to(device)

# Forward pass on one batch; because targets are supplied, the returned
# logits are flattened to (B*T, C).
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

torch.Size([32, 65])
tensor(5.0364, grad_fn=<NllLossBackward0>)


In [19]:
# expected initial loss of untrained network:
# the cross-entropy of a uniform prediction over vocab_size classes, -ln(1/vocab_size)
- np.log(1/vocab_size)

4.174387269895637

In [20]:
# Untrained results
# Seed generation with a single token of id 0. The tensor is created on the
# same device as the model; a CPU tensor fails once the model lives on CUDA.
idx = torch.zeros((1, 1), dtype=torch.int64, device=device)  # (B=1, T=1)
idx = m.generate(idx, max_tokens=10)  # (B=1, T=1+max_tokens)
print(decode(idx[0].tolist()))


l-QYjt'CL?


In [21]:
@torch.no_grad()
def estimate_loss():
    """Average the loss over ``eval_iters`` random batches for each split.

    Puts ``model`` into eval mode while measuring and back into train mode
    afterwards. Returns a dict with keys "train" and "val" whose values are
    the mean losses (0-dim tensors).
    """
    model.eval()
    averages = {}

    for split_name in ("train", "val"):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, labels = get_batch(split_name)
            _, batch_loss = model(inputs, labels)
            batch_losses[step] = batch_loss.item()
        averages[split_name] = batch_losses.mean()

    model.train()
    return averages

In [22]:
# Adam optimizer over all model parameters, using the learning rate `lr`.
optimizer = torch.optim.Adam(m.parameters(), lr=lr)

In [23]:
# Train for a fixed number of steps, reporting the estimated train/val loss
# every eval_interval steps.
_max_iters = 5000
# `step` rather than `iter`: binding `iter` at top level would shadow the
# builtin iter() for every later cell.
for step in range(_max_iters):
    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch("train")

    # evaluate the loss
    logits, loss = m(xb, yb)

    # zero out the grads
    optimizer.zero_grad(set_to_none=True)

    # backprop
    loss.backward()

    # take an optimization step
    optimizer.step()

# loss of the last training batch (a single noisy sample, not an average)
print(loss.item())

step 0: train loss 4.6399, val loss 4.6334
step 500: train loss 4.2926, val loss 4.3166
step 1000: train loss 4.0081, val loss 4.0004
step 1500: train loss 3.7539, val loss 3.7596
step 2000: train loss 3.5190, val loss 3.5528
step 2500: train loss 3.3451, val loss 3.3689
step 3000: train loss 3.2006, val loss 3.2384
step 3500: train loss 3.0891, val loss 3.1072
step 4000: train loss 2.9763, val loss 2.9871
step 4500: train loss 2.8955, val loss 2.9154
3.0494678020477295


In [24]:
# Trained results
# Seed generation with a single token of id 0, created on the model's device.
idx = torch.zeros((1,1), dtype=torch.int64, device=device) # (B=1, T=1)
idx = m.generate(idx, max_tokens=500) # (B=1, T=1+max_tokens)
print(decode(idx[0].tolist()))


Bfrar,
FLnqMil-GERg veim TwXisher gerARSrou,OMWt me,e:PHM-Smerer kwaL,,jGPy wowamstes zns,
Allomu?'qacn pr jGGH.
;'I'

Unq$d sugg
PjTEDis,-acZJX

WtZzhooo,OfJeder m bonk?RKEFYvinud mumanst!qThisl?MBGEY are mm;tdoq$CIF?nOdekviwncethe.eshamy fo lllexvin;IKkg,-kl;d.xermsur d'3g-spXchacrry ardk'm'Ty
A sqZl m?p a aXm her pr: wPUEDm;.lilfromj;As
BR
Tl!wprPWI'G iRuh iminifkltsf
HHENUrssnohoMxJqRTRle fisthFgoAll.
w
THY doCA,
LRM-a-Thnasi'Wptoc-GBuido'm I tst o h tunckxy t:,
LLf.
XGbemyed-$OMEhiBENILRfgy
