# shakespeare_GPT

Building Andrej's Shakespeare_GPT from scratch

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337)

<torch._C.Generator at 0x7a13a2e15b50>

In [1]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-10-13 17:44:50--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-10-13 17:44:50 (46.6 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [2]:
# read it in to inspect it
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [4]:
# let's look at the first 1000 characters
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [5]:
# All unique characters that appear in text (vocab)
chars = list(sorted(set(text)))
vocab_size = len(chars)

print("".join(chars))
print("vocab_size: ", vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab_size:  65


In [6]:
# create a mapping from characters to integers
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for ch, i in stoi.items()}
encode = lambda s: [stoi[c] for c in s] # encoder: take a string and output list of integers
decode = lambda l: "".join([itos[i] for i in l])

print(encode("O'hoy there pirate!"))
print(decode(encode("O'hoy there pirate!")))

[27, 5, 46, 53, 63, 1, 58, 46, 43, 56, 43, 1, 54, 47, 56, 39, 58, 43, 2]
O'hoy there pirate!


In [7]:
# Let's encode (tokenize) the entire dataset
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) # same characters we looked at above

NameError: name 'torch' is not defined

In [8]:
# train/test - split
n = int(0.9*len(data))
train = data[:n]
val = data[n:]

NameError: name 'data' is not defined

In [9]:
block_size = 8
train[:block_size + 1]

NameError: name 'train' is not defined

In [10]:
x = train[:block_size]
y = train[1:block_size + 1]
for t in range(block_size):
  context = x[:t+1]
  target = y[t]
  print(f"when input is {context.tolist()}, target is: {target}")

when input is [18], target is: 47
when input is [18, 47], target is: 56
when input is [18, 47, 56], target is: 57
when input is [18, 47, 56, 57], target is: 58
when input is [18, 47, 56, 57, 58], target is: 1
when input is [18, 47, 56, 57, 58, 1], target is: 15
when input is [18, 47, 56, 57, 58, 1, 15], target is: 47
when input is [18, 47, 56, 57, 58, 1, 15, 47], target is: 58


In [12]:
# ix = torch.randint(len(data) - block_size, (batch_size,))
# xs = [data[i:i+block_size].tolist() for i in ix]

# for i,x in enumerate(xs):
#   for j,char in enumerate(x):
#     char = itos[char]
#     # print(i,j,char)



In [13]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in paralell (B)
block_size = 8 # What is the maximum length of said sequences (T)
vocab_size = vocab_size # Number of unique tokens (C)

def get_batch(split):
  """Generate a small batch of data with inputs:x and outputs:y"""
  data = train if split == "train" else val
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  return x,y

xb, yb = get_batch("train")

print("inputs:")
print(xb.shape)
print(xb)
print("targets:")
print(yb.shape)
print(yb)

print("-----")

for b in range(batch_size): # batch dimension
  for t in range(block_size): # time dimension
    context = xb[b, :t+1]
    target = yb[b,t]
    print(f"when input is: {context.tolist()}, target is: {target}")


inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
-----
when input is: [24], target is: 43
when input is: [24, 43], target is: 58
when input is: [24, 43, 58], target is: 5
when input is: [24, 43, 58, 5], target is: 57
when input is: [24, 43, 58, 5, 57], target is: 1
when input is: [24, 43, 58, 5, 57, 1], target is: 46
when input is: [24, 43, 58, 5, 57, 1, 46], target is: 43
when input is: [24, 43, 58, 5, 57, 1, 46, 43], target is: 39
when input is: [44], target is: 53
when input is: [44, 53], target is: 56
when input is: [44, 53, 56], target is: 1
when input is: [44, 53, 56, 1], target is: 58
when input is: [44, 53, 56, 1, 58], target is: 46
when in

In [14]:
import numpy as np
expected_loss = np.log(1/65)
print(expected_loss)

-4.174387269895637


In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    # each token directly reads off the logits for the next token from a lookup table
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):

    # idx and targets are both (B,T) tensors of integers
    logits = self.token_embedding_table(idx) # (B,T,C)

    if targets is None:
        loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B,T) array of indicies in the current context
    for _ in range(max_new_tokens):
      # Get predictions
      logits, loss = self(idx)
      # focus only on the last time step
      logits = logits[:, -1, :] # Becomes (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=1) # (B,C)
      # Sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
      # append sampled index to running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [16]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [17]:
batch_size = 32
for steps in range(100000):

  # sample batch of data
  xb, yb = get_batch('train')

  # evaluate loss
  logits, loss = m(xb,yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

print(loss.item())

2.404492139816284


In [19]:
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=1000)[0].tolist()))



K: y sooninde, thind w:
Qurrsad muee?
BYoverd le
ANVINThaich an, Se dife bl, t hino l. ghat aven's awis.
Wh malloo be ndema pusthr
WADINEDom osw sale t ve,
IR:
Whwarr cha mandu d tours nofok s t f is fo wighathalt
SSis.
WAsper;
ce g CHA:
Whamasomasthens Joredasthereath istor itoutourtoumervesofechespeaksts
I the'tou n:
ARThur o ay
A:
HELARindend s, foiowlonifene m atho pl:
ler:
Y ar.
Thicheangoond ybowim yol ar;

METHithe mit RYisartind alinthle lo bl me witoondow mine l'd brerthaved fo oout?
Wheis Merorurexetou tie, oucithtolorstharveang u thime u recesuncavends car t:
GREE:
tyod bjoouistour. yofo ices nthedilouns INAn fanchanie bet de sous ourn

Nowaly list mef, itod sin o, thicot hou won!
SOMAnsou we hasu tee, nthifouputhoncous bbrd ay iny po way pis t, IZAnd au f whano dasp,
And ghe mard t.
I'be tsivouryortthe sepust hesapred re ome; hithe s teaprrehey whime: s, nsststhinofrstotheley ctofar;
CINI ppl mive IInddn, chmy. s c;
D:
BRI mit EDathaicouso he He ngou tofon thino l bo n,
T

## The mathematical trick in self-attention

In [73]:
# Consider the following example

torch.manual_seed(1337)

B,T,C = 4, 8, 2 # batch, time, channels (batch, context_length, vocab_size)
x = torch.randn(B,T,C)
x.shape


torch.Size([4, 8, 2])

In [23]:
# Version 1
# We want x[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros((B,T,C))
for b in range(B):
  for t in range(T):
    xprev = x[b,:t+1] # (t,C)
    xbow[b,t] = torch.mean(xprev, 0)

In [83]:
# Version 2
wei = torch.tril(torch.ones(T,T))
wei = wei / torch.sum(wei, 1, keepdim=True)
xbow2 = wei @ x # ((B), T, T) @ (B, T, C) ----> (B, T, C)
torch.allclose(xbow, xbow2, rtol=1e-5, atol=1e-5)

True

In [102]:
# Version 3: using softmax
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3, rtol=1e-5, atol=1e-5)

True

In [107]:
# Version 4: self-attention!!
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C)

# a single head performing self-attention
head_size = 16
query = nn.Linear(C, head_size, bias=False)
key = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
q = query(x) # (B,T,16)
k = key(x)   # (B,T,16)
wei = q @ k.transpose(-2,-1) # (B,T,16) @ (B,16,T) --->  (B,T,T)

tril = torch.tril(torch.ones(T,T))
# wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v
# out = wei @ x

out.shape

torch.Size([4, 8, 16])

**Notes:**
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Each example across batch dimension is of course processed completely independently and never "talk" to each other
- In an "encoder" attention block just delete the single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- "Scaled" attention additional divides `wei` by $\frac{1}{\sqrt{head\_size}}$. This makes it so when input Q,K are unit variance, wei will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below

In [114]:
k = torch.randn(B,T,head_size)
q = torch.randn(B,T,head_size)
wei = q @ k.transpose(-2,-1) * head_size**-0.5

In [115]:
k.var()

tensor(1.0966)

In [116]:
q.var()

tensor(0.9416)

In [117]:
wei.var()

tensor(1.0065)

In [120]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [125]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*10, dim=-1)

tensor([1.5851e-02, 7.8918e-04, 1.1713e-01, 7.8918e-04, 8.6545e-01])

In [137]:
class LayerNorm1d:

    def __init__(self, dim, eps=1e-5):
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        # calculate forward pass
        xmean = x.mean(1, keepdim=True) # change from 0 to 1 for layer norm
        xvar = x.var(1, keepdim=True) # change from 0 to 1 for layer norm
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]

torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size of 32 100-dim vectors
x = module(x)
x.shape

torch.Size([32, 100])

In [138]:
x[:,0].mean(), x[:,0].std() # mean, std of one feature across all batch inputs

(tensor(0.1469), tensor(0.8803))

In [139]:
x[0,:].mean(), x[0,:].std() # mean, std of a single input from the batch, of its features>

(tensor(-9.5367e-09), tensor(1.0000))

## Let's Train This Bitch

In [12]:
torch.cuda.is_available()

True

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Hyperparameters
batch_size = 64
block_size = 256
max_iters = 5000
eval_interval = 500
learning_rate = 3e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
# ---------------

torch.manual_seed(1337)

!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# All unique characters that appear in text (vocab)
chars = list(sorted(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for ch, i in stoi.items()}
encode = lambda s: [stoi[c] for c in s] # encoder: take a string and output list of integers
decode = lambda l: "".join([itos[i] for i in l])

# Let's encode (tokenize) and split the entire dataset
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train = data[:n]
val = data[n:]

# Create a dataloader
def get_batch(split):
    """Generate a small batch of data with inputs:x and outputs:y"""
    data = train if split == "train" else val
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x,y

xb, yb = get_batch("train")

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):

  def __init__(self, head_size):
    super().__init__()
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    B,T,C = x.shape
    q = self.query(x) # (B,T,C)
    k = self.key(x)   # (B,T,C)
    # compute attention scores ('affinities)
    wei = q @ k.transpose(-2,-1) * C**-0.5 # (B,T,C) @ (B,16,C) --->  (B,T,T)
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B,T,T)
    wei = F.softmax(wei, dim=-1) # (B,T,T)
    wei = self.dropout(wei)
    # perform the weighted aggregation of the values
    v = self.value(x) # (B,T,C)
    out = wei @ v # (B,T,T) @ (B,T,C) ---> (B,T,C)
    return out

class MultiHeadAttention(nn.Module):
  """ multiple heads of self-attention in paralell """

  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embd, n_embd)
    self.dropout = nn.Dropout(dropout)


  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.dropout(self.proj(out))
    return out

class FeedForward(nn.Module):
  """ a simple linear layer followed by a non-linearity """

  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
      nn.Linear(n_embd, 4*n_embd),
      nn.ReLU(),
      nn.Linear(4*n_embd, n_embd), # projection-layer
      nn.Dropout(dropout),
    )

  def forward(self, x):
    return self.net(x)

class Block(nn.Module):
  """ Transformer block: communication followed by computation """
  def __init__(self, n_embd, n_head):
    # n_embd: embedding dimension, n_head: the number heads we'd like
    super().__init__()
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    x = x + self.sa(self.ln1(x)) # Using the pre-norm formulation (deviates from attention is all you need)
    x = x + self.ffwd(self.ln2(x))
    return x

# Basic Bitch Bigram
class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()
    # each token directly reads off the logits for the next token from a lookup table
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embd) # Final layer norm before decoding
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    B,T = idx.shape

    # idx and targets are both (B,T) tensors of integers
    tok_emb = self.token_embedding_table(idx) # (B,T,C)
    pos_embedding = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
    x = tok_emb + pos_embedding # (B,T,C)
    x = self.blocks(x) # apply several blocks multiheaded self-attention and MLP . (B,T,C)
    x = self.ln_f(x) # (B,T,C)
    logits = self.lm_head(x) # (B,T,vocab_size)

    if targets is None:
        loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B,T) array of indicies in the current context
    for _ in range(max_new_tokens):
      # crop idx to the last block_size tokens
      idx_cond = idx[:, -block_size:]
      # Get predictions
      logits, loss = self(idx_cond)
      # focus only on the last time step
      logits = logits[:, -1, :] # Becomes (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=1) # (B,C)
      # Sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
      # append sampled index to running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

model = BigramLanguageModel()
m = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

   # every once in a while evaluate train/val-loss
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"Step: {iter}, Train loss: {losses['train']:.4f}, Val loss: {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

context = torch.zeros((1,1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

Step: 0, Train loss: 4.3358, Val loss: 4.3322
Step: 500, Train loss: 1.9854, Val loss: 2.0973
Step: 1000, Train loss: 1.5254, Val loss: 1.7382
Step: 1500, Train loss: 1.3855, Val loss: 1.6200
Step: 2000, Train loss: 1.2937, Val loss: 1.5624
Step: 2500, Train loss: 1.2181, Val loss: 1.5269
Step: 3000, Train loss: 1.1512, Val loss: 1.5100
Step: 3500, Train loss: 1.0922, Val loss: 1.5093
Step: 4000, Train loss: 1.0359, Val loss: 1.5150
Step: 4500, Train loss: 0.9814, Val loss: 1.5305

BUCKINGHAM:
Good morrow, tender comes for ten;
And Catesby, the Lord Gaunt of Restmory.

QUEEN MARGARET:
Antony, true, proud homely I mean till usurps,
That beg the head yet made of the king's safeguard.

Second Murderer:
How since the Lord Hastings of stand he field!

CLARENCE:
How! what now, who few, thy wife?

STANLEY:
No, my lord; troth, ere come to meet you not?

FRIAR LATCLIFF:
'sall, 'tis in Capule, well'd brield hell be.
You are in the heaven there lies yeveral tongue:
But murder more na


In [15]:
print(sum(p.numel() for p in m.parameters())/1e6, 'Million Parameters')

10.788929 Million Parameters


In [17]:
with open('output.txt', 'w') as f:
    text = f.write(decode(m.generate(context, max_new_tokens=10000)[0].tolist()))