<a href="https://colab.research.google.com/github/yjianpen/MlePractice/blob/main/gpt_andrej_karpathy_bigram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
## Let's start with a dataset called "tiny shakespear" for training
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-07-05 20:44:27--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-07-05 20:44:27 (26.5 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [3]:
# read it in to inspect it
with open('input.txt', 'r', encoding='utf-8') as f:
    text= f.read()


In [4]:
print(f"length of dataset in characters, len(text): {len(text)}")

length of dataset in characters, len(text): 1115394


In [5]:
# let's look at the first 1000 characters
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [6]:
# herer are all the unique chars that occur in this text

chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [28]:
## tokenize input texts, translating strings into numbers
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

print(encode("hii there"))
print(decode(encode("hii there")))
## there are many other tokenizers such as tiktok from oai, sentencepiece from google
## ideally, if we add a constant to all encoded integers, our model should not change (called gauge property)

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [8]:
## let's encode the entire text dataset and store it into a torch.tensor
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [10]:
## split our data into train set and val set
n = int(0.9*len(data)) ## first 90% training
train_data = data[:n]
val_data = data[n:]

In [11]:
## train the transformer one chunk per time, split training data into chunks

block_size = 8
train_data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [12]:
x =train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
  context = x[:t+1]
  target = y[t]
  print(f"when input is {context}, the next token target is {target}")

when input is tensor([18]), the next token target is 47
when input is tensor([18, 47]), the next token target is 56
when input is tensor([18, 47, 56]), the next token target is 57
when input is tensor([18, 47, 56, 57]), the next token target is 58
when input is tensor([18, 47, 56, 57, 58]), the next token target is 1
when input is tensor([18, 47, 56, 57, 58,  1]), the next token target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), the next token target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the next token target is 58


In [35]:
## when we actually train the gpu, we put |batch_size| * |chunk_size| tokens into the gpu for parallelization efficiency

torch.manual_seed(1337)
batch_size = 4 ## how many chunks we send into gpu every time
block_size = 8 ## size of each chunk
device = 'cuda' if torch.cuda.is_available() else 'cpu'
max_iters = 3000
eval_interval = 300
learning_rate = 1e-2
eval_iters = 200

def get_batch(split):
  ## generate a small batch of data of inputs x and targets y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  ## pick a random int in range of 0 , len(data) - block_size for batch_size times
  x = torch.stack([data[i:i+block_size] for i in ix])
  ## generate context x for each random int in ix
  y = torch.stack([data[i+1: i+block_size+1] for i in ix])
  ## if device becomes cuda, we need to move data to our gpu
  x, y = x.to(device), y.to(device)
  ## genereate target y for each random int in ix where each y[a][b] is the target for input x[:a+1][b]
  return x, y
xb, yb = get_batch('train')
print(f"inputs shape: {xb.shape}, data: {xb}, output shape {yb.shape} output value {yb}")


inputs shape: torch.Size([4, 8]), data: tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]]), output shape torch.Size([4, 8]) output value tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [30]:
## start a simple nn
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    # each token directly reads off the logits for the next token from a lookup table
    # the lookup table is randomly initialized, like a random guesser
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets = None):
    # idx and targets are both (B,T) tensor of integers
    logits = self.token_embedding_table(idx) # (B,T,C)
    ## loss = F.cross_entropy(logits, targets)
    ## cross_entropy is defined as sum(-xln(x)), if i have a prob vector [0.65: cat, 0.35: dog]
    ## and target is dog, the loss will be -ln(0.35) = 1.04
    ## if the target is cat, then loss will be -ln(0.65) = 0.43
    ## you can see the loss decreases when our prob distribution gets closer to target
    ## in pytorch, the dimension for loss is # (B,C,T) instead of #(B,T,C)
    ## threfore we need to reshape loss func
    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      # get the predictions
      logits, loss = self(idx)
      # focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C), here only 1 previous token is used, history is skipped
      # apply softmax to get probabilities from logits
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

model = BigramLanguageModel(vocab_size)
## if device is cuda:
m = model.to(device)
logits, loss = m(xb, yb)
## logits is the probability distribution tensor for each |batch_size| * |block_size| input
print(logits.shape)
print(loss)
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [31]:
## create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-3) ## higher learning rate for smaller models
## adamw refers to adaptive moment estimation with weight decay using moving averate of gradients and squared
## gradients to update parameters


In [37]:
@torch.no_grad()
def estimate_loss():
  out = {}
  m.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = m(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  m.train()
  return out

In [40]:
batch_size = 32
for iter in range(max_iters):
  ## sample a batch of data
  ## estimate loss for each eval_interval
  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"step: {iter}, train loss: {losses['train']}, val loss: {losses['val']}")
  xb, yb = get_batch('train')

  ## evaluate the loss
  logits, loss = m(xb, yb)
  ## cleaning up previous gradients
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
  ##print(loss.item()) ## use averaged loss per eval_interval to get a smoother result

step: 0, train loss: 2.4610090255737305, val loss: 2.4817562103271484
step: 300, train loss: 2.468789577484131, val loss: 2.4772789478302
step: 600, train loss: 2.456315755844116, val loss: 2.4826414585113525
step: 900, train loss: 2.4573965072631836, val loss: 2.487539291381836
step: 1200, train loss: 2.4704136848449707, val loss: 2.484560489654541
step: 1500, train loss: 2.460576057434082, val loss: 2.477604627609253
step: 1800, train loss: 2.462728500366211, val loss: 2.4868149757385254
step: 2100, train loss: 2.464789390563965, val loss: 2.473625898361206
step: 2400, train loss: 2.452817916870117, val loss: 2.488725185394287
step: 2700, train loss: 2.457933187484741, val loss: 2.477743625640869


In [41]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


IVIfis.
Genughees, ga t that,
we wongoy; unde nin arothig, h houd
HEN:
yoreay'e ENTEriouroof!

LEsthouns am th:
MEMPuniowe t s hinot s apadstr dsths su,
Allowrcr couromazes, ar-nose mothe.

S:

IFidgngut t ve ng whove, s thithes bin the:
S:
R:
LO:
I to ICLer ily, lanirsknorue:

WAs or.
DY:
Ronry teldos ond f h ay.
Be n'le;
Pald! l-ct Whin-

Asprakarther3KEThare.
Fieve?
Tre
Fiverere.


Bonchetomowimy as:
O:
Pr ownt je thopryorfy n war nany habine ourncaveie per'sthif 'ras,
I,
ALKEROK:
S: hand me



*The mathematical trick in self attention::*

In [42]:
## consider the following toy example:

torch.manual_seed(1337)
B,T,C = 4,8,2 #batch, time, channels(number of possible tokens)
## we want the 8 tokens in same Batch to each other
## the 7th token cannot know anything about 8th token
## but the reverse is true

x = torch.randn(B,T,C)
x.shape


torch.Size([4, 8, 2])

In [49]:
## averaging previous words x[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros((B,T,C))
for b in range(B):
  for t in range(T):
    xprev = x[b,:t+1] ## (t,C), can be optimized by prefix sum
    xbow[b,t] = torch.mean(xprev, 0)
print("X",x)
print(x[0] == xbow[0]) ## same for position 0 but different since position 1

X tensor([[[ 0.1808, -0.0700],
         [-0.3596, -0.9152],
         [ 0.6258,  0.0255],
         [ 0.9545,  0.0643],
         [ 0.3612,  1.1679],
         [-1.3499, -0.5102],
         [ 0.2360, -0.2398],
         [-0.9211,  1.5433]],

        [[ 1.3488, -0.1396],
         [ 0.2858,  0.9651],
         [-2.0371,  0.4931],
         [ 1.4870,  0.5910],
         [ 0.1260, -1.5627],
         [-1.1601, -0.3348],
         [ 0.4478, -0.8016],
         [ 1.5236,  2.5086]],

        [[-0.6631, -0.2513],
         [ 1.0101,  0.1215],
         [ 0.1584,  1.1340],
         [-1.1539, -0.2984],
         [-0.5075, -0.9239],
         [ 0.5467, -1.4948],
         [-1.2057,  0.5718],
         [-0.5974, -0.6937]],

        [[ 1.6455, -0.8030],
         [ 1.3514, -0.2759],
         [-1.5108,  2.1048],
         [ 2.7630, -1.7465],
         [ 1.4516, -1.5103],
         [ 0.8212, -0.2115],
         [ 0.7789,  1.5333],
         [ 1.6097, -0.4032]]])
tensor([[ True,  True],
        [False, False],
        [False

In [50]:
## C*X = Xbow? Let's consider an easy 2*2 case
## suppose X is [[a,b]
##               [c,d]]
## Xbow is      [[a,b]
##               [(a+c)/2,(b+d)/2]]
## C is         [[x11, x12]
##               [x21, x22]]
##             x11a + x12c = a             --->x12 = 0, x11 = 1
##             x11b + x12d = b             --->x11 = 1, x12 = 0
##             x21a + x22c = (a+c)/2       ----> x21 = 1/2, x22 = 1/2
##             x21b + x22d = (b+d)/2       --->  x21 = 1/2, x22 = 1/2
##             C looks like
##              [1,0]
##              [1/2,1/2]
## a more general form is triangular [1,   0,   0,    0]
##                                   [1/2,1/2,  0,    0]
##                                   [1/3,1/3, 1/3,   0]
##                                   [1/4,1/4, 1/4, 1/4]

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [54]:
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x ## (B, T, T) @ (B, T, C) ----> (B, T, C) ## paralleled matrix multiplication
print("xbow - xbow2", xbow - xbow2)
torch.allclose(xbow, xbow2, atol=1e-6)


xbow - xbow2 tensor([[[ 0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  2.9802e-08],
         [ 0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  7.4506e-09],
         [-7.4506e-09,  0.0000e+00],
         [ 7.4506e-09, -1.4901e-08],
         [ 0.0000e+00,  0.0000e+00]],

        [[ 0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [-1.4901e-08,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [-1.4901e-08,  2.2352e-08],
         [-4.6566e-09,  3.2363e-08],
         [-7.4506e-09,  0.0000e+00],
         [ 2.9802e-08,  0.0000e+00]],

        [[ 0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [-1.4901e-08, -2.9802e-08],
         [ 0.0000e+00,  0.0000e+00],
         [ 1.4901e-08, -7.4506e-09],
         [ 7.4506e-09,  0.0000e+00],
         [ 2.9802e-08,  2.9802e-08],
         [-2.9802e-08,  2.9802e-08]],

        [[ 0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 0.0000e+

True

In [58]:
## another approach: use softmax to create C where C*X = X_bow
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [69]:
n_embed = 32
class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()
    # each token directly reads off the logits for the next token from a lookup table
    # the lookup table is randomly initialized, like a random guesser
    self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
    self.position_embedding_table = nn.Embedding(block_size, n_embed)
    self.lm_head = nn.Linear(n_embed, vocab_size)

  def forward(self, idx, targets = None):
    # idx and targets are both (B,T) tensor of integers
    B, T = idx.shape
    tok_embed = self.token_embedding_table(idx) # (B,T,C: n_embed)
    pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C: n_embed)
    x = tok_embed + pos_emb # (B,T,C: n_embed)
    logits = self.lm_head(tok_embed) # (B,T,vocab_size)

    ## loss = F.cross_entropy(logits, targets)
    ## cross_entropy is defined as sum(-xln(x)), if i have a prob vector [0.65: cat, 0.35: dog]
    ## and target is dog, the loss will be -ln(0.35) = 1.04
    ## if the target is cat, then loss will be -ln(0.65) = 0.43
    ## you can see the loss decreases when our prob distribution gets closer to target
    ## in pytorch, the dimension for loss is # (B,C,T) instead of #(B,T,C)
    ## threfore we need to reshape loss func
    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      # get the predictions
      logits, loss = self(idx)
      # focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C), here only 1 previous token is used, history is skipped
      # apply softmax to get probabilities from logits
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

model = BigramLanguageModel()
## if device is cuda:
m = model.to(device)
logits, loss = m(xb, yb)
## logits is the probability distribution tensor for each |batch_size| * |block_size| input
print(logits.shape)
print(loss)
## create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-3) ## higher learning rate for smaller models
## adamw refers to adaptive moment estimation with weight decay using moving averate of gradients and squared
## gradients to update parameters
for iter in range(max_iters):
  ## sample a batch of data
  ## estimate loss for each eval_interval
  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"step: {iter}, train loss: {losses['train']}, val loss: {losses['val']}")
  xb, yb = get_batch('train')

  ## evaluate the loss
  logits, loss = m(xb, yb)
  ## cleaning up previous gradients
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
##print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([256, 65])
tensor(4.3020, grad_fn=<NllLossBackward0>)
step: 0, train loss: 4.372992992401123, val loss: 4.365924835205078
step: 300, train loss: 2.8543167114257812, val loss: 2.8668007850646973
step: 600, train loss: 2.644808053970337, val loss: 2.658140182495117
step: 900, train loss: 2.5634756088256836, val loss: 2.5893383026123047
step: 1200, train loss: 2.5483484268188477, val loss: 2.5588622093200684
step: 1500, train loss: 2.533768892288208, val loss: 2.543004035949707
step: 1800, train loss: 2.5078165531158447, val loss: 2.5229456424713135
step: 2100, train loss: 2.5005290508270264, val loss: 2.5185604095458984
step: 2400, train loss: 2.4943957328796387, val loss: 2.5248606204986572
step: 2700, train loss: 2.484518527984619, val loss: 2.5112946033477783


In [82]:
# version 4: self attension meaning q and k are all coming from x, k and q
# are interchangeable in this case!
# cross-attention is used when we have some context like weather information
# formula: Attention = V(QK^T/sqrt(d)), Q: query, K: key, V:value, T: transpose,
# sqrt(d): normalization so attention does not get messed up by size of Q

torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B,T,C)
# let's see a single Head perform self-attention
head_size = 16
# each head is a possible correlation mechanism,
# e.g.: "Today is the last day for summer camp"
# one head is spelling in word Today
# another head is semantics, like Today is (single form of be for Today)
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)

wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)
## interacting q and k, higher value at wei[i][j] means higher correlation between
## x[B,i,C ] and x[B,j,C ], i and j are positions
## wei is the adj matrix of a directional complete graph
##    1 ---> 2
##    2 ---> 4
##    4 ---> 3 ....
## if we use transformer to study multi character dialogues, then we can correlate
## ramsay numbers with transformer's Attention layer, R(3,3) = 6 meaning that we will
# # see a cluster of 3 tokens from same character if the total token number is
# >= 6

tril = torch.tril(torch.ones(T, T))
##wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
## float('-inf') might not be needed in other tasks like semantic classification
wei = F.softmax(wei, dim=-1) ##normalization
v = value(x) ## storing weighted information about x for a single head
out = wei @ v
#out = wei @ x
out.shape

torch.Size([4, 8, 16])

In [80]:
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [81]:
wei

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
         [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
         [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
         [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1687, 0.8313, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2477, 0.0514, 0.7008, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4410, 0.0957, 0.3747, 0.0887, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0069, 0.0456, 0.0300, 0.7748, 0.1427, 0.0000, 0.0000, 0.0000],
         [0.0660, 0.089

In [87]:
class Head(nn.Module):
  """one head of self-attention"""
  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embed, head_size, bias=False)
    self.query = nn.Linear(n_embed, head_size, bias=False)
    self.value = nn.Linear(n_embed, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

  def forward(self, x):
    B,T,C = x.shape
    k = self.key(x) # (B,T,C)
    q = self.query(x) # (B,T,C)
    wei = q @ k.transpose(-2, -1) * C**-0.5 # (B, T, C) @ (B, C, T) ---> (B, T, T)
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)
    v = self.value(x)
    out = wei @ v
    return out

class BigramLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    # each token directly reads off the logits for the next token from a lookup table
    self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
    self.position_embedding_table = nn.Embedding(block_size, n_embed)
    self.lm_head = nn.Linear(n_embed, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape
    # idx and targets are both (B,T) tensor of integers
    token_emb = self.token_embedding_table(idx) # (B,T,C)
    pos_emb = self.position_embedding_table(torch.arange(T, device=device))
    x = token_emb + pos_emb
    logits = self.lm_head(x) # (B,T,vocab_size)
    if self.lm_head is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      ## crop idx up to block_size
      idx_cond = idx[:, -block_size:]
      # get the predictions
      logits, loss = self(idx_cond)
      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

model = BigramLanguageModel()
m = model.to(device)
learning_rate = 1e-3
max_iters = 5000

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
for iter in range(max_iters):
  ## sample a batch of data
  ## estimate loss for each eval_interval
  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"step: {iter}, train loss: {losses['train']}, val loss: {losses['val']}")
  xb, yb = get_batch('train')

  ## evaluate the loss
  logits, loss = m(xb, yb)
  ## cleaning up previous gradients
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
  ##print(loss.item()) ## use averaged loss per eval_interval to get a smoother result






step: 0, train loss: 4.446019172668457, val loss: 4.459233283996582
step: 300, train loss: 2.888965368270874, val loss: 2.9028170108795166
step: 600, train loss: 2.7008557319641113, val loss: 2.7144293785095215
step: 900, train loss: 2.6095917224884033, val loss: 2.630202531814575
step: 1200, train loss: 2.579555034637451, val loss: 2.5711798667907715
step: 1500, train loss: 2.546095848083496, val loss: 2.5442676544189453
step: 1800, train loss: 2.536454916000366, val loss: 2.5312230587005615
step: 2100, train loss: 2.5246574878692627, val loss: 2.5161097049713135
step: 2400, train loss: 2.501429796218872, val loss: 2.5197532176971436
step: 2700, train loss: 2.5072567462921143, val loss: 2.519226312637329
step: 3000, train loss: 2.4890663623809814, val loss: 2.5058083534240723
step: 3300, train loss: 2.5023276805877686, val loss: 2.498231887817383
step: 3600, train loss: 2.496242046356201, val loss: 2.508150100708008
step: 3900, train loss: 2.4874062538146973, val loss: 2.5007715225219

In [91]:
## MultiHead attention added

class MultiHeadAttention(nn.Module):
  """multiple heads of self-attention in parallel"""
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(num_heads * head_size, n_embed)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.proj(out)
    return out

class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
    self.position_embedding_table = nn.Embedding(block_size, n_embed)
    self.sa_heads = MultiHeadAttention(4, n_embed//4) ## 4 heads of 8-dimensional self-attention
    self.lm_head = nn.Linear(n_embed, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape
    tok_embed = self.token_embedding_table(idx) # (B,T,C)
    pos_emb = self.position_embedding_table(torch.arange(T, device=device))
    x = tok_embed + pos_emb
    x= self.sa_heads(x)
    logits = self.lm_head(x) # (B,T,vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T) # Correctly reshape targets to (B*T,)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      ## crop idx up to block_size
      idx_cond = idx[:, -block_size:]
      # get the predictions
      logits, loss = self(idx_cond)
      # focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C), here only 1 previous token is used, history is skipped
      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

model = BigramLanguageModel()
m = model.to(device)
learning_rate = 1e-3
max_iters = 5000

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
for iter in range(max_iters):
  ## sample a batch of data
  ## estimate loss for each eval_interval
  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"step: {iter}, train loss: {losses['train']}, val loss: {losses['val']}")
  xb, yb = get_batch('train')

  ## evaluate the loss
  logits, loss = m(xb, yb)
  ## cleaning up previous gradients
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
  ##print(loss.item()) ## use averaged loss per eval_interval to get a smoother result

step: 0, train loss: 4.192660808563232, val loss: 4.192004203796387
step: 300, train loss: 2.770993232727051, val loss: 2.763517379760742
step: 600, train loss: 2.566837787628174, val loss: 2.5650320053100586
step: 900, train loss: 2.515684127807617, val loss: 2.4957964420318604
step: 1200, train loss: 2.4539873600006104, val loss: 2.449890375137329
step: 1500, train loss: 2.3994812965393066, val loss: 2.4090819358825684
step: 1800, train loss: 2.3737523555755615, val loss: 2.3874008655548096
step: 2100, train loss: 2.3588593006134033, val loss: 2.362799882888794
step: 2400, train loss: 2.3249499797821045, val loss: 2.343980073928833
step: 2700, train loss: 2.3266706466674805, val loss: 2.321322441101074
step: 3000, train loss: 2.3215415477752686, val loss: 2.3130979537963867
step: 3300, train loss: 2.305401563644409, val loss: 2.3072428703308105
step: 3600, train loss: 2.2943613529205322, val loss: 2.304673671722412
step: 3900, train loss: 2.280968189239502, val loss: 2.28855419158935

In [93]:
## Position wise feed forward networks FFN(x) = max(0,xW1+b1)W2 + b2
## a single layer nn after self attention, think individually after self attention
class FeedForward(nn.Module):
  def __init__(self, n_embed):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embed, 4 * n_embed),
        nn.ReLU(),
        nn.Linear(4 * n_embed, n_embed))
  def forward(self, x):
    return self.net(x)

class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
    self.position_embedding_table = nn.Embedding(block_size, n_embed)
    self.sa_heads = MultiHeadAttention(4, n_embed//4) ## 4 heads of 8-dimensional self-attention
    self.lm_head = nn.Linear(n_embed, vocab_size)
    self.ffwd = FeedForward(n_embed)

  def forward(self, idx, targets=None):
    B, T = idx.shape
    tok_embed = self.token_embedding_table(idx) # (B,T,C)
    pos_emb = self.position_embedding_table(torch.arange(T, device=device))
    x = tok_embed + pos_emb
    x = self.ffwd(x)
    x= self.sa_heads(x)
    logits = self.lm_head(x) # (B,T,vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T) # Correctly reshape targets to (B*T,)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      ## crop idx up to block_size
      idx_cond = idx[:, -block_size:]
      # get the predictions
      logits, loss = self(idx_cond)
      # focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C), here only 1 previous token is used, history is skipped
      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

model = BigramLanguageModel()
m = model.to(device)
learning_rate = 1e-3
max_iters = 5000

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
for iter in range(max_iters):
  ## sample a batch of data
  ## estimate loss for each eval_interval
  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"step: {iter}, train loss: {losses['train']}, val loss: {losses['val']}")
  xb, yb = get_batch('train')

  ## evaluate the loss
  logits, loss = m(xb, yb)
  ## cleaning up previous gradients
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
  ##print(loss.item()) ## use averaged loss per eval_interval to get a smoother result

step: 0, train loss: 4.187022686004639, val loss: 4.186731338500977
step: 300, train loss: 2.688169002532959, val loss: 2.696760892868042
step: 600, train loss: 2.5288286209106445, val loss: 2.5361242294311523
step: 900, train loss: 2.458847999572754, val loss: 2.474306106567383
step: 1200, train loss: 2.434736967086792, val loss: 2.446643590927124
step: 1500, train loss: 2.399184465408325, val loss: 2.4020230770111084
step: 1800, train loss: 2.373915195465088, val loss: 2.3823931217193604
step: 2100, train loss: 2.3378641605377197, val loss: 2.346947431564331
step: 2400, train loss: 2.3180346488952637, val loss: 2.3448452949523926
step: 2700, train loss: 2.3038628101348877, val loss: 2.3283541202545166
step: 3000, train loss: 2.3079612255096436, val loss: 2.307527542114258
step: 3300, train loss: 2.279653549194336, val loss: 2.3040473461151123
step: 3600, train loss: 2.2823243141174316, val loss: 2.288362979888916
step: 3900, train loss: 2.276033639907837, val loss: 2.294891595840454


In [97]:
## creating transformer block: communication followed by computation

## MultiHead attention added

class Block(nn.Module):
  def __init__(self, n_embed, n_head):
    super().__init__()
    head_size = n_embed // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embed)

  def forward(self, x):
    x = self.sa(x)
    x = self.ffwd(x)
    return x

class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
    self.position_embedding_table = nn.Embedding(block_size, n_embed)
    self.blocks = nn.Sequential(
        Block(n_embed, n_head=4),
        Block(n_embed, n_head=4),
        Block(n_embed, n_head=4),
        nn.LayerNorm(n_embed))
    self.lm_head = nn.Linear(n_embed, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape
    tok_embed = self.token_embedding_table(idx) # (B,T,C)
    pos_emb = self.position_embedding_table(torch.arange(T, device=device))
    x = tok_embed + pos_emb
    x = self.blocks(x)
    logits = self.lm_head(x) # (B,T,vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T) # Correctly reshape targets to (B*T,)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      ## crop idx up to block_size
      idx_cond = idx[:, -block_size:]
      # get the predictions
      logits, loss = self(idx_cond)
      # focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C), here only 1 previous token is used, history is skipped
      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

model = BigramLanguageModel()
m = model.to(device)
learning_rate = 1e-3
max_iters = 5000

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
for iter in range(max_iters):
  ## sample a batch of data
  ## estimate loss for each eval_interval
  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"step: {iter}, train loss: {losses['train']}, val loss: {losses['val']}")
  xb, yb = get_batch('train')

  ## evaluate the loss
  logits, loss = m(xb, yb)
  ## cleaning up previous gradients
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
  ## this sucks (2.26 > 2.24) because right now our NN is too deep, can be optimized by residual layers
  ## layer 0 -> layer 1 -> layer 2
  ## layer 0  -------------> layer 2
  ## highway of optimization
  ## print(loss.item()) ## use averaged loss per eval_interval to get a smoother result

step: 0, train loss: 4.263692855834961, val loss: 4.267703056335449
step: 300, train loss: 3.005568504333496, val loss: 3.010251998901367
step: 600, train loss: 2.742722749710083, val loss: 2.736342430114746
step: 900, train loss: 2.6608428955078125, val loss: 2.6547176837921143
step: 1200, train loss: 2.5742480754852295, val loss: 2.5712430477142334
step: 1500, train loss: 2.5232956409454346, val loss: 2.5176498889923096
step: 1800, train loss: 2.4892380237579346, val loss: 2.474663496017456
step: 2100, train loss: 2.428285598754883, val loss: 2.4384818077087402
step: 2400, train loss: 2.400010108947754, val loss: 2.4020473957061768
step: 2700, train loss: 2.3722848892211914, val loss: 2.3745193481445312
step: 3000, train loss: 2.349698543548584, val loss: 2.3584654331207275
step: 3300, train loss: 2.326295852661133, val loss: 2.3251519203186035
step: 3600, train loss: 2.3275749683380127, val loss: 2.319545269012451
step: 3900, train loss: 2.291827440261841, val loss: 2.31239724159240

In [98]:
## Blocks with Residual Network

class MultiHeadAttention(nn.Module):
  """multiple heads of self-attention in parallel"""
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embed, n_embed)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.proj(out)
    return out

class Block(nn.Module):
  def __init__(self, n_embed, n_head):
    super().__init__()
    head_size = n_embed // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embed)

  def forward(self, x):
    ##x = self.sa(x)
    x = x + self.sa(x)
    ##x = self.ffwd(x)
    x = x + self.ffwd(x)
    return x

class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
    self.position_embedding_table = nn.Embedding(block_size, n_embed)
    self.blocks = nn.Sequential(
        Block(n_embed, n_head=4),
        Block(n_embed, n_head=4),
        Block(n_embed, n_head=4),
        nn.LayerNorm(n_embed))
    self.lm_head = nn.Linear(n_embed, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape
    tok_embed = self.token_embedding_table(idx) # (B,T,C)
    pos_emb = self.position_embedding_table(torch.arange(T, device=device))
    x = tok_embed + pos_emb
    x = self.blocks(x)
    logits = self.lm_head(x) # (B,T,vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T) # Correctly reshape targets to (B*T,)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      ## crop idx up to block_size
      idx_cond = idx[:, -block_size:]
      # get the predictions
      logits, loss = self(idx_cond)
      # focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C), here only 1 previous token is used, history is skipped
      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

model = BigramLanguageModel()
m = model.to(device)
learning_rate = 1e-3
max_iters = 5000

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
for iter in range(max_iters):
  ## sample a batch of data
  ## estimate loss for each eval_interval
  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"step: {iter}, train loss: {losses['train']}, val loss: {losses['val']}")
  xb, yb = get_batch('train')

  ## evaluate the loss
  logits, loss = m(xb, yb)
  ## cleaning up previous gradients
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
  ## loss reduces significantly to 1.98 thanks to residual network!



step: 0, train loss: 4.283599376678467, val loss: 4.283738613128662
step: 300, train loss: 2.5266199111938477, val loss: 2.543529987335205
step: 600, train loss: 2.377328634262085, val loss: 2.379370927810669
step: 900, train loss: 2.2906618118286133, val loss: 2.3031249046325684
step: 1200, train loss: 2.2347922325134277, val loss: 2.2503206729888916
step: 1500, train loss: 2.1895906925201416, val loss: 2.214203119277954
step: 1800, train loss: 2.1645877361297607, val loss: 2.186835527420044
step: 2100, train loss: 2.1245453357696533, val loss: 2.178359031677246
step: 2400, train loss: 2.1083948612213135, val loss: 2.152254581451416
step: 2700, train loss: 2.0861928462982178, val loss: 2.1210081577301025
step: 3000, train loss: 2.06648588180542, val loss: 2.1313681602478027
step: 3300, train loss: 2.0594794750213623, val loss: 2.112713098526001
step: 3600, train loss: 2.048938274383545, val loss: 2.106736660003662
step: 3900, train loss: 2.036045789718628, val loss: 2.100717544555664


In [99]:
decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist())
## we see that some real words such as "Your" and "the" right now!

'\nTo chame truag\nAnd wift scroter. Lorddiend your!\nFor ever Vear, nast.\n\nDULINIENE:\nYour hight, the by'

In [108]:
## LayerNorm, very similar to batch norm
## making sure that across batch dimension, any individual neuron
## unit gaussian distribution, 0 1std, 2std ...

class BatchNorm1d:
  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    ##self.momentum = momentum
    ##self.training = True
    # parameters (trained with backprop)
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
    ##self.running_mean = torch.zeros(dim)
    ##self.running_var = torch.ones(dim)

  def __call__(self, x):
    ## calculate the forward pass
    ## 1 for normalizing row and 0 for normalizing column
    ##if self.training:
      ##xmean = x.mean(1, keepdim=True) # batch mean
      ##xvar = x.var(1, keepdim=True) # batch variance
    xmean = x.mean(1, keepdim=True)
    xvar = x.var(1, keepdim=True)
    ##else:
      ##xmean = self.running_mean
      ##xvar = self.running_var
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    ## update the buffers
    ##if self.training:
      ##with torch.no_grad():
        ##self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
        ##self.running_var = (1- self.momentum) * self.running_var + self.momentum * xvar
    return self.out

  def parameters(self):
    return [self.gamma, self.beta]

torch.manual_seed(1337)
module = BatchNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)
x.shape




torch.Size([32, 100])

In [109]:
x[0,:].mean(), x[0,:].std()

(tensor(-9.5367e-09), tensor(1.0000))

In [110]:
class Block(nn.Module):
  def __init__(self, n_embed, n_head):
    super().__init__()
    head_size = n_embed // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embed)
    self.ln1 = nn.LayerNorm(n_embed)
    self.ln2 = nn.LayerNorm(n_embed)

  def forward(self, x):
    ##x = self.sa(x)
    x = x + self.sa(self.ln1(x))
    ##x = self.ffwd(x)
    x = x + self.ffwd(self.ln2(x))
    return x
class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
    self.position_embedding_table = nn.Embedding(block_size, n_embed)
    self.blocks = nn.Sequential(
        Block(n_embed, n_head=4),
        Block(n_embed, n_head=4),
        Block(n_embed, n_head=4),
        nn.LayerNorm(n_embed))
    self.lm_head = nn.Linear(n_embed, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape
    tok_embed = self.token_embedding_table(idx) # (B,T,C)
    pos_emb = self.position_embedding_table(torch.arange(T, device=device))
    x = tok_embed + pos_emb
    x = self.blocks(x)
    logits = self.lm_head(x) # (B,T,vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T) # Correctly reshape targets to (B*T,)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      ## crop idx up to block_size
      idx_cond = idx[:, -block_size:]
      # get the predictions
      logits, loss = self(idx_cond)
      # focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C), here only 1 previous token is used, history is skipped
      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

model = BigramLanguageModel()
m = model.to(device)
learning_rate = 1e-3
max_iters = 5000

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
for iter in range(max_iters):
  ## sample a batch of data
  ## estimate loss for each eval_interval
  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"step: {iter}, train loss: {losses['train']}, val loss: {losses['val']}")
  xb, yb = get_batch('train')

  ## evaluate the loss
  logits, loss = m(xb, yb)
  ## cleaning up previous gradients
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()



step: 0, train loss: 4.379400253295898, val loss: 4.377382755279541
step: 300, train loss: 2.5497701168060303, val loss: 2.5449483394622803
step: 600, train loss: 2.391538143157959, val loss: 2.3754067420959473
step: 900, train loss: 2.3018839359283447, val loss: 2.3005971908569336
step: 1200, train loss: 2.228256940841675, val loss: 2.264866352081299
step: 1500, train loss: 2.202476978302002, val loss: 2.2245471477508545
step: 1800, train loss: 2.1606202125549316, val loss: 2.1865651607513428
step: 2100, train loss: 2.143864393234253, val loss: 2.1705808639526367
step: 2400, train loss: 2.1226892471313477, val loss: 2.157665491104126
step: 2700, train loss: 2.109161615371704, val loss: 2.1337807178497314
step: 3000, train loss: 2.080789089202881, val loss: 2.115238904953003
step: 3300, train loss: 2.068002700805664, val loss: 2.121767997741699
step: 3600, train loss: 2.0523927211761475, val loss: 2.1015748977661133
step: 3900, train loss: 2.037473440170288, val loss: 2.100478887557983

In [111]:
## now we have all components of a transformer, but let's scale it up!


In [None]:
dropout = 0.2
n_layer = 6
batch_size = 64
n_embed = 384
block_size = 256
learning_rate = 3e-4
max_iters = 5000
eval_interval = 500
n_head = 6
#### Don't run it on cpu, you need gpus!
class Head(nn.Module):
  """one head of self-attention"""
  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embed, head_size, bias=False)
    self.query = nn.Linear(n_embed, head_size, bias=False)
    self.value = nn.Linear(n_embed, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    B,T,C = x.shape
    k = self.key(x) # (B,T,C)
    q = self.query(x) # (B,T,C)
    wei = q @ k.transpose(-2, -1) * C**-0.5 # (B, T, C) @ (B, C, T) ---> (B, T, T)
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)
    wei = self.dropout(wei)
    v = self.value(x)
    out = wei @ v
    return out
class MultiHeadAttention(nn.Module):
  """multiple heads of self-attention in parallel"""
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embed, n_embed)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.proj(out)
    return out

class FeedForward(nn.Module):
  def __init__(self, n_embed):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embed, 4 * n_embed),
        nn.ReLU(),
        nn.Linear(4 * n_embed, n_embed),
        nn.Dropout(dropout))
  def forward(self, x):
    return self.net(x)

class Block(nn.Module):
  def __init__(self, n_embed, n_head):
    super().__init__()
    head_size = n_embed // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embed)
    self.ln1 = nn.LayerNorm(n_embed)
    self.ln2 = nn.LayerNorm(n_embed)

  def forward(self, x):
    ##x = self.sa(x)
    x = x + self.sa(self.ln1(x))
    ##x = self.ffwd(x)
    x = x + self.ffwd(self.ln2(x))
    return x
class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
    self.position_embedding_table = nn.Embedding(block_size, n_embed)
    self.blocks = nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embed)
    self.lm_head = nn.Linear(n_embed, vocab_size)


  def forward(self, idx, targets=None):
    B, T = idx.shape
    tok_embed = self.token_embedding_table(idx) # (B,T,C)
    pos_emb = self.position_embedding_table(torch.arange(T, device=device))
    x = tok_embed + pos_emb
    x = self.blocks(x)
    logits = self.lm_head(x) # (B,T,vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T) # Correctly reshape targets to (B*T,)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      ## crop idx up to block_size
      idx_cond = idx[:, -block_size:]
      # get the predictions
      logits, loss = self(idx_cond)
      # focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C), here only 1 previous token is used, history is skipped
      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

model = BigramLanguageModel()
m = model.to(device)
learning_rate = 1e-3
max_iters = 5000

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
for iter in range(max_iters):
  ## sample a batch of data
  ## estimate loss for each eval_interval
  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"step: {iter}, train loss: {losses['train']}, val loss: {losses['val']}")
  xb, yb = get_batch('train')

  ## evaluate the loss
  logits, loss = m(xb, yb)
  ## cleaning up previous gradients
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()