<a href="https://colab.research.google.com/github/zhenyiqi/nanoGPT/blob/master/gpt_dev_zhenyiqi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare dataset

## Download

In [2]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-01-28 19:55:40--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-01-28 19:55:40 (17.3 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [3]:
with open('input.txt', 'r') as f:
  text = f.read()

In [4]:
print(f"length of the dataset is {len(text)}")

length of the dataset is 1115394


## encode characters into numbers (indices)

In [5]:
chars = sorted(list(set(text)))

In [6]:
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [7]:
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}

In [8]:
encode = lambda s: [stoi[c] for c in s]
decode = lambda i: ''.join([itos[ind] for ind in i])

In [9]:
print(encode("hello there"))
print(decode(encode("hello there")))

[46, 43, 50, 50, 53, 1, 58, 46, 43, 56, 43]
hello there


In [10]:
import torch

In [11]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

## Create train/validation sets

In [12]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [13]:
# notation
block_size = 8 # this is the maximum length of the chunk (sequence length, in other places, this is also donated as T)
train_data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [14]:
x = train_data[:block_size] # [18, 47, 56, 57, 58,  1, 15, 47]
y = train_data[1:block_size + 1] # [47, 56, 57, 58,  1, 15, 47, 58]
for t in range(block_size):
  context = x[:t + 1]
  target = y[t]
  print(f"when input is {context}, the target is : {target}")


when input is tensor([18]), the target is : 47
when input is tensor([18, 47]), the target is : 56
when input is tensor([18, 47, 56]), the target is : 57
when input is tensor([18, 47, 56, 57]), the target is : 58
when input is tensor([18, 47, 56, 57, 58]), the target is : 1
when input is tensor([18, 47, 56, 57, 58,  1]), the target is : 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is : 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is : 58


In [15]:
# generalize with extra batch size dimension

torch.manual_seed(1337)
batch_size = 4
block_size = 8
def get_batch(split):
  data = train_data if split == 'train' else val_data
  # a random vector of length batch_size, with its value sampled from
  # the range (0, len(data) - block_size), which is all possible values for the
  # the start of a sampled sequence.
  ix = torch.randint(len(data) - block_size, (batch_size, ))
  # now get the samples out, stacking them together
  x = torch.stack([data[i:i+block_size] for i in ix])  # (block_size, batch_size)
  y = torch.stack([data[i+1:i+block_size+1] for i in ix]) # (block_size, batch_size)
  return x, y

In [16]:
xb, yb = get_batch('train')

In [17]:
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)
print('----')

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----


In [18]:
for b in range(batch_size):
  for t in range(block_size):
    context = xb[b, :t+1]
    target = yb[b, t]
    print(f"when input is {context.tolist()}, the target is {target}")

when input is [24], the target is 43
when input is [24, 43], the target is 58
when input is [24, 43, 58], the target is 5
when input is [24, 43, 58, 5], the target is 57
when input is [24, 43, 58, 5, 57], the target is 1
when input is [24, 43, 58, 5, 57, 1], the target is 46
when input is [24, 43, 58, 5, 57, 1, 46], the target is 43
when input is [24, 43, 58, 5, 57, 1, 46, 43], the target is 39
when input is [44], the target is 53
when input is [44, 53], the target is 56
when input is [44, 53, 56], the target is 1
when input is [44, 53, 56, 1], the target is 58
when input is [44, 53, 56, 1, 58], the target is 46
when input is [44, 53, 56, 1, 58, 46], the target is 39
when input is [44, 53, 56, 1, 58, 46, 39], the target is 58
when input is [44, 53, 56, 1, 58, 46, 39, 58], the target is 1
when input is [52], the target is 58
when input is [52, 58], the target is 1
when input is [52, 58, 1], the target is 58
when input is [52, 58, 1, 58], the target is 46
when input is [52, 58, 1, 58, 46

In [19]:
xb

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])

# Modeling

## Version 1: bigram language model

The "model" contains only the embedding layer. The input looks like
`[x1, x2, x3, x4, ..., xt]`, while the target looks like
`[x2, x3, x4, x5, ..., xt+1]`.

The embedding layer will convert each xi into a vector.
```
[[x11, x12, x13, ..., x1n],
 [x21, x22, x23, ..., x2n],
  ...
 [xt1, xt2, xt3, ..., xtn]],
```

where n is the embedding dimension.

We train the model such that, after the embedding, the embedding `[xm1, xm2, xm3, ... xmn]`, at time `m` matches the target at time `m`. However, because a token at time `m'` does not talk to another token at time `m"`, if `m' != m"`, and the training is matching a token at `m` with its target at `m`, which basically is the token at `m+1` in the ground-truth sequence, we call this model a Bigram language model.

In [33]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
  def __init__(self, vocab_size):
    super().__init__()
    # an embedding is a table that maps each token to a vector
    # nn.Embedding(n1, n2) maps a token whose value is up to n1 (starting from 0)
    # to a vector of size n2. Here we map each token to a vector of size vocab_size
    # (One example is one-hot encoding)
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    """A Forward proporgation.

    idx: the input. Dimension [B, T].
    targets: target. Dimension [B, T], [B, i]th element is the target of sequence
      idx[B, :i].
    """
    # idx and targets are both (B, T), meaning (batch_size, block_size)
    logits = self.token_embedding_table(idx) # (B, T, C) C is vocab_size / embedding size
    # here is a bit tricky, please refer to
    # https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html
    # to understand what's in there
    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B * T, C)
      targets = targets.view(B * T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    """Generate the next token for max_new_tokens of times for the batch idx.

    idx: a batch of samples, of size (B, T).
    max_new_tokens: int, how many tokens to be generated given a sample.
    """
    for _ in range(max_new_tokens):
      # get the predictions
      logits, loss = self(idx) # logits: (B, T, C)
      # print(logits.shape)
      # Last one of the sequence
      logits = logits[:, -1, :] # (B, C)
      probs = F.softmax(logits, dim=-1) # (B, C)
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      idx = torch.cat((idx, idx_next), dim=1) # (B, T + 1)
    return idx

bigram_model = BigramLanguageModel(vocab_size)
logits, loss = bigram_model(xb, yb)
print(logits.shape)
print(loss)

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)


In [34]:
idx = torch.zeros((1, 1), dtype=torch.long)

print(decode(bigram_model.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([1, 1, 65])
torch.Size([1, 2, 65])
torch.Size([1, 3, 65])
torch.Size([1, 4, 65])
torch.Size([1, 5, 65])
torch.Size([1, 6, 65])
torch.Size([1, 7, 65])
torch.Size([1, 8, 65])
torch.Size([1, 9, 65])
torch.Size([1, 10, 65])
torch.Size([1, 11, 65])
torch.Size([1, 12, 65])
torch.Size([1, 13, 65])
torch.Size([1, 14, 65])
torch.Size([1, 15, 65])
torch.Size([1, 16, 65])
torch.Size([1, 17, 65])
torch.Size([1, 18, 65])
torch.Size([1, 19, 65])
torch.Size([1, 20, 65])
torch.Size([1, 21, 65])
torch.Size([1, 22, 65])
torch.Size([1, 23, 65])
torch.Size([1, 24, 65])
torch.Size([1, 25, 65])
torch.Size([1, 26, 65])
torch.Size([1, 27, 65])
torch.Size([1, 28, 65])
torch.Size([1, 29, 65])
torch.Size([1, 30, 65])
torch.Size([1, 31, 65])
torch.Size([1, 32, 65])
torch.Size([1, 33, 65])
torch.Size([1, 34, 65])
torch.Size([1, 35, 65])
torch.Size([1, 36, 65])
torch.Size([1, 37, 65])
torch.Size([1, 38, 65])
torch.Size([1, 39, 65])
torch.Size([1, 40, 65])
torch.Size([1, 41, 65])
torch.Size([1, 42, 65])
t

In [37]:
loss

tensor(4.8786, grad_fn=<NllLossBackward0>)

### Math trick for self-attention

In [51]:
torch.manual_seed(1337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

Toy problem: We want x[b, t] = mean_{i <= t} x[b, i]
slow yet straightforward way of doing this

In [53]:
xbow = torch.zeros((B, T, C))
for b in range(B):
  for t in range(T):
    xprev = x[b, :t + 1] #(b)
    xbow[b, t] = torch.mean(xprev, 0)

fast way of doing this

In [70]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b

In [71]:
a

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])

In [72]:
b

tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])

In [73]:
c

tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])

In [None]:
a

better formulate this

In [75]:
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x # (T, T) @ (B, T, C) -Pytorch-> (B, T, T) @ (B, T, C) --> (B, T, C)

In [76]:
torch.allclose(xbow, xbow2)

False

In [79]:
xbow[0], xbow2[0]

(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

Version 3: use Softmax

In [81]:
tril = torch.tril(torch.ones((T, T)))

wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow2, xbow3)

True

same version, but making it a maksed self-attention computation

In [85]:
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)

wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) --> (B, T, T)

tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v
out.shape

torch.Size([4, 8, 16])

In [86]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

## Version 2: with attention layer

This allows all the tokens to talk to each other without (?) leaking future information.

### self attention head

In [None]:
class SelfAttentionHead(nn.Module):
  def __init__(self, head_size):
    """self-attention head.

    there will be multiple heads. head_size * n_head = n_embed.
    """
    super().__init__()
    self.key = nn.Linear(n_embed, head_size, bias=False)
    self.query = nn.Linear(n_embed, head_size, bias=False)
    self.value = nn.Linear(n_embed, head_size, bias=False)
    # tril is not the parameter of the module, so we call it a buffer
    # (not a parameter)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

  def forward(self, x):
    B, T, C = x.shape
    k = self.key(x) # (B, T, head_size)
    q = self.query(x) # (B, T, head_size)

    wei = q @ k.transpose(-2, -1) * C ** -0.5# (B, T, T)
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1) # (B, T, T)
    v = self.value(x) # (B, T, head_size)
    out = wei @ v # (B, T, head_size)
    return out

### Model with one-head attention

In [93]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

n_embed = 32

class BigramLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    # an embedding is a table that maps each token to a vector
    # nn.Embedding(n1, n2) maps a token whose value is up to n1 (starting from 0)
    # to a vector of size n2. Here we map each token to a vector of size vocab_size
    # (One example is one-hot encoding)
    self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
    self.position_embedding_table = nn.Embedding(block_size, n_embed)
    self.sa_head = SelfAttentionHead(n_embed) # n_embed is the same as head_size (for now), meaning there is only one head
    # This is the output layer that maps the transformed embeddings back to the
    # original vocab_size vector - logits.
    self.lm_head = nn.Linear(n_embed, vocab_size)

  def forward(self, idx, targets=None):
    """A Forward proporgation.

    idx: the input. Dimension [B, T].
    targets: target. Dimension [B, T], [B, i]th element is the target of sequence
      idx[B, :i].
    """
    B, T = idx.shape

    # idx and targets are both (B, T), meaning (batch_size, block_size)
    token_embed = self.token_embedding_table(idx) # (B, T, C = n_embed)
    # pos_embed = self.position_embedding_table(torch.arange(T, device=device)) # (T, C)
    pos_embed = self.position_embedding_table(torch.arange(T)) # (T, C)

    x = token_embed + pos_embed
    x = self.sa_head(x)
    logits = self.lm_head(x) # (B, T, C = vocab_size)
    # here is a bit tricky, please refer to
    # https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html
    # to understand what's in there
    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B * T, C)
      targets = targets.view(B * T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    """Generate the next token for max_new_tokens of times for the batch idx.

    idx: a batch of samples, of size (B, T).
    max_new_tokens: int, how many tokens to be generated given a sample.
    """
    for _ in range(max_new_tokens):
      # crop the input size so that it's at most the same as block_size when
      # generating
      idx_cond = idx[:, -block_size:]
      # get the predictions
      logits, loss = self(idx_cond) # logits: (B, T, C)
      # print(logits.shape)
      # Last one of the sequence
      logits = logits[:, -1, :] # (B, C)
      probs = F.softmax(logits, dim=-1) # (B, C)
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      idx = torch.cat((idx, idx_next), dim=1) # (B, T + 1)
    return idx

bigram_model = BigramLanguageModel()
logits, loss = bigram_model(xb, yb)
print(logits.shape)
print(loss)

torch.Size([256, 65])
tensor(4.5491, grad_fn=<NllLossBackward0>)


## Version 3: Model with Multi-head attention and anaother FF layer

### self-attention head

In [104]:
class SelfAttentionHead(nn.Module):
  def __init__(self, head_size):
    """self-attention head.

    there will be multiple heads. head_size * n_head = n_embed.
    """
    super().__init__()
    self.key = nn.Linear(n_embed, head_size, bias=False)
    self.query = nn.Linear(n_embed, head_size, bias=False)
    self.value = nn.Linear(n_embed, head_size, bias=False)
    # tril is not the parameter of the module, so we call it a buffer
    # (not a parameter)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

  def forward(self, x):
    B, T, C = x.shape
    k = self.key(x) # (B, T, head_size)
    q = self.query(x) # (B, T, head_size)

    wei = q @ k.transpose(-2, -1) * C ** -0.5# (B, T, T)
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1) # (B, T, T)
    v = self.value(x) # (B, T, head_size)
    out = wei @ v # (B, T, head_size)
    return out

### MultiHeadAttention

In [105]:
class MultiHeadAttention(nn.Module):
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([SelfAttentionHead(head_size) for _ in range(num_heads)])

  def forward(self, x):
    return torch.cat([h(x) for h in self.heads], dim=-1)

### A Feed-Forward layer

In [106]:
class FeedForward(nn.Module):
  def __init__(self, n_embed):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embed, n_embed),
        nn.ReLU(),
    )

  def forward(self, x):
    return self.net(x)

### model with multi-head attention

In [107]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

n_embed = 32

class BigramLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    # an embedding is a table that maps each token to a vector
    # nn.Embedding(n1, n2) maps a token whose value is up to n1 (starting from 0)
    # to a vector of size n2. Here we map each token to a vector of size vocab_size
    # (One example is one-hot encoding)
    self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
    self.position_embedding_table = nn.Embedding(block_size, n_embed)
    self.sa_head = MultiHeadAttention(4, n_embed // 4) # i.e. 4 heds of 8-dimensional self-attention
    self.ffwd = FeedForward(n_embed)
    # This is the output layer that maps the transformed embeddings back to the
    # original vocab_size vector - logits.
    self.lm_head = nn.Linear(n_embed, vocab_size)

  def forward(self, idx, targets=None):
    """A Forward proporgation.

    idx: the input. Dimension [B, T].
    targets: target. Dimension [B, T], [B, i]th element is the target of sequence
      idx[B, :i].
    """
    B, T = idx.shape

    # idx and targets are both (B, T), meaning (batch_size, block_size)
    token_embed = self.token_embedding_table(idx) # (B, T, C = n_embed)
    # pos_embed = self.position_embedding_table(torch.arange(T, device=device)) # (T, C)
    pos_embed = self.position_embedding_table(torch.arange(T)) # (T, C)

    x = token_embed + pos_embed
    x = self.sa_head(x)
    x = self.ffwd(x)
    logits = self.lm_head(x) # (B, T, C = vocab_size)
    # here is a bit tricky, please refer to
    # https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html
    # to understand what's in there
    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B * T, C)
      targets = targets.view(B * T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    """Generate the next token for max_new_tokens of times for the batch idx.

    idx: a batch of samples, of size (B, T).
    max_new_tokens: int, how many tokens to be generated given a sample.
    """
    for _ in range(max_new_tokens):
      # crop the input size so that it's at most the same as block_size when
      # generating
      idx_cond = idx[:, -block_size:]
      # get the predictions
      logits, loss = self(idx_cond) # logits: (B, T, C)
      # print(logits.shape)
      # Last one of the sequence
      logits = logits[:, -1, :] # (B, C)
      probs = F.softmax(logits, dim=-1) # (B, C)
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      idx = torch.cat((idx, idx_next), dim=1) # (B, T + 1)
    return idx

bigram_model = BigramLanguageModel()
logits, loss = bigram_model(xb, yb)
print(logits.shape)
print(loss)

torch.Size([256, 65])
tensor(4.2090, grad_fn=<NllLossBackward0>)


## Version 4.

### Multi-head attention with a projection

In [109]:
class MultiHeadAttention(nn.Module):
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([SelfAttentionHead(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embed, n_embed)

  def forward(self, x):
    attention_out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.proj(attention_out)
    return out

### FeedForward with projection and larger hidden dimension

In [110]:
class FeedForward(nn.Module):
  def __init__(self, n_embed):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embed, n_embed * 4),
        nn.ReLU(),
        nn.Linear(n_embed * 4, n_embed),
    )

  def forward(self, x):
    return self.net(x)

### Block of (self-attention and feed-forward)

In [108]:
class Block(nn.Module):
  def __init__(self, n_embed, n_head):
    super().__init__()
    head_size = n_embed // n_head
    self.sa = MultiHeadAttention(n_head, head_size=head_size)
    self.ffwd = FeedForward(n_embed)

  def forward(self, x):
    x = x + self.sa(x)
    x = x + self.ffwd(x)
    return x

### Model with multiple layers of (self-attention, feed-forward) and skipadd

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

n_embed = 32

class BigramLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    # an embedding is a table that maps each token to a vector
    # nn.Embedding(n1, n2) maps a token whose value is up to n1 (starting from 0)
    # to a vector of size n2. Here we map each token to a vector of size vocab_size
    # (One example is one-hot encoding)
    self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
    self.position_embedding_table = nn.Embedding(block_size, n_embed)
    self.blocks = nn.Sequential(
        Block(n_embed, 4),
        Block(n_embed, 4),
        Block(n_embed, 4),
    )
    # This is the output layer that maps the transformed embeddings back to the
    # original vocab_size vector - logits.
    self.lm_head = nn.Linear(n_embed, vocab_size)

  def forward(self, idx, targets=None):
    """A Forward proporgation.

    idx: the input. Dimension [B, T].
    targets: target. Dimension [B, T], [B, i]th element is the target of sequence
      idx[B, :i].
    """
    B, T = idx.shape

    # idx and targets are both (B, T), meaning (batch_size, block_size)
    token_embed = self.token_embedding_table(idx) # (B, T, C = n_embed)
    # pos_embed = self.position_embedding_table(torch.arange(T, device=device)) # (T, C)
    pos_embed = self.position_embedding_table(torch.arange(T)) # (T, C)

    x = token_embed + pos_embed
    x = self.blocks(x)
    logits = self.lm_head(x) # (B, T, C = vocab_size)
    # here is a bit tricky, please refer to
    # https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html
    # to understand what's in there
    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B * T, C)
      targets = targets.view(B * T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    """Generate the next token for max_new_tokens of times for the batch idx.

    idx: a batch of samples, of size (B, T).
    max_new_tokens: int, how many tokens to be generated given a sample.
    """
    for _ in range(max_new_tokens):
      # crop the input size so that it's at most the same as block_size when
      # generating
      idx_cond = idx[:, -block_size:]
      # get the predictions
      logits, loss = self(idx_cond) # logits: (B, T, C)
      # print(logits.shape)
      # Last one of the sequence
      logits = logits[:, -1, :] # (B, C)
      probs = F.softmax(logits, dim=-1) # (B, C)
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      idx = torch.cat((idx, idx_next), dim=1) # (B, T + 1)
    return idx

bigram_model = BigramLanguageModel()
logits, loss = bigram_model(xb, yb)
print(logits.shape)
print(loss)

torch.Size([256, 65])
tensor(4.2090, grad_fn=<NllLossBackward0>)


## Version 5.

### LayerNorm

In [None]:
# class LayerNorm(nn.Module):
#   def __init__(self, dim, eps=1e-5):
#     self.eps = eps
      # gamma and beta are trainable variables
#     self.gamma = torch.ones(dim)
#     self.beta = torch.ones(dim)

#   def forward(self, x):
#     xmean = x.mean(1, keepdim=True)
#     xvar = x.var(1, keepdim=True)
#     xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
#     out = self.gamma * xhat + self.beta
#     return out

#   def parameters(self):
#     return [self.gamma, self.beta]

### Multi-head attention with a projection

In [111]:
class MultiHeadAttention(nn.Module):
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([SelfAttentionHead(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embed, n_embed)

  def forward(self, x):
    attention_out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.proj(attention_out)
    return out

### FeedForward with projection and larger hidden dimension

In [113]:
class FeedForward(nn.Module):
  def __init__(self, n_embed):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embed, n_embed * 4),
        nn.ReLU(),
        nn.Linear(n_embed * 4, n_embed),
    )

  def forward(self, x):
    return self.net(x)

### Block of (self-attention and feed-forward)

In [116]:
class Block(nn.Module):
  def __init__(self, n_embed, n_head):
    super().__init__()
    head_size = n_embed // n_head
    self.sa = MultiHeadAttention(n_head, head_size=head_size)
    self.ffwd = FeedForward(n_embed)
    self.ln1 = nn.LayerNorm(n_embed)
    self.ln2 = nn.LayerNorm(n_embed)

  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x

### Model with multiple layers of (self-attention, feed-forward) and layernorm + skipadd

In [117]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

n_embed = 32

class BigramLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    # an embedding is a table that maps each token to a vector
    # nn.Embedding(n1, n2) maps a token whose value is up to n1 (starting from 0)
    # to a vector of size n2. Here we map each token to a vector of size vocab_size
    # (One example is one-hot encoding)
    self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
    self.position_embedding_table = nn.Embedding(block_size, n_embed)
    self.blocks = nn.Sequential(
        Block(n_embed, 4),
        Block(n_embed, 4),
        Block(n_embed, 4),
        nn.LayerNorm(n_embed),
    )
    # This is the output layer that maps the transformed embeddings back to the
    # original vocab_size vector - logits.
    self.lm_head = nn.Linear(n_embed, vocab_size)

  def forward(self, idx, targets=None):
    """A Forward proporgation.

    idx: the input. Dimension [B, T].
    targets: target. Dimension [B, T], [B, i]th element is the target of sequence
      idx[B, :i].
    """
    B, T = idx.shape

    # idx and targets are both (B, T), meaning (batch_size, block_size)
    token_embed = self.token_embedding_table(idx) # (B, T, C = n_embed)
    # pos_embed = self.position_embedding_table(torch.arange(T, device=device)) # (T, C)
    pos_embed = self.position_embedding_table(torch.arange(T)) # (T, C)

    x = token_embed + pos_embed
    x = self.blocks(x)
    logits = self.lm_head(x) # (B, T, C = vocab_size)
    # here is a bit tricky, please refer to
    # https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html
    # to understand what's in there
    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B * T, C)
      targets = targets.view(B * T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    """Generate the next token for max_new_tokens of times for the batch idx.

    idx: a batch of samples, of size (B, T).
    max_new_tokens: int, how many tokens to be generated given a sample.
    """
    for _ in range(max_new_tokens):
      # crop the input size so that it's at most the same as block_size when
      # generating
      idx_cond = idx[:, -block_size:]
      # get the predictions
      logits, loss = self(idx_cond) # logits: (B, T, C)
      # print(logits.shape)
      # Last one of the sequence
      logits = logits[:, -1, :] # (B, C)
      probs = F.softmax(logits, dim=-1) # (B, C)
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      idx = torch.cat((idx, idx_next), dim=1) # (B, T + 1)
    return idx

bigram_model = BigramLanguageModel()
logits, loss = bigram_model(xb, yb)
print(logits.shape)
print(loss)

torch.Size([256, 65])
tensor(4.2801, grad_fn=<NllLossBackward0>)


## Version 6. Some cosmetics and dropout

### LayerNorm

In [120]:
dropout = 0.2

In [121]:
# class LayerNorm(nn.Module):
#   def __init__(self, dim, eps=1e-5):
#     self.eps = eps
      # gamma and beta are trainable variables
#     self.gamma = torch.ones(dim)
#     self.beta = torch.ones(dim)

#   def forward(self, x):
#     xmean = x.mean(1, keepdim=True)
#     xvar = x.var(1, keepdim=True)
#     xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
#     out = self.gamma * xhat + self.beta
#     return out

#   def parameters(self):
#     return [self.gamma, self.beta]

### self-attention head with dropout

In [122]:
class SelfAttentionHead(nn.Module):
  def __init__(self, head_size):
    """self-attention head.

    there will be multiple heads. head_size * n_head = n_embed.
    """
    super().__init__()
    self.key = nn.Linear(n_embed, head_size, bias=False)
    self.query = nn.Linear(n_embed, head_size, bias=False)
    self.value = nn.Linear(n_embed, head_size, bias=False)
    # tril is not the parameter of the module, so we call it a buffer
    # (not a parameter)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    B, T, C = x.shape
    k = self.key(x) # (B, T, head_size)
    q = self.query(x) # (B, T, head_size)

    wei = q @ k.transpose(-2, -1) * C ** -0.5# (B, T, T)
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1) # (B, T, T)
    wei = self.dropout(wei)

    v = self.value(x) # (B, T, head_size)
    out = wei @ v # (B, T, head_size)
    return out

### Multi-head attention with a projection

In [123]:
class MultiHeadAttention(nn.Module):
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([SelfAttentionHead(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embed, n_embed)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    attention_out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.proj(attention_out)
    return out

### FeedForward with dropout

In [124]:
class FeedForward(nn.Module):
  def __init__(self, n_embed):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embed, n_embed * 4),
        nn.ReLU(),
        nn.Linear(n_embed * 4, n_embed),
        nn.Dropout(dropout)
    )

  def forward(self, x):
    return self.net(x)

### Block of (self-attention and feed-forward)

In [125]:
class Block(nn.Module):
  def __init__(self, n_embed, n_head):
    super().__init__()
    head_size = n_embed // n_head
    self.sa = MultiHeadAttention(n_head, head_size=head_size)
    self.ffwd = FeedForward(n_embed)
    self.ln1 = nn.LayerNorm(n_embed)
    self.ln2 = nn.LayerNorm(n_embed)

  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x

### Model with multiple layers of (self-attention, feed-forward) and layernorm + skipadd

In [127]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

n_embed = 32
n_head = 4
n_layer = 3

class GPT(nn.Module):
  def __init__(self):
    super().__init__()
    # an embedding is a table that maps each token to a vector
    # nn.Embedding(n1, n2) maps a token whose value is up to n1 (starting from 0)
    # to a vector of size n2. Here we map each token to a vector of size vocab_size
    # (One example is one-hot encoding)
    self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
    self.position_embedding_table = nn.Embedding(block_size, n_embed)
    self.blocks = nn.Sequential(*[Block(n_embed, n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embed) # final layer norm
    # This is the output layer that maps the transformed embeddings back to the
    # original vocab_size vector - logits.
    self.lm_head = nn.Linear(n_embed, vocab_size)

  def forward(self, idx, targets=None):
    """A Forward proporgation.

    idx: the input. Dimension [B, T].
    targets: target. Dimension [B, T], [B, i]th element is the target of sequence
      idx[B, :i].
    """
    B, T = idx.shape

    # idx and targets are both (B, T), meaning (batch_size, block_size)
    token_embed = self.token_embedding_table(idx) # (B, T, C = n_embed)
    # pos_embed = self.position_embedding_table(torch.arange(T, device=device)) # (T, C)
    pos_embed = self.position_embedding_table(torch.arange(T)) # (T, C)

    x = token_embed + pos_embed
    x = self.blocks(x)
    x = self.ln_f(x)
    logits = self.lm_head(x) # (B, T, C = vocab_size)
    # here is a bit tricky, please refer to
    # https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html
    # to understand what's in there
    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B * T, C)
      targets = targets.view(B * T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    """Generate the next token for max_new_tokens of times for the batch idx.

    idx: a batch of samples, of size (B, T).
    max_new_tokens: int, how many tokens to be generated given a sample.
    """
    for _ in range(max_new_tokens):
      # crop the input size so that it's at most the same as block_size when
      # generating
      idx_cond = idx[:, -block_size:]
      # get the predictions
      logits, loss = self(idx_cond) # logits: (B, T, C)
      # print(logits.shape)
      # Last one of the sequence
      logits = logits[:, -1, :] # (B, C)
      probs = F.softmax(logits, dim=-1) # (B, C)
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      idx = torch.cat((idx, idx_next), dim=1) # (B, T + 1)
    return idx

model = GPT()
logits, loss = model(xb, yb)
print(logits.shape)
print(loss)

torch.Size([256, 65])
tensor(4.2813, grad_fn=<NllLossBackward0>)


In [129]:
sum(p.numel() for p in model.parameters())

42369

# Training

In [36]:
optimizer = torch.optim.AdamW(bigram_model.parameters(), lr=1e-3)

In [48]:
print(xb[0], yb[0])

tensor([ 1, 44, 53, 56, 44, 43, 47, 58]) tensor([44, 53, 56, 44, 43, 47, 58, 10])


In [94]:
max_iters = 5000
eval_interval = 500
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'

eval_iters = 200
n_embed = 32

In [None]:
# To tell Pytorch that everything inside this function will not be used in
# loss.backward() so that it won't allocate any memory for the local variables
# for backproporgation.
@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out

In [43]:
batch_size = 32
for iter in range(max_iters):
  if iter % eval_iterval == 0:
    losses = estimate_loss()
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss
  logits, loss = bigram_model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2.4298441410064697
2.5773096084594727
2.533466100692749
2.5048553943634033
2.4899168014526367
2.427765130996704
2.636652708053589
2.505525588989258
2.5062665939331055
2.457545042037964
2.603576898574829
2.5009303092956543
2.4755256175994873
2.545457124710083
2.5184295177459717
2.499173402786255
2.469838857650757
2.496755361557007
2.495213508605957
2.5284054279327393
2.542431592941284
2.475693464279175
2.6100001335144043
2.449551820755005
2.4545693397521973
2.4120032787323
2.4530041217803955
2.5481057167053223
2.437422275543213
2.456456184387207
2.385481834411621
2.484210968017578
2.557079553604126
2.5067341327667236
2.4843406677246094
2.3583245277404785
2.4627528190612793
2.4859793186187744
2.553626537322998
2.516507625579834
2.4363224506378174
2.4653146266937256
2.4754550457000732
2.477795362472534
2.4188363552093506
2.500852346420288
2.5148911476135254
2.4518351554870605
2.4265778064727783
2.6080410480499268
2.573694705

# Eval

In [45]:
idx = torch.zeros((1, 1), dtype=torch.long)

print(decode(bigram_model.generate(idx, max_new_tokens=300)[0].tolist()))

torch.Size([1, 1, 65])
torch.Size([1, 2, 65])
torch.Size([1, 3, 65])
torch.Size([1, 4, 65])
torch.Size([1, 5, 65])
torch.Size([1, 6, 65])
torch.Size([1, 7, 65])
torch.Size([1, 8, 65])
torch.Size([1, 9, 65])
torch.Size([1, 10, 65])
torch.Size([1, 11, 65])
torch.Size([1, 12, 65])
torch.Size([1, 13, 65])
torch.Size([1, 14, 65])
torch.Size([1, 15, 65])
torch.Size([1, 16, 65])
torch.Size([1, 17, 65])
torch.Size([1, 18, 65])
torch.Size([1, 19, 65])
torch.Size([1, 20, 65])
torch.Size([1, 21, 65])
torch.Size([1, 22, 65])
torch.Size([1, 23, 65])
torch.Size([1, 24, 65])
torch.Size([1, 25, 65])
torch.Size([1, 26, 65])
torch.Size([1, 27, 65])
torch.Size([1, 28, 65])
torch.Size([1, 29, 65])
torch.Size([1, 30, 65])
torch.Size([1, 31, 65])
torch.Size([1, 32, 65])
torch.Size([1, 33, 65])
torch.Size([1, 34, 65])
torch.Size([1, 35, 65])
torch.Size([1, 36, 65])
torch.Size([1, 37, 65])
torch.Size([1, 38, 65])
torch.Size([1, 39, 65])
torch.Size([1, 40, 65])
torch.Size([1, 41, 65])
torch.Size([1, 42, 65])
t