<a href="https://colab.research.google.com/github/toussaintma/neuralnetworksfromzerotohero/blob/main/walkthrough_makemore_gpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# data and code at https://github.com/karpathy/makemore
# NanoGPT code at https://github.com/karpathy/nanoGPT
# course at https://www.youtube.com/watch?v=PaCmpygFfXo&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=2&t=11s
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# Let's build GPT: from scratch, in code, spelled out
# GPT 3 paper: https://arxiv.org/pdf/2005.14165.pdf
# Attention is all you need paper: https://arxiv.org/abs/1706.03762


--2023-07-20 08:13:45--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2023-07-20 08:13:45 (113 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [5]:
# Read the whole downloaded corpus (input.txt) into a single in-memory string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
  text = corpus_file.read()

print('Length of dataset in characters: ', len(text))

Length of dataset in characters:  1115394


In [6]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
chars = sorted(set(text))
vocab = ''.join(chars)
vocab_size = len(chars)
print(f'Vocabulary of size {vocab_size}: {vocab}')

Vocabulary of size 65: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [7]:
# Lookup tables between characters and their integer ids (position in `chars`).
itos = dict(enumerate(chars))
stoi = {ch: ix for ix, ch in itos.items()}

def encode(x):
  """Return the list of integer ids for the characters of string `x`."""
  return [stoi[ch] for ch in x]

def decode(x):
  """Return the string spelled by the iterable of integer ids `x`."""
  return ''.join(itos[ix] for ix in x)

# Round-trip sanity check in both directions.
stoi[itos[13]], decode(encode('ABC'))

(13, 'ABC')

In [8]:
import torch
# Encode the full corpus once and keep it as a 1-D tensor of 64-bit token ids.
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.shape, data.dtype)

torch.Size([1115394]) torch.int64


In [9]:
# First 90% of the token stream is used for training, the remainder for evaluation.
n = int(0.90 * len(data))
train_data, eval_data = data[:n], data[n:]

In [10]:
# Number of tokens in one context window.
block_size = 8
# Show block_size + 1 tokens: block_size inputs plus the one extra token
# needed as the target of the last position.
train_data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [11]:
# y is x shifted left by one token, so y[t] is the token that follows x[:t+1].
x, y = train_data[:block_size], train_data[1:block_size + 1]
for t in range(block_size):
  context, target = x[:t + 1], y[t]
  print(f'when input i {context} the target is {target}')

when input i tensor([18]) the target is 47
when input i tensor([18, 47]) the target is 56
when input i tensor([18, 47, 56]) the target is 57
when input i tensor([18, 47, 56, 57]) the target is 58
when input i tensor([18, 47, 56, 57, 58]) the target is 1
when input i tensor([18, 47, 56, 57, 58,  1]) the target is 15
when input i tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
when input i tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


In [20]:
torch.manual_seed(1337)
block_size = 8
batch_size = 4

def get_data(split):
  """Sample a random batch of (input, target) windows.

  `split` selects the source tensor: 'train' uses `train_data`, any other
  value uses `eval_data`. Reads the module-level `batch_size` and
  `block_size` at call time.

  Returns a pair (x, y), each of shape (batch_size, block_size), where y is
  x shifted one position to the right in the source tensor.
  """
  source = train_data if split == 'train' else eval_data
  # Start offsets are drawn so that the target window still fits in `source`.
  starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
  x = torch.stack([source[s:s + block_size] for s in starts])
  y = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
  return x, y

Xb, Yb = get_data('train')

### Bigram Model

In [67]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Reseed the global PyTorch RNG before the random operations that follow.
torch.manual_seed(1337)

# Embedding width, set equal to the vocabulary size.
n_embedding = vocab_size

class BigramLanguageModel(nn.Module):
  """Bigram language model: the next-token logits depend only on the current token.

  The model is a single (vocab_size x vocab_size) embedding table; row i holds
  the logits over the token that follows token i.
  """

  def __init__(self, vocab_size):
    """Create the lookup table for a vocabulary of `vocab_size` tokens."""
    super().__init__()
    # The embedding width must equal vocab_size because each row is used
    # directly as logits over the vocabulary. Using the constructor argument
    # (not a module-level global) keeps the class self-contained.
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    """Compute logits and, when targets are given, the cross-entropy loss.

    idx:     (B, T) tensor of integer token ids.
    targets: optional (B, T) tensor of integer token ids.

    Returns (logits, loss). Without targets, logits is (B, T, C) and loss is
    None. With targets, logits is flattened to (B*T, C) and loss is a scalar.
    """
    logits = self.token_embedding_table(idx) # Batch * Time * Channel

    if targets is None:
      loss = None
    else:
      # F.cross_entropy expects (N, C) logits and (N,) targets.
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_token):
    """Extend `idx` by `max_new_token` sampled tokens.

    idx: (B, T) tensor of integer token ids used as the starting context.

    Returns a (B, T + max_new_token) tensor: the context followed by the samples.
    """
    for _ in range(max_new_token):
      # No targets are passed, so logits keep their (B, T, C) shape.
      logits, _ = self(idx)
      logits = logits[:, -1, :] # B C, only the last time step is needed
      probs = F.softmax(logits, dim=-1) # B C
      pred = torch.multinomial(probs, num_samples=1) # B 1
      idx = torch.cat((idx, pred), dim=1) # B T+1
    return idx

# Build the model and run one forward pass on the current batch.
model = BigramLanguageModel(vocab_size)
logits, loss = model(Xb, Yb)
Xb.shape, logits.shape, loss

# Sample 100 tokens from the untrained model, starting from a single token with id 0.
idx = model.generate(torch.zeros((1, 1), dtype=torch.long), max_new_token=100)
idx = idx[0].tolist()
print(decode(idx))




Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [68]:
# AdamW over all parameters of `model`, learning rate 1e-3.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [75]:
# get_data reads the module-level batch_size at call time, so this enlarges
# every batch drawn in the loop below.
batch_size = 32

for step in range(10000):
  Xb, Yb = get_data('train')
  # Call the module itself rather than .forward(): nn.Module.__call__ runs any
  # registered hooks, which a direct .forward() call silently skips.
  logits, loss = model(Xb, Yb)
  # Clear gradients from the previous step before accumulating new ones.
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
# Loss of the final mini-batch only (not an average over the run).
print(loss.item())


2.495130777359009


In [77]:
#print(decode(model.generate(torch.zeros((1,1), dtype=torch.long), max_new_token=400)[0].tolist()))

### Self-attention trick

In [79]:
# Seed the RNG by *calling* torch.manual_seed. Assigning to it
# (`torch.manual_seed = 1337`) would replace the function with an int, seed
# nothing, and break every later torch.manual_seed(...) call in this kernel.
torch.manual_seed(1337)
# Toy tensor for the self-attention examples: batch, time, channels.
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [85]:

# xbow[b, t] is the mean of x[b, 0..t]: each position averages itself and
# everything before it, never anything after.
xbow = torch.zeros(B, T, C)
for b in range(B):
  for t in range(T):
    xprev = x[b, :t + 1]  # (t+1, C)
    xbow[b, t] = xprev.mean(dim=0)  # (C,)

#print(x[0])
#print(xbow[0])

In [87]:
# 3x3 lower-triangular matrix of ones (zeros above the diagonal).
a = torch.ones(3, 3).tril()
a


tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])