## [Video Link](https://www.youtube.com/watch?v=kCc8FmEb1nY&t=477s)

In [279]:
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -O ../data/raw/input.txt

In [9]:
import torch
import torch.nn as nn
# autorefresh
%load_ext autoreload
%autoreload 2

In [281]:
with open('../data/raw/input.txt', 'r') as f:
    text = f.read()

In [282]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [3]:
def test_func(abc, **kwargs):
    print(abc)

test_func(**{'abc': 1, 'def': 2})

1


In [283]:
encode_mapping = {c: i for i, c in enumerate(chars)}
decode_mapping = {i: c for i, c in enumerate(chars)}
encoder = lambda x: [encode_mapping[s] for s in x]
decoder = lambda x: ''.join([decode_mapping[s] for s in x])

In [284]:
device='mps'
token_embeddings = nn.Embedding(vocab_size, 10, device=device)(x)
pos_embeddings = nn.Embedding(8, 10, device=device)(torch.tensor([1, 2, 3, 4, 5, 6, 7, 0], device=device))

In [285]:
token_embeddings.shape
pos_embeddings.shape
(token_embeddings+pos_embeddings).shape

torch.Size([4, 8, 10])

In [287]:
import torch
# Not sure why but for the Bigram mps is significantly slower than cpu
device='mps'
data = torch.tensor(encoder(text), dtype=torch.long, device=device)


In [288]:
# train test split
n = int(0.9*len(data))
train = data[:n]
test = data[n:]

In [289]:
torch.manual_seed(1337)
block_size = 8
batch_size = 4

def get_batch(data, batch_size):
    data_slice = torch.randint(0, len(data)-block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in data_slice])
    y = torch.stack([data[i+1:i+block_size+1] for i in data_slice])
    return  x, y

x, y = get_batch(train, batch_size)
print('X:',x.shape,':\n', x)
print('Y:',y.shape,':\n', y)

X: torch.Size([4, 8]) :
 tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]], device='mps:0')
Y: torch.Size([4, 8]) :
 tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]], device='mps:0')


### Simplist Bigram model which only uses two words

In [323]:

from torch.nn import functional as F
# Bigram is just pairwise model
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size) -> None:
        super().__init__()
        # Not sure why he uses vocab_size as the embedding dim, maybe just convenient? It's usually multiple of 2
        self.embedding = nn.Embedding(vocab_size, vocab_size, device=device)

    def forward(self, x, y=None):
        logits = self.embedding(x) # (Batch, Time, Channels) Time=word sequence, Channels=Embedding
        # This logic seems quite confusing, is there anyway to optimize this?
        if y is None:
            loss=None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T, C)
            y = y.view(-1)
            # Lesson, check documentation and see if order is as intended
            loss = F.cross_entropy(logits, y)
        return logits, loss
    
    def generate(self, x, max_tokens=100):
        # output depends on input's shape, so if input shape is (1, T), output will be (1, T, C)
        for i in range(max_tokens):
            logits, loss = self(x)
            # print(f'logits.shape is {logits.shape}')
            # Due to weird forward logic, when generating, logits is not reshaped
            generated = logits[:,-1,:] # B,C
            probs = nn.Softmax(dim=-1)(generated)
            next_tokens = torch.multinomial(probs, 1)
            print(next_tokens.shape)
            x = torch.cat([x, next_tokens], dim=1)
        return x
    
model = BigramLanguageModel(vocab_size).to(device)
out, loss = model(x, y)
print(loss)

tensor(4.7311, device='mps:0', grad_fn=<NllLossBackward0>)


In [325]:
x.shape

torch.Size([32, 8])

In [330]:
torch.stack([x, x], dim=2).shape

torch.Size([32, 8, 2])

In [331]:
torch.cat([x, x], dim=1).shape

torch.Size([32, 16])

In [324]:
temp_list = [x,x]

In [None]:
torch.cat([x])

In [307]:
learning_rate = 0.01
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
n_epochs= 3
batch_size = 32
n_steps = 1000

# TODO: understand better here, does the order matter?
for epoch in range(n_epochs):
    for _ in range(n_steps):
        x, y = get_batch(train, batch_size)
        _, loss = model(x, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch}, Loss: {loss.item()}')

Epoch 0, Loss: 2.4873929023742676
Epoch 1, Loss: 2.4554905891418457
Epoch 2, Loss: 2.4300172328948975


In [316]:
torch.split(torch.tensor([1,2,3,4,5,6,7,8,9,10]), [1,9], dim=0).cat()

(tensor([1]), tensor([ 2,  3,  4,  5,  6,  7,  8,  9, 10]))

In [318]:
model.generate(torch.zeros(1,1, dtype=torch.long, device=device), max_tokens=100)

torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([

tensor([[ 0, 46,  6, 15, 21, 24, 25, 58, 62,  5, 10, 19, 19, 61, 22, 16, 50, 10,
         50, 63, 62, 26, 25, 51,  2, 48,  6, 45, 14, 51, 12, 14, 51, 41, 45, 45,
         13, 36, 18, 41, 32, 37, 18,  4, 30,  2, 58, 55, 39, 41, 25, 59, 38, 28,
          7, 19, 50, 47, 49,  4, 21,  6,  7,  5, 29, 10, 19,  2, 34,  4, 28, 24,
         62, 39, 47, 61, 54, 58, 33, 55, 33, 59, 15, 34, 54, 24, 43, 56, 56, 63,
         21, 60, 61,  8, 47, 32,  0, 33,  9,  0,  0]], device='mps:0')

In [309]:
print(decoder(model.generate(torch.zeros(1,1, dtype=torch.long, device=device), max_tokens=100)[0].tolist()))

UnboundLocalError: local variable 'logits' referenced before assignment

### Transformer

In [107]:
torch.manual_seed(1337)
B,T,C = 4, 8, 2
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [144]:
# xbow = torch.zeros(B,T,C)
# for b in range(B):
#     for t in range(T):
#         x_prev = x[b, :t+1] # t,C
#         xbow[b,t] = x_prev.mean(axis=0) # bag of words
    

In [146]:
tri = torch.tril(torch.ones(T,T))
# Don't forget normalization 
tri_norm = tri / tri.sum(axis=1, keepdim=True) # notice the keepdim
xbow = tri_norm @ xbow

In [184]:
# Method 3
tri = torch.tril(torch.ones(T,T))
wei = torch.zeros(T,T)
masked = wei.masked_fill(tri==0, float('-inf'))
tri_norm = torch.softmax(masked, dim=-1)
xbow = tri_norm @ xbow

In [185]:
tri_norm

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [221]:
from multiprocessing import Value


B = 4
T = 8
C = 32
# Self attention mechanism
head_size = 16


x = torch.randn(B,T,C)
# ask
query = nn.Linear(C, head_size, bias=False) # B,T,C
# look more into it
key = nn.Linear(C, head_size, bias=False) # B,T,C
# intuition: simply linearly transformed raw x
Value = nn.Linear(C, head_size, bias=False) # B,T,C

k = key(x) # B,T,H
q = query(x) # B,T,H 

q@k.transpose(-2, -1)

tensor([[[ 1.7293,  1.7776,  0.4130,  0.9263, -2.6684, -0.0425,  3.4025,
          -1.1335],
         [-1.1056, -2.0497,  0.5517, -3.0066,  1.7959,  1.2291,  0.6362,
           1.8547],
         [-0.2204,  0.9991, -0.4337, -0.4233,  1.1247, -0.4147, -1.4429,
           0.3976],
         [ 0.9109, -1.5431, -2.4327,  0.3213, -0.3392, -0.1686, -0.0611,
          -0.0707],
         [-2.1553, -2.0961,  1.0414, -0.2077,  1.1906,  0.2539, -1.1959,
           0.7533],
         [ 2.6176,  1.4151, -1.1325, -0.8985,  1.3273, -1.0891, -1.8364,
          -0.2618],
         [ 1.4695,  0.8811, -0.9591, -1.6081, -0.8878,  1.7417, -0.2726,
          -1.3540],
         [-0.6524, -0.4305, -0.8854, -1.2430,  0.5488,  0.3144,  0.7821,
          -0.7590]],

        [[ 1.3132, -1.7015, -0.4501,  0.8765,  0.8277,  0.1446, -1.0251,
           0.4049],
         [ 0.2882, -0.9414,  1.0976, -0.0939,  2.0157,  2.1101,  2.6527,
          -2.2760],
         [ 1.4726, -0.7085, -1.7851,  0.6409, -1.5864,  0.9529, -5.8

In [204]:
k.weight.shape

torch.Size([8, 8])

In [267]:
model.parameters

<bound method Module.parameters of BigramLanguageModel(
  (embedding): Embedding(65, 65)
)>

In [336]:
torch.manual_seed(1337)
learning_rate = 1e-3
n_epochs= 10
n_steps = 1000
batch_size = 32
block_size = 8
device='cpu'
n_embed = 32
head_size = 16
n_attentions = 8

class SelfAttention(nn.Module):
    def __init__(self, n_embed, head_size) -> None:
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, device=device)
        self.query = nn.Linear(n_embed, head_size, device=device)
        self.value = nn.Linear(n_embed, head_size, device=device)
        
    def forward(self, x):
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)
        # Weight is the key to self attention mechanism, essentially which previous key is more relevant to current k
        weight = q@k.transpose(-2,-1)* head_size**-0.5 # B,T,H @ B,H,T -> B,T,T

        # apply triangle mask, note the order of things. I.e. when to apply mask, softmax and multply by v at the end
        # TODO: the implementation here is different from Andrej's, would this screw things up downstream?
        tri = torch.tril(torch.ones(block_size,block_size, device=device))
        weight = weight.masked_fill(tri==0, float('-inf'))
        weight = torch.softmax(weight, dim=-1)
        weight = weight@v # B,T,T @ B,T,H -> B,T,H
        return weight

class Transformer(nn.Module):
    def __init__(self, vocab_size) -> None:
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embed, device=device)
        self.pos_embedding = nn.Embedding(block_size, n_embed, device=device)
        self.self_attention = SelfAttention(n_embed, head_size)
        self.fc = nn.Linear(head_size*n_attentions, vocab_size, device=device)

    def forward(self, x, y=None):
        char_embedding_layer = self.token_embedding(x) # (Batch, Time, Channels) Time=word sequence, Channels=embed_size
        pos_embedding_layer = self.pos_embedding(torch.arange(block_size, device=device)) # (T, C)
        x = char_embedding_layer + pos_embedding_layer
        # TODO: separate multi headed attention to a separate class, and see why I'm not reaching 3.2 in error
        weight = torch.cat([self.self_attention(x) for i in range(n_attentions)],dim=-1) # B,T,H*N_attentions -> B,T,H*N_attentions
        logits = self.fc(weight) # B,T,T @ B,T,H -> B,T,H
        # TODO: normalization
        
        # When generating, y is None
        if y is None:
            loss=None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T, C)
            y = y.view(-1)
            # Lesson, check documentation and see if order is as intended
            loss = F.cross_entropy(logits, y)
        return logits, loss
    
    def generate(self, x, max_tokens=100):
        for i in range(max_tokens):
            idx_block = x[:,-block_size:]
            logits, loss = self(idx_block)
            # Due to weird forward logic, when generating, logits is not reshaped
            generated = logits[:,-1,:] # B,C
            probs = nn.Softmax(dim=-1)(generated)
            next_tokens = torch.multinomial(probs, 1)
            x = torch.cat([x, next_tokens], dim=1)
        return x
    

In [337]:
model = Transformer(vocab_size)

In [338]:
model.parameters

<bound method Module.parameters of Transformer(
  (token_embedding): Embedding(65, 32)
  (pos_embedding): Embedding(8, 32)
  (self_attention): SelfAttention(
    (key): Linear(in_features=32, out_features=16, bias=True)
    (query): Linear(in_features=32, out_features=16, bias=True)
    (value): Linear(in_features=32, out_features=16, bias=True)
  )
  (fc): Linear(in_features=128, out_features=65, bias=True)
)>

In [34]:
from dataclasses import dataclass

@dataclass(kw_only=True)
class Config:
    learning_rate:float = 5e-4
    n_epochs:int = 5
    n_steps:int = 1000
    batch_size:float = 64
    block_size = 64
    device='mps'
    n_embed = 384
    n_heads = 8
    head_size = n_embed//n_heads
    n_blocks = 3
    # Accoridng to paper, output of each sub-layer, before it is added to the
    # sub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the positional encodings
    dropout = 0.1

In [35]:
config = Config(batch_size=32)
config

Config(learning_rate=0.0005, n_epochs=5, n_steps=1000, batch_size=32)

In [75]:
import yaml
test = yaml.load(open('../config/default_config.yaml', 'r'), Loader=yaml.FullLoader)
test

{'learning_rate': '5e-4',
 'n_epochs': 5,
 'n_steps': 1000,
 'batch_size': 64,
 'block_size': 64,
 'device': 'mps',
 'n_embed': 384,
 'n_heads': 8,
 'n_blocks': 3,
 'dropout': 0.1}

In [14]:
from src.utils.config import Config
config = Config()
config.__dict__

{'learning_rate': 0.0005,
 'n_epochs': 5,
 'n_steps': 1000,
 'batch_size': 64,
 'block_size': 64,
 'device': 'mps',
 'n_embed': 384,
 'n_heads': 8,
 'n_blocks': 3,
 'dropout': 0.1,
 'head_size': 48}

In [57]:
from math import e


def test_match(match):
    match match:
        case 1:
            print('1')
        case 2:
            print('2')
        case whatever:
            print('else')

test_match(3)

else


In [50]:
'a' | 'b'

TypeError: unsupported operand type(s) for |: 'str' and 'str'

In [37]:
config.__dict__

{'learning_rate': 0.0005, 'n_epochs': 5, 'n_steps': 1000, 'batch_size': 32}

In [16]:
from sympy import per


@dataclass
class Person:
    name: str = 'John Doe'
    age: int = 30
    city: str = 'New York'

person = Person()
person

Person(name='John Doe', age=30, city='New York')

In [7]:
from torch import dropout


config = Config()
config.dropout

  device: torch.device = torch.device(torch._C._get_default_device()),  # torch.device('cpu'),


NameError: name 'Config' is not defined

In [8]:
config = Config()

NameError: name 'Config' is not defined

In [16]:
config

Config()