In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
with open('shaytonat 1-3.txt', 'r') as f:
  text = f.read()
bag = list(set(text))
n_bag = len(bag)
print(f'Beliglar soni: {n_bag}')

encode = lambda s: [bag.index(l) for l in s]
decode = lambda ids: "".join([bag[id] for id in ids])

Beliglar soni: 93


In [4]:
n_data = len(text)
val_size = 0.1
n_train = int((1 - val_size) * n_data)
n_val = n_data - n_train

train_data = torch.tensor(encode(text[:n_train]), dtype=torch.int32)
val_data = torch.tensor(encode(text[n_train:]), dtype=torch.int32)
print("O'rgatuvchida: ", n_train)
print("Sinovda: ", n_val)

O'rgatuvchida:  1848219
Sinovda:  205358


In [39]:
batch_size = 32

block_size = 256
embed_dim = 1024
vocab_size = n_bag
num_heads = 16
num_blocks = 2

In [6]:
def get_batch(split='train'):
    data = train_data if split == 'train' else val_data
    xb = []
    yb = []
    for i in range(batch_size):
        idx = np.random.randint(0, len(data) - block_size - 1)
        xb.append(data[idx:idx+block_size])
        yb.append(data[idx+1:idx+block_size+1])
    
    xb = torch.stack(xb)
    yb = torch.stack(yb).to(torch.int64)

    return xb, yb

In [41]:
class Head(nn.Module):

    def __init__(self,
                 embed_dim,
                 head_size):
        super().__init__()
        self.head_size = head_size
        self.query = nn.Linear(embed_dim, head_size) # (B, T, head_size)
        self.key = nn.Linear(embed_dim, head_size)   # (B, T, head_size)
        self.value = nn.Linear(embed_dim, head_size) # (B, T, head_size)

        self.register_buffer('tril', torch.tril(torch.ones((block_size, block_size))))
    
    def forward(self, x):
        T = x.shape[1]

        q = self.query(x)
        k = self.key(x)
        v = self.value(x)

        # (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
        wei = q @ k.transpose(-2, -1)
        wei = wei / (self.head_size ** 0.5)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)

        # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
        out = wei @ v
        return out

class MultiHead(nn.Module):

    def __init__(self, 
                 embed_dim, 
                 head_size,
                 num_heads):
        super().__init__()
        self.heads = nn.ModuleList([Head(embed_dim, head_size) for _ in range(num_heads)])
    
    def forward(self, x):
        return torch.cat([head(x) for head in self.heads], dim=-1)

class Block(nn.Module):

    def __init__(self,
                 embed_dim,
                 num_heads):
        super().__init__()
        
        self.norm_layer_1 = nn.LayerNorm(embed_dim)
        self.multi_head = MultiHead(embed_dim, embed_dim // num_heads, num_heads)
        self.norm_layer_2 = nn.LayerNorm(embed_dim)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, embed_dim),
            nn.ReLU()
        )
    
    def forward(self, x):
        x = x + self.multi_head(self.norm_layer_1(x))
        x = x + self.ff(self.norm_layer_2(x))
        return x



class BigramLM(nn.Module):

    def __init__(self,
                 embed_dim,
                num_blocks,
                num_heads):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, 
                                                  embed_dim)
        self.postion_embedding_table = nn.Embedding(block_size, 
                                                  embed_dim)
        self.blocks = nn.Sequential(*[
            Block(embed_dim, num_heads) for _ in range(num_blocks)
        ])
        self.fc = nn.Linear(embed_dim, vocab_size)
    
    def forward(self, xb, yb=None):
        B, T = xb.shape
        # xb -> (batch_size, block_size) => (4, 8)
        # yb -> (batch_size, block_size) => (4, 8)
        # (batch_size, block_size, n_emb)
        # (4, 8, 32)
        token_emb = self.token_embedding_table(xb)
        postion_emb = self.postion_embedding_table(torch.arange(0, T, device=device))
        x = self.blocks(token_emb + postion_emb)
        # (4, 8, 93)
        logits = self.fc(x)
        
        if yb is not None:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            yb = yb.view(B*T)
            loss = F.cross_entropy(logits, yb)
        else:
            loss = None

        return logits, loss
    
    def generate(self, idx, max_new_token):
        for _ in range(max_new_token):
            # (batch_size, block_size, vocab_size)
            # (1, 8, 93)
            idx = idx[:, -block_size:]
            logits, _ = self(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)

            idx_next = torch.multinomial(probs, num_samples=1)
            
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [42]:
model = BigramLM(embed_dim,
                num_blocks,
                num_heads)
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-4)

In [54]:
n_steps = 1_000
for step in range(n_steps):
    xb, yb = get_batch('train')
    xb = xb.to(device)
    yb = yb.to(device)

    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if step % 100 == 0:
        print(f"Step {step+1}: {loss.item():.4f}")

Step 1: 1.5536
Step 101: 1.5696
Step 201: 1.4914
Step 301: 1.5319
Step 401: 1.5049
Step 501: 1.5266
Step 601: 1.5345
Step 701: 1.5026
Step 801: 1.5092
Step 901: 1.4635


In [44]:
num_params = sum([param.numel() for param in model.parameters()])
num_params

8857693

In [57]:
idx = torch.tensor([encode("— Уйда ким бор? — деб сўради паст бўйли киши.")], 
                   dtype=torch.long, 
                   device=device)
gen_idx = model.generate(idx, max_new_token=500)
print(decode(gen_idx[0]))

отиб кўтавангизни балхонага чиқариб юборга тидираб. —
— Эшийлик, озгина шаҳар эмас, бўлса ҳам, биларми, қандошга соломдан провозига ҳам
дирровнича чотирмоқчи кулганда ўзи очиқларимга баравада Памжурхон ёрухда қилмаган эди.
Уй пўйланга нонавб онадиганинг дав


In [18]:
torch.manual_seed(42)

B, T, C = 4, 8, 2
x = torch.randn(B, T, C)

tril = torch.tril(torch.ones((T, T)))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=1)

# Scaled Dot-Product Attention

In [29]:
torch.manual_seed(42)

B, T, C = 4, 8, 2
head_size = 16

x = torch.randn(B, T, C)

query = nn.Linear(C, head_size) # (B, T, head_size)
key = nn.Linear(C, head_size)   # (B, T, head_size)
value = nn.Linear(C, head_size) # (B, T, head_size)

q = query(x)
k = key(x)
v = value(x)

# (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
wei = q @ k.transpose(-2, -1)
wei = wei / (head_size ** 0.5)

tril = torch.tril(torch.ones((T, T)))
# wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

# (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
out = wei @ v

In [78]:
a = torch.randn(size=(4, 4))
b = torch.randn(size=(4, 4))

c = a @ b / (4 ** 0.5)
torch.var(c)

tensor(1.2606)