In [110]:
# let's now encode the entire text dataset and store it into a torch.Tensor
import torch # we use PyTorch: https://pytorch.org
import torch.nn as nn
from torch.nn import functional as F

In [111]:
# Hyperparameters
batch_size = 32  # how many independent sequences are processed in parallel
block_size = 8  # maximum context length for predictions
max_iters = 5000  # number of optimizer steps in the training loop
eval_interval = 500  # report train/val loss every this many steps
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200  # batches averaged per split when estimating the loss
n_embed = 32  # embedding width; this is the name the model classes read
n_emd = n_embed  # original (misspelled) name, kept as an alias for any code still using it
# --------------


torch.manual_seed(1337)


# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# read it in to inspect it
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()


# Character vocabulary: every distinct character in the text, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer ids
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Encoder: take a string, return the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer ids, return the corresponding string."""
    return ''.join(itos[i] for i in l)


# Encode the whole text once, then cut it into train and validation parts
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * data.size(0))  # first 90% is train, the remainder is val
train_data, val_data = data[:n], data[n:]

# data loading
def get_batch(split):
    """Sample a random batch of inputs and next-token targets.

    Args:
        split: 'train' selects train_data; any other value selects val_data.

    Returns:
        (x, y), each stacked from batch_size windows of block_size tokens and moved
        to `device`; y is x shifted one position to the right in the source data.
    """
    source = train_data if split == 'train' else val_data
    # one random window start per batch element
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

@torch.no_grad()
def estimate_loss():
    """Average the model's loss over eval_iters random batches for each split.

    The model is put in eval mode for the measurement and switched back to
    train mode before returning.

    Returns:
        dict with keys 'train' and 'val', each holding the mean loss as a tensor.
    """
    model.eval()
    out = {}
    for split in ('train', 'val'):
        # model(...) returns (logits, loss); only the scalar loss is kept
        batch_losses = [model(*get_batch(split))[1].item() for _ in range(eval_iters)]
        out[split] = torch.tensor(batch_losses).mean()
    model.train()
    return out

--2025-11-06 02:51:50--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8002::154, 2606:50c0:8003::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8002::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.19’


2025-11-06 02:51:51 (9.66 MB/s) - ‘input.txt.19’ saved [1115394/1115394]



In [112]:
# Sample a batch here so this cell runs on a fresh kernel; xb is otherwise only
# assigned later, inside the training loop.
xb, yb = get_batch('train')
print(xb) # our input to the transformer

tensor([[ 6,  1, 57, 61, 53, 56, 52,  6],
        [61,  1, 58, 53,  5, 58, 12,  0],
        [58, 46, 39, 58,  1, 47, 57,  1],
        [32, 47, 58, 59, 57,  1, 24, 39],
        [52, 53, 40, 50, 43, 57,  8,  0],
        [51, 43, 12,  0,  0, 22, 33, 24],
        [47, 58, 47, 53, 52,  6,  1, 44],
        [46, 43, 56, 43,  1, 52, 53, 61],
        [42,  1, 57, 39, 63, 57,  1, 46],
        [ 1, 57, 43, 43,  1, 58, 46, 43],
        [42,  0, 32, 53,  1, 50, 47, 44],
        [58, 63,  8,  0,  0, 22, 33, 24],
        [58, 43, 41, 58, 53, 56, 12,  1],
        [ 1, 58, 46, 43,  1, 50, 39, 61],
        [46, 39, 56, 42,  6,  1, 24, 53],
        [10,  0, 21,  1, 61, 47, 50, 50],
        [58, 46, 43,  1, 39, 52, 58, 47],
        [33, 25, 21, 27, 10,  0, 13,  1],
        [43, 50, 47, 49, 43,  6,  1, 39],
        [55, 59, 43, 43, 52,  6,  1, 63],
        [43, 52, 58,  8,  0,  0, 18, 30],
        [56, 12,  0,  0, 31, 13, 25, 28],
        [53, 56, 58,  1, 46, 47, 57,  1],
        [46, 39, 58,  1, 60, 53, 4

In [121]:

# --- HEAD MODULE ---
class Head(nn.Module):
    """A single head of causal self-attention."""

    def __init__(self, head_size):
        super().__init__()
        # bias-free projections from the embedding width down to head_size
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        # lower-triangular ones matrix, stored as a buffer (not a parameter), used as the causal mask
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

    def forward(self, x):
        """Map x of shape (B, T, C) to attention output of shape (B, T, head_size)."""
        _, seq_len, _ = x.shape

        queries = self.query(x)  # (B, T, head_size)
        keys = self.key(x)       # (B, T, head_size)
        values = self.value(x)   # (B, T, head_size)

        # scaled dot-product scores between every pair of positions
        scale = keys.shape[-1] ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale  # (B, T, T)

        # positions above the diagonal (the future) get -inf, so softmax gives them weight 0
        is_future = self.tril[:seq_len, :seq_len] == 0
        weights = F.softmax(scores.masked_fill(is_future, float('-inf')), dim=-1)

        # weighted aggregation of the values
        return weights @ values  # (B, T, head_size)


# --- MULTI-HEAD ATTENTION ---
class MultiHeadAttention(nn.Module):
    """Several self-attention heads run side by side on the same input."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(heads)

    def forward(self, x):
        """Run every head on x and join their outputs along the last dimension."""
        per_head = [head(x) for head in self.heads]
        return torch.cat(per_head, dim=-1)


# --- BIGRAM LANGUAGE MODEL ---
class BigramLanguageModel(nn.Module):
    """Character-level language model: token + position embeddings, one layer of
    multi-head self-attention (4 heads), then a linear projection to vocabulary logits.

    Despite the name, predictions are conditioned on up to block_size previous
    tokens through the attention layer, not only on the last token.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.sa_heads = MultiHeadAttention(4, n_embed // 4)  # 4 heads, concatenated back to n_embed
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token ids; T must not exceed block_size because the
                position table only has block_size rows.
            targets: optional (B, T) tensor of target token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab_size) and loss is
            None. With targets, logits is flattened to (B*T, vocab_size) and loss is
            the cross-entropy over all B*T positions.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B, T, n_embed)
        # Build the position indices on the input's own device rather than the global
        # `device`, so the model keeps working wherever it and its inputs are placed.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)  # (T, n_embed)
        x = tok_emb + pos_emb  # (B, T, n_embed), pos_emb broadcasts over the batch

        x = self.sa_heads(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N, C) logits and (N,) targets: flatten batch and time
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Extend idx by max_new_tokens sampled tokens.

        Args:
            idx: (B, T) tensor of token ids used as the starting context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the input followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]  # crop context to the last block_size tokens
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]  # keep only the last time step
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [122]:
model = BigramLanguageModel()
m = model.to(device)
print(sum(p.numel() for p in m.parameters())/1e6, ' M parameters')
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)


# The loop variable is `step`, not `iter`, so the builtin iter() is not shadowed
# for the rest of the notebook.
for step in range(max_iters):

    # every eval_interval steps, and on the final step, report the averaged losses
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then clear old gradients, backpropagate, and update the weights
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model, starting from a single token with id 0
context = torch.zeros((1,1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

0.007553  M parameters
stepp 0: trian loss 4.1452, val loss 4.1477
stepp 500: trian loss 2.6505, val loss 2.6823
stepp 1000: trian loss 2.4924, val loss 2.5142
stepp 1500: trian loss 2.4330, val loss 2.4183
stepp 2000: trian loss 2.3832, val loss 2.3853
stepp 2500: trian loss 2.3376, val loss 2.3436
stepp 3000: trian loss 2.3094, val loss 2.3323
stepp 3500: trian loss 2.2821, val loss 2.3085
stepp 4000: trian loss 2.2791, val loss 2.3018
stepp 4500: trian loss 2.2468, val loss 2.3002
stepp 4999: trian loss 2.2536, val loss 2.2841

A:
Whame bres shing their ing lay tiths teray, all tawiu of ownearsces?

Thishe youge: thrint on loome you himammene surd I cour nons ho oforme ay to feor the onit alll ris be
you me dis my, komy some dearer Grwand,
An theasres of kprowngce thou, of hesh is's, hun on faul yrow dantc; tomes but thy hin nid thes:
Beshership bre angch erar wor
aughd, now'd Dou wis sith wooonk!
Wa hersour I haveentr! Cothe to fron haven noud lard:
ArPENCIO:
Cout ont jithe havet c

In [None]:
# Standalone example -- version 4: a single head of self-attention
torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value  = nn.Linear(C, head_size, bias=False)
k = key(x) # B, T, 16
q = query(x) # B, T, 16
wei = q @k.transpose(-2,-1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

tril = torch.tril((torch.ones(T,T)))
#wei = torch.zeros((T,T))
# Causal mask, left disabled here so every position attends to every other one.
# Uncomment it for a decoder, where a position must not see the future:
# wei = wei.masked_fill(tril == 0, float('-inf'))

wei = F.softmax(wei, dim=-1)


v = value(x) # B, T, 16
# Aggregate the projected values, not the raw inputs: with `wei @ x` the value
# projection was never used and the output came out as (B, T, C) instead of (B, T, head_size).
out = wei @ v
out.shape

v