<a href="https://colab.research.google.com/github/tanyag/tiny_llm_colab/blob/main/mini_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
import torch
import torch.nn as nn
import torch.nn.functional as F


Read File

In [43]:
with open('/content/sample_data/sample_text.txt', 'r') as f:
    text = f.read()

In [44]:
# from google.colab import drive
# drive.mount('/content/drive')

Tokenizer

In [45]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for ch, i in stoi.items()}

def encode(s): return [stoi[c] for c in s]
def decode(t): return ''.join([itos[i] for i in t])
data = torch.tensor(encode(text), dtype=torch.long)


🧪 4. Train/Validation Split


In [46]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]


⚙️ 5. Hyperparameters


In [47]:
block_size = 64   # max context length
batch_size = 32
n_embed = 64
n_head = 4
n_layer = 2
dropout = 0.1
device = 'cuda' if torch.cuda.is_available() else 'cpu'


✅ Step 2: Define the Transformer Model (in Colab)

🔹 1. Embedding + Positional Encoding

In [48]:
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, n_embed):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embed)
        self.position_embedding = nn.Embedding(block_size, n_embed)

    def forward(self, x):
        B, T = x.shape
        token_emb = self.token_embedding(x)               # (B, T, n_embed)
        pos_emb = self.position_embedding(torch.arange(T, device=x.device))  # (T, n_embed)
        return token_emb + pos_emb  # (B, T, n_embed)


🔹 2. Single Head of Self-Attention

In [49]:
class SelfAttentionHead(nn.Module):
    def __init__(self, n_embed, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)

        # causal mask: prevents attending to future
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)   # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)

        wei = q @ k.transpose(-2, -1) * (C ** -0.5)  # scaled dot-product
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        v = self.value(x)
        out = wei @ v  # (B, T, head_size)
        return out


🔹 3. Multi-Head Attention

In [50]:
class MultiHeadAttention(nn.Module):
    def __init__(self, n_embed, n_head):
        super().__init__()
        head_size = n_embed // n_head
        self.heads = nn.ModuleList([SelfAttentionHead(n_embed, head_size) for _ in range(n_head)])
        self.proj = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

🔹 4. FeedForward Network

In [51]:
class FeedForward(nn.Module):
    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

🔹 5. Transformer Block (Attention + FeedForward)

In [52]:
class TransformerBlock(nn.Module):
    def __init__(self, n_embed, n_head):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)
        self.attn = MultiHeadAttention(n_embed, n_head)
        self.ffwd = FeedForward(n_embed)

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

🔹 6. Full Transformer Model

In [53]:
class TinyTransformer(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = TokenEmbedding(vocab_size, n_embed)
        self.blocks = nn.Sequential(*[TransformerBlock(n_embed, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed)
        self.head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx):
        x = self.embed(idx)            # (B, T, n_embed)
        x = self.blocks(x)             # transformer layers
        x = self.ln_f(x)               # final layer norm
        logits = self.head(x)          # (B, T, vocab_size)
        return logits

Step 3: Train the Transformer and Generate Text in Colab.

✅ Step 3A: Create Batches

In [54]:
def get_batch(split):
    data_split = train_data if split == 'train' else val_data
    ix = torch.randint(len(data_split) - block_size, (batch_size,))
    x = torch.stack([data_split[i:i + block_size] for i in ix])
    y = torch.stack([data_split[i + 1:i + 1 + block_size] for i in ix])
    return x.to(device), y.to(device)


✅ Step 3B: Instantiate the Model and Optimizer

In [55]:
model = TinyTransformer().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

✅ Step 3C: Training Loop



In [56]:
max_iters = 2000
eval_interval = 200

for step in range(max_iters):
    if step % eval_interval == 0:
        model.eval()
        xb, yb = get_batch('val')
        with torch.no_grad():
            logits = model(xb)
            loss = F.cross_entropy(logits.view(-1, vocab_size), yb.view(-1))
        print(f"Step {step}: val loss = {loss.item():.4f}")
        model.train()

    xb, yb = get_batch('train')
    logits = model(xb)
    loss = F.cross_entropy(logits.view(-1, vocab_size), yb.view(-1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


Step 0: val loss = 3.9451
Step 200: val loss = 2.2010
Step 400: val loss = 2.1170
Step 600: val loss = 2.0421
Step 800: val loss = 2.1522
Step 1000: val loss = 2.2564
Step 1200: val loss = 2.4154
Step 1400: val loss = 2.4909
Step 1600: val loss = 2.8642
Step 1800: val loss = 2.5145


✅ Step 3D: Generate Text from the Model

In [57]:
@torch.no_grad()
def generate(model, start_text, length=200):
    model.eval()
    context = torch.tensor([[stoi[c] for c in start_text]], dtype=torch.long).to(device)
    for _ in range(length):
        context_condensed = context[:, -block_size:]
        logits = model(context_condensed)
        probs = F.softmax(logits[:, -1, :], dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        context = torch.cat((context, next_token), dim=1)
    return decode(context[0].tolist())


🧪 Test It:

In [60]:
print(generate(model, start_text="After growing up", length=300))


After growing up, and there was no more chocolate milk at bedtime, norg a where chapter of books read a oudd, a the ry her very nigh made up new stories for herself, and she liked boxes full of things, and she liked boxes full of things, and she liked to know how and where to find things read again. But every night
