<a href="https://colab.research.google.com/github/vky342/SmallGPT/blob/main/smallgpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [51]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np


In [52]:
class CharTokenizer:
    def __init__(self, data):
        # Get unique characters from the data
        chars = sorted(list(set(data)))
        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for ch, i in self.stoi.items()}
        self.vocab_size = len(self.stoi)

    def encode(self, text):
        return [self.stoi[ch] for ch in text if ch in self.stoi]

    def decode(self, indices):
        return ''.join([self.itos[i] for i in indices])

    def pad_or_truncate(self, encoded, max_length=60):
        # Pad with zeros or truncate
        if len(encoded) < max_length:
            return encoded + [0] * (max_length - len(encoded))
        else:
            return encoded[:max_length]


In [53]:
class Embedding(nn.Module):
    def __init__(self, vocab_size, embed_dim, context_length, dropout=0.1):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = nn.Parameter(torch.zeros(1, context_length, embed_dim))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        x: (batch_size, context_length) → token IDs
        Returns: (batch_size, context_length, embed_dim)
        """
        tok_emb = self.token_embedding(x)              # shape: (B, T, E)
        pos_emb = self.pos_embedding[:, :x.size(1), :] # shape: (1, T, E)
        out = tok_emb + pos_emb                        # Add token + position
        return self.dropout(out)

In [54]:
# Sample data
data = "hello kunal"

# Initialize tokenizer
tokenizer = CharTokenizer(data)

# Encode + pad a prompt
prompt = "hello gpt"
encoded = tokenizer.encode(prompt)
padded = tokenizer.pad_or_truncate(encoded, max_length=60)

# Decode back
decoded = tokenizer.decode(padded[:len(encoded)])

print("Encoded:", encoded)
print("Padded:", padded)
print("Decoded:", decoded)
print("Vocab size:", tokenizer.vocab_size)

Encoded: [3, 2, 5, 5, 7, 0]
Padded: [3, 2, 5, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Decoded: hello 
Vocab size: 9


In [None]:
from google.colab import files
uploaded = files.upload()


Saving qna.txt to qna.txt


In [67]:
with open("data.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

raw_text = raw_text.lower().strip()
print(raw_text[:500])

driving from my parent’s
home to cochin last friday
morning, i saw my mother,
beside me,
doze, open mouthed, her face
ashen like that
of a corpse and realised with
pain
that she was as old as she
looked but soon
put that thought away, and
looked out at young
trees sprinting, the merry children spilling
out of their homes, but after the airport’s
security check, standing a few yards
away, i looked again at her, wan, pale
as a late winter’s moon and felt that old
familiar ache, my childhood’s fear


In [56]:
tokenizer = CharTokenizer(raw_text)
encoded_text = tokenizer.encode(raw_text)
print(encoded_text)

[16, 30, 21, 34, 21, 26, 19, 1, 18, 30, 27, 25, 1, 25, 37, 1, 28, 13, 30, 17, 26, 32, 43, 31, 0, 20, 27, 25, 17, 1, 32, 27, 1, 15, 27, 15, 20, 21, 26, 1, 24, 13, 31, 32, 1, 18, 30, 21, 16, 13, 37, 0, 25, 27, 30, 26, 21, 26, 19, 7, 1, 21, 1, 31, 13, 35, 1, 25, 37, 1, 25, 27, 32, 20, 17, 30, 7, 0, 14, 17, 31, 21, 16, 17, 1, 25, 17, 7, 0, 16, 27, 38, 17, 7, 1, 27, 28, 17, 26, 1, 25, 27, 33, 32, 20, 17, 16, 7, 1, 20, 17, 30, 1, 18, 13, 15, 17, 0, 13, 31, 20, 17, 26, 1, 24, 21, 23, 17, 1, 32, 20, 13, 32, 0, 27, 18, 1, 13, 1, 15, 27, 30, 28, 31, 17, 1, 13, 26, 16, 1, 30, 17, 13, 24, 21, 31, 17, 16, 1, 35, 21, 32, 20, 0, 28, 13, 21, 26, 0, 32, 20, 13, 32, 1, 31, 20, 17, 1, 35, 13, 31, 1, 13, 31, 1, 27, 24, 16, 1, 13, 31, 1, 31, 20, 17, 0, 24, 27, 27, 23, 17, 16, 1, 14, 33, 32, 1, 31, 27, 27, 26, 0, 28, 33, 32, 1, 32, 20, 13, 32, 1, 32, 20, 27, 33, 19, 20, 32, 1, 13, 35, 13, 37, 7, 1, 13, 26, 16, 0, 24, 27, 27, 23, 17, 16, 1, 27, 33, 32, 1, 13, 32, 1, 37, 27, 33, 26, 19, 0, 32, 30, 17, 17, 31,

In [57]:
class SimpleSelfAttention(nn.Module):
    def __init__(self, embed_dim, context_length):
        super().__init__()
        self.embed_dim = embed_dim
        self.context_length = context_length

        # Linear projections for Q, K, V
        self.key = nn.Linear(embed_dim, embed_dim, bias=False)
        self.query = nn.Linear(embed_dim, embed_dim, bias=False)
        self.value = nn.Linear(embed_dim, embed_dim, bias=False)

        # Output projection
        self.proj = nn.Linear(embed_dim, embed_dim)

        # Causal mask (buffer, not trainable)
        mask = torch.tril(torch.ones(context_length, context_length))
        self.register_buffer("causal_mask", mask)

    def forward(self, x):
        B, T, C = x.shape  # (batch, time, channels)

        # Compute Q, K, V
        k = self.key(x)    # (B, T, C)
        q = self.query(x)  # (B, T, C)
        v = self.value(x)  # (B, T, C)

        # Compute raw attention scores
        att = q @ k.transpose(-2, -1) / (C ** 0.5)  # (B, T, T)

        # Apply causal mask: set future scores to -inf
        att = att.masked_fill(self.causal_mask[:T, :T] == 0, float('-inf'))

        # Softmax over the past tokens
        att = F.softmax(att, dim=-1)  # (B, T, T)

        # Weighted sum of values
        out = att @ v  # (B, T, C)

        # Final linear projection
        return self.proj(out)

In [58]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, context_length, dropout=0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(embed_dim)
        self.attn = SimpleSelfAttention(embed_dim, context_length)
        self.ln2 = nn.LayerNorm(embed_dim)

        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim),
            nn.ReLU(),
            nn.Linear(4 * embed_dim, embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        # Self-attention block
        x = x + self.attn(self.ln1(x))  # residual + pre-LN
        # Feedforward block
        x = x + self.mlp(self.ln2(x))   # residual + pre-LN
        return x


In [59]:
class MiniGPT(nn.Module):
    def __init__(self, vocab_size, context_length, embed_dim, num_layers, dropout=0.1):
        super().__init__()
        self.embed = Embedding(vocab_size, embed_dim, context_length, dropout)

        self.blocks = nn.Sequential(
            *[TransformerBlock(embed_dim, context_length, dropout) for _ in range(num_layers)]
        )

        self.ln_final = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, vocab_size)  # Final projection to vocab size

    def forward(self, x):
        """
        x: (B, T) — batch of token IDs
        returns: (B, T, vocab_size) — logits for next token
        """
        x = self.embed(x)           # (B, T, C)
        x = self.blocks(x)          # (B, T, C)
        x = self.ln_final(x)        # (B, T, C)
        logits = self.head(x)       # (B, T, vocab_size)
        return logits


In [60]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encoded_text, context_length):
        self.data = encoded_text
        self.context_length = context_length

    def __len__(self):
        return len(self.data) - self.context_length

    def __getitem__(self, idx):
        x = torch.tensor(self.data[idx : idx + self.context_length], dtype=torch.long)
        y = torch.tensor(self.data[idx + 1 : idx + 1 + self.context_length], dtype=torch.long)
        return x, y  # Input, Target


In [61]:

batch_size = 32
vocab_size = tokenizer.vocab_size
context_length = 60
embed_dim = 64
num_layers = 3

dataset = Dataset(encoded_text, context_length)
loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [62]:
for xb, yb in loader:
    print("Input shape: ", xb.shape)
    print("Target shape:", yb.shape)
    break

Input shape:  torch.Size([32, 60])
Target shape: torch.Size([32, 60])


In [63]:
for xb, yb in loader:
    print("Input (xb):")
    print(xb[0])   # First sample in batch
    print("\nTarget (yb):")
    print(yb[0])   # Corresponding target
    print("Input text : ", tokenizer.decode(xb[0].tolist()))
    print("Target text: ", tokenizer.decode(yb[0].tolist()))
    break


Input (xb):
tensor([13, 32,  1, 21,  1, 20, 13, 34, 17,  1, 13, 28, 28, 30, 27, 34, 17, 16,
         7,  0, 28, 30, 27, 31, 28, 17, 30,  1, 14, 33, 32,  1, 24, 21, 32, 32,
        24, 17,  7,  1, 20, 13, 31,  1, 16, 30, 21, 17, 16,  1, 33, 28,  1, 27,
        18,  1, 24, 13, 32, 17])

Target (yb):
tensor([32,  1, 21,  1, 20, 13, 34, 17,  1, 13, 28, 28, 30, 27, 34, 17, 16,  7,
         0, 28, 30, 27, 31, 28, 17, 30,  1, 14, 33, 32,  1, 24, 21, 32, 32, 24,
        17,  7,  1, 20, 13, 31,  1, 16, 30, 21, 17, 16,  1, 33, 28,  1, 27, 18,
         1, 24, 13, 32, 17,  7])
Input text :  at i have approved,
prosper but little, has dried up of late
Target text:  t i have approved,
prosper but little, has dried up of late,


In [64]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = MiniGPT(vocab_size, context_length, embed_dim, num_layers).to(device)
state_dict = torch.load('smallgpt_wei.pth', map_location=torch.device('cpu'))
model.load_state_dict(state_dict)
model.eval()
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params}")

Total parameters: 159020


In [None]:
optimizer = torch.optim.SGD(model.parameters(), lr=2e-3, momentum=0.85)
criterion = nn.CrossEntropyLoss()

In [None]:
epochs = 30

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits.view(-1, vocab_size), yb.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(loader)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f}")

Epoch 1/30 - Loss: 0.2949
Epoch 2/30 - Loss: 0.2951
Epoch 3/30 - Loss: 0.2941
Epoch 4/30 - Loss: 0.2955
Epoch 5/30 - Loss: 0.2945
Epoch 6/30 - Loss: 0.2946
Epoch 7/30 - Loss: 0.2942
Epoch 8/30 - Loss: 0.2937
Epoch 9/30 - Loss: 0.2940
Epoch 10/30 - Loss: 0.2938
Epoch 11/30 - Loss: 0.2943
Epoch 12/30 - Loss: 0.2942
Epoch 13/30 - Loss: 0.2939
Epoch 14/30 - Loss: 0.2939
Epoch 15/30 - Loss: 0.2932
Epoch 16/30 - Loss: 0.2932
Epoch 17/30 - Loss: 0.2936
Epoch 18/30 - Loss: 0.2933
Epoch 19/30 - Loss: 0.2932
Epoch 20/30 - Loss: 0.2935
Epoch 21/30 - Loss: 0.2938
Epoch 22/30 - Loss: 0.2928
Epoch 23/30 - Loss: 0.2931
Epoch 24/30 - Loss: 0.2935
Epoch 25/30 - Loss: 0.2930
Epoch 26/30 - Loss: 0.2941
Epoch 27/30 - Loss: 0.2926
Epoch 28/30 - Loss: 0.2933
Epoch 29/30 - Loss: 0.2933
Epoch 30/30 - Loss: 0.2924


In [65]:
def generate_text(model, tokenizer, prompt, max_new_tokens=500, temperature=1.0):
    model.eval()
    device = next(model.parameters()).device

    # Encode prompt and move to device
    input_ids = tokenizer.encode(prompt)
    x = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)  # (1, T)

    for _ in range(max_new_tokens):
        x_crop = x[:, -context_length:] if x.size(1) > context_length else x
        logits = model(x_crop)  # (1, T, vocab_size)
        logits = logits[:, -1, :] / temperature  # last time step
        probs = torch.softmax(logits, dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)  # (1, 1)
        x = torch.cat([x, next_id], dim=1)  # Append next token

    # Get generated part (after the prompt)
    generated_ids = x[0].tolist()
    generated_text = tokenizer.decode(generated_ids[len(input_ids):])

    return prompt + generated_text

In [66]:
prompt = "the"
print(generate_text(model, tokenizer, prompt, max_new_tokens=450, temperature=0.8))


thes cance, and thee o time.
what needs my shakespear for his honour’d bones,
the labour of an age in piled stones,
or that his hallow’d reliques should be hid
under a star-ypointing pyramid?
dear son of memory, great heir of fame,
what need’st thou such weak witness of thy name?
thou in our wonder and astonishment
hast built thy self a live-long monument.
for whilst to th’shame of slow endeavouring art,
thy easie numbers flow, and that each heart
h
