<a href="https://colab.research.google.com/github/varun29-git/deep-learning-foundations/blob/main/decoder_only_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Decoder-only Transformer from Scratch**
This project is a character-level Generative Pre-trained Transformer (GPT) implemented in PyTorch. It serves as a capstone for a self-study series, tracking the architectural evolution from Bigram models to modular Transformers with causal self-attention, residual connections, and layer normalization.

The model is trained on the TinyShakespeare dataset for autoregressive text generation, prioritizing rigorous tensor shape tracking for clarity.





---



**Acknowledgments:** Based on the "Neural Networks: Zero to Hero" series by Andrej Karpathy.

In [20]:
# Requirements
import torch
import torch.nn.functional as F
import torch.nn as nn

In [21]:
# Configuration
block_size = 256 # What is the maximum context_length for predictions.
batch_size = 64 # How many independent sequences we will be processing in parellel.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
eval_iters = 200
n_head = 6
n_embd = 384
n_layer = 6
dropout = 0.2

In [22]:
!wget -q https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


In [23]:
with open("input.txt", "r", encoding="utf-8") as f:
  text = f.read()


In [24]:
print(f"Number of Characters: {len(text)}")

Number of Characters: 1115394


In [25]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(' '.join(chars))
print(vocab_size)


   ! $ & ' , - . 3 : ; ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z
65


In [26]:
stoi = {s:i for i,s in enumerate(chars)}
itos = {i:s for i,s in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda s: "".join([itos[c] for c in s])
x = "Hello, World"

print(f"X: {x}")
print(f"Example for Encoder: {encode(x)}")
print(f"Example for Decoder: {decode(encode(x))}")

X: Hello, World
Example for Encoder: [20, 43, 50, 50, 53, 6, 1, 35, 53, 56, 50, 42]
Example for Decoder: Hello, World


In [27]:
# Data converted into integers
data = torch.tensor(encode(text), dtype= torch.long)

In [28]:
n = (int(0.9 * len(data)))
train_data = data[:n]
val_data = data[n:]

In [29]:
# We only train the dataset on chunks
x = train_data[:block_size] # Inputs to the transformer
y = train_data[1:block_size + 1] # Target

In [30]:
torch.manual_seed(1337)

# Function to get a batch to train with batch size = 4 and block_size = 8
def get_batch(split):
  data = train_data if split == "train" else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)
  return x, y

# Get training batch
xb, yb = get_batch(train_data)


In [31]:
@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ["train", "val"]:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out


In [32]:
class Head(nn.Module):
  "One head of self attention"

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

    self.dropout = nn.Dropout(dropout)
  def forward(self, x):
    B, T, C = x.shape
    k = self.key(x) # B, T, C
    q = self.query(x) # B, T, C
    # Compute affinities
    wei = q @ k.transpose(-2,-1) * (C**-0.5) # B, T, C @ B, C, T -> B, T, T
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf")) # (B, T, T)
    wei = F.softmax(wei, dim=-1) # (B, T, T)
    wei = self.dropout(wei)
    # Aggregation of values(weighted)
    v = self.value(x) # (B, T, C)
    out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
    return out


In [33]:
class MultiHeadAttention(nn.Module):

  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embd, n_embd)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    # h(x) is (B, T, C), concat on dim -1 -> (B, T, n_embd)
    out = torch.cat([h(x)for h in self.heads], dim =-1)
    out = self.dropout(self.proj(out))
    return out

In [34]:
class FeedForward(nn.Module):

  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, 4 * n_embd),
        nn.ReLU(),
        nn.Linear(4 * n_embd, n_embd),
        nn.Dropout(dropout),
    )

  def forward(self, x):
    return self.net(x)


In [35]:
class Block(nn.Module):

  def __init__(self, n_embd, n_head):

    super().__init__()
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embd)
    self.ln_1 = nn.LayerNorm(n_embd)
    self.ln_2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    x = x + self.sa(self.ln_1(x))
    x = x + self.ffwd(self.ln_2(x))
    return x



In [36]:
torch.manual_seed(1337)

class DecoderTransformerModel(nn.Module):

  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets= None):
    B, T = idx.shape
    # Calculate logits unconditionally, as it's needed for both training and generation.
    tok_emb = self.token_embedding_table(idx) # (B, T, C)
    pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T, C)
    x = tok_emb + pos_emb
    x = self.blocks(x)
    logits = self.lm_head(x)

    loss = None
    if targets is not None:
      # If targets are provided, calculate the loss.
      # Reshape logits and targets for F.cross_entropy
      B, T, C = logits.shape
      logits_reshaped = logits.view(B * T, C)
      targets_reshaped = targets.view(B * T)
      loss = F.cross_entropy(logits_reshaped, targets_reshaped)
    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      # Crop idx to the last block_size tokens
      # This is crucial for models with a fixed context window.
      idx_cond = idx[:, -block_size:] # The block_size variable is available from previous cells.

      # Get predictions
      # Note: when calling self(idx_cond), targets is implicitly None,
      # so only logits will be computed and returned by the forward method.
      logits, _ = self(idx_cond) # Using _ to ignore the loss, as it's None during generation

      # Focus only on the last time step
      logits = logits[:, -1,:] # Becomes (B, C)

      # Apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)

      # Sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)

      # Append sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)

    return idx # Move return statement outside the loop to generate all tokens

model = DecoderTransformerModel()
m = model.to(device)
# out,loss = m(xb, yb)
# print(loss)
# idx = torch.zeros((1,1), dtype=torch.long)
# print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))

In [37]:
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-3)

In [38]:
batch_size = 32
for iter in range(max_iters):
  if iter % eval_iters == 0:
    losses = estimate_loss()
    print(f"step: {iter}, train loss: {losses["train"]:.4f}, val loss: {losses["val"]:.4f}")

  xb, yb = get_batch('train')
  logits, loss = m(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

print(loss.item())



step: 0, train loss: 4.4749, val loss: 4.4704
step: 200, train loss: 2.4368, val loss: 2.4629
step: 400, train loss: 2.0129, val loss: 2.0855
step: 600, train loss: 1.7589, val loss: 1.8979
step: 800, train loss: 1.6108, val loss: 1.7865
step: 1000, train loss: 1.5282, val loss: 1.7107
step: 1200, train loss: 1.4644, val loss: 1.6582
step: 1400, train loss: 1.4207, val loss: 1.6346
step: 1600, train loss: 1.3746, val loss: 1.6123
step: 1800, train loss: 1.3482, val loss: 1.5899
step: 2000, train loss: 1.3165, val loss: 1.5702
step: 2200, train loss: 1.2929, val loss: 1.5523
step: 2400, train loss: 1.2718, val loss: 1.5485
step: 2600, train loss: 1.2545, val loss: 1.5426
step: 2800, train loss: 1.2338, val loss: 1.5326
step: 3000, train loss: 1.2207, val loss: 1.5345
step: 3200, train loss: 1.2036, val loss: 1.5299
step: 3400, train loss: 1.1879, val loss: 1.5293
step: 3600, train loss: 1.1740, val loss: 1.5235
step: 3800, train loss: 1.1597, val loss: 1.5238
step: 4000, train loss: 1.1

In [39]:
idx = torch.zeros((1,1), dtype=torch.long, device=device)
print(decode(m.generate(idx, max_new_tokens=500)[0].tolist()))



KING RICHARD III:
Qow yourself-breating kind and success and instant
That cries 'twixt countrusting  look intelloastor.

QUEEN ELIZABETH:
My lord, to--

KING RICHARD II:
Queen, the hoursing
Good friends with body butternoons, his grieval,
And revel the brother of the hearts are the lie:
And if thou canst deserve imperate,
Horr ha quest heretors to thee,
Whose peace and his wrong must too,
Or perform that needy,
He flinteth him the nay and chiles in these unterring.
A safeguard is of my love.
Th
