<a href="https://colab.research.google.com/github/varun29-git/deep-learning-foundations/blob/main/MLP_in_2_Ways.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Building a Deep MLP: Batch Normalization & Modular Layers**



This notebook explores the internal mechanics of deep neural networks by building a character-level language model from scratch. We progress from a manual, mathematically explicit implementation to a scalable, Object-Oriented architecture mimicking torch.nn.

*The notebook builds on the following concepts:*

**Batch Normalization:** Understanding the math behind population statistics and momentum.

**Initialization:** Implementing Kaiming Init ($\text{gain} / \sqrt{\text{fan}_{\text{in}}}$) to counteract vanishing gradients.

**Refactoring:** Moving from global variables to a modular Class-based structure (Linear, BatchNorm1d, Tanh).


In [128]:
# Requirements
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
import random

In [113]:
!wget -q https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

In [114]:
# Configuration: global settings shared by the cells below.
block_size = 3    # number of preceding characters used as context
batch_size = 32   # examples drawn per minibatch
epsilon = 1e-5    # added to the batch-norm variance before the square root
g = torch.Generator()
g.manual_seed(2147483647)  # fixed seed for the explicitly seeded random draws

In [115]:
# Dataset
# Read one name per line. The encoding is passed explicitly so the result does
# not depend on the platform's default locale encoding.
with open('names.txt', 'r', encoding='utf-8') as f:
    names = f.read().splitlines()
print(f"Dataset size: {len(names)}")

# Shuffle the names with a fixed seed so the later slicing into splits is
# reproducible from run to run.
random.seed(42)
random.shuffle(names)

# Sorted list of every distinct character that occurs in the names
alphabets = sorted({ch for name in names for ch in name})

# String to Integer: real characters get indices 1..len(alphabets);
# index 0 is reserved for the '.' marker.
stoi = {s: i + 1 for i, s in enumerate(alphabets)}
stoi['.'] = 0

# Integer to String (inverse of stoi)
itos = {i: s for s, i in stoi.items()}

Dataset size: 32033


In [116]:
# Creation of Training, Validation and Testing Dataset
def build_dataset(words):
  """Turn a list of words into (context, next-character) training pairs.

  Each word is terminated with '.' and scanned left to right. For every
  character, the indices of the preceding `block_size` characters (padded
  with 0 at the start of the word) form one row of X, and the character's
  own index is the matching entry of Y.

  Args:
    words: iterable of strings whose characters are all keys of `stoi`.

  Returns:
    (X, Y): X holds one `block_size`-long context per example and Y holds
    the index of the character that follows each context.
  """
  X = []
  Y = []
  for word in words:
    context = [0] * block_size
    for char in word + '.':
      ix = stoi[char]
      X.append(context)
      Y.append(ix)
      # Slide the window: drop the oldest index, append the newest.
      context = context[1:] + [ix]
  X = torch.tensor(X)
  Y = torch.tensor(Y)
  # The original format string ended with an unmatched ')' that showed up in
  # the printed output; it is removed here.
  print(f"X.shape = {X.shape}, Y.shape = {Y.shape}")
  return X, Y

# 80% / 10% / 10% split over the shuffled names
num_names = len(names)
n1 = int(num_names * 0.8)  # end of the training slice
n2 = int(num_names * 0.9)  # end of the validation slice

Xtr, Ytr = build_dataset(names[:n1])      # Training
Xval, Yval = build_dataset(names[n1:n2])  # Validation
Xte, Yte = build_dataset(names[n2:])      # Testing

X.shape = torch.Size([182625, 3]), Y.shape = torch.Size([182625]))
X.shape = torch.Size([22655, 3]), Y.shape = torch.Size([22655]))
X.shape = torch.Size([22866, 3]), Y.shape = torch.Size([22866]))


In [117]:
# Hyperparameters

torch.manual_seed(42) # Seeds PyTorch's global RNG (used by draws that are not given `generator=g`)
vocab_size = len(itos) # Number of distinct tokens, including the '.' marker
n_hidden = 100 # Number of units in the hidden layer (layer width, not layer count)
n_emb = 10 # Dimensionality of each character's embedding vector
C = torch.randn((vocab_size, n_emb), generator=g) # Embedding table: one n_emb-dimensional row per token

# **1. The Manual Approach (First Principles)**

This implementation constructs an MLP using a flat script where every operation (like $h_{pre} = Wx + b$) is explicit and parameters are managed globally. This method exposes the raw mechanics of Kaiming Initialization and Batch Normalization without layers of abstraction, ideal for understanding the underlying math.



In [118]:
# Parameters

# First-layer weights: unit-normal draws scaled by gain / sqrt(fan_in), where
# fan_in = n_emb * block_size and the gain is 5/3 (the value PyTorch's
# calculate_gain gives for tanh). The scaling keeps the size of the
# pre-activations independent of fan_in so the tanh units neither saturate
# nor shrink the signal at initialisation.
W1 = torch.randn((n_emb * block_size, n_hidden), generator=g) * ((5/3) / ((n_emb * block_size) ** 0.5))
# No bias is created for this layer: the batch-norm mean subtraction applied
# to its output would cancel any constant added here.
# Output layer: weights and bias are scaled by 0.1 so the initial logits are
# small and the model does not start out confidently wrong.
W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.1
b2 = torch.randn(vocab_size, generator=g) * 0.1
# Batch-norm scale and shift, initialised to 1.1 and 0.1 respectively.
bngain = torch.ones((1, n_hidden)) * 0.1 + 1.0
bnbias = torch.ones((1, n_hidden)) * 0.1
# Running estimates of the hidden pre-activation mean and variance. They are
# updated inside the training loop without gradients and are used in place of
# batch statistics when evaluating and sampling.
bnmean_running = torch.zeros((1, n_hidden))
bnvar_running = torch.ones((1, n_hidden))

# Tensors optimised by gradient descent. The running statistics are left out
# on purpose: they are not trained by backprop.
parameters = [W1, W2, C, b2, bnbias, bngain]

for p in parameters:
  p.requires_grad = True

In [119]:
# Training
epochs = 100000  # number of minibatch updates (one "epoch" here is one step)

# Per-step log10(loss) history. It has to exist before the loop appends to it;
# the original cell used `lossi` without ever creating it, which raises a
# NameError on a fresh run.
lossi = []

# Start
for epoch in range(epochs):

  # Gradient Initialization: clear gradients left over from the previous step
  for p in parameters:
    p.grad = None

  # Minibatch Construct: batch_size random row indices into the training set
  ix = torch.randint(0,Xtr.shape[0], (batch_size,))

  # Forward Pass begins
  emb = C[Xtr[ix]]
  emb_cat = emb.view(emb.shape[0], -1) # Flatten each context's embeddings into one row for the matmul

  # Linear Layer (no bias: batch norm would cancel it)
  hpreact = emb_cat @ W1

  # Batch Normalisation: norm = (x - mean(x)) / sqrt(var(x) + epsilon),
  # computed per hidden unit over the batch, then scaled and shifted.
  bnmeani = hpreact.mean(0, keepdim=True)
  bnvari = hpreact.var(0, keepdim=True, unbiased=False)
  hpreact = bngain * ((hpreact - bnmeani) / torch.sqrt(bnvari + epsilon)) + bnbias
  # Exponential moving average of the batch statistics (momentum 0.1), kept
  # outside autograd; evaluation and sampling use these instead of batch stats.
  with torch.no_grad():
    bnmean_running = 0.9 * bnmean_running + 0.1 * bnmeani
    bnvar_running = 0.9 * bnvar_running + 0.1 * bnvari

  # Activation
  h = torch.tanh(hpreact)
  logits = h @ W2 + b2

  # Loss
  loss = F.cross_entropy(logits, Ytr[ix])
  lossi.append(loss.log10().item())
  # Backward Pass
  loss.backward()

  # Optimization: plain SGD with a 10x learning-rate drop after the halfway point
  learning_rate = 0.1 if epoch <= (epochs/2) else 0.01
  with torch.no_grad():
    for p in parameters:
      p -= learning_rate * p.grad

  # Print Loss (ten progress lines over the whole run)
  if epoch % (epochs/10) == 0:
    print(f"epoch: {epoch}, loss: {loss.item()}")

# Final Loss (loss of the last minibatch)
print(f"epoch: {epochs}, loss: {loss.item()}")

epoch: 0, loss: 3.5724077224731445
epoch: 10000, loss: 2.260617733001709
epoch: 20000, loss: 2.3816561698913574
epoch: 30000, loss: 2.249138116836548
epoch: 40000, loss: 2.1082563400268555
epoch: 50000, loss: 2.3121681213378906
epoch: 60000, loss: 2.2469069957733154
epoch: 70000, loss: 2.449575901031494
epoch: 80000, loss: 1.9596929550170898
epoch: 90000, loss: 2.2607407569885254
epoch: 100000, loss: 1.919772744178772


In [120]:
@torch.no_grad()
def split_loss(split):
  """Print the manual MLP's loss over one entire dataset split.

  `split` is one of "train", "val" or "test". Batch normalisation uses the
  running mean and variance rather than statistics of the evaluated data.
  """
  datasets = {
      "train" : (Xtr, Ytr),
      "val" : (Xval, Yval),
      "test" : (Xte, Yte),
  }
  x, y = datasets[split]

  # Forward pass: embed, flatten each context into one row, first linear layer
  embedded = C[x]
  flat = embedded.view(embedded.shape[0], -1)
  hpreact = flat @ W1

  # Batch normalisation with the running statistics
  centered = hpreact - bnmean_running
  scaled = bngain * centered
  hpreact = scaled / torch.sqrt(bnvar_running + epsilon) + bnbias

  # Activation and output layer
  hidden = torch.tanh(hpreact)
  logits = hidden @ W2 + b2

  # Loss over the whole split
  loss = F.cross_entropy(logits, y)
  print(split, ":", loss.item())

# Report the full-split loss for each dataset
for split_name in ('train', 'val', 'test'):
  split_loss(split_name)

train : 2.1354637145996094
val : 2.154210090637207
test : 2.15461802482605


In [121]:

g = torch.Generator().manual_seed(2147483647 + 10)

# Sampling only needs forward passes. The parameters require grad, so without
# no_grad every generated character would record an autograd graph for nothing.
with torch.no_grad():
    # Loop-invariant: the inference-time batch-norm denominator depends only
    # on the running variance, so it is computed once.
    bn_std = torch.sqrt(bnvar_running + epsilon)

    for _ in range(20):
        out = []
        context = [0] * block_size  # start from an all-'.' context

        while True:
            # Forward Pass Begins
            emb = C[torch.tensor([context])] # (1, block_size, n_emb)
            emb_cat = emb.view(1, -1)        # (1, block_size * n_emb)

            # Linear Layer
            # We don't use b1 here because BatchNorm cancels it out
            hpreact = emb_cat @ W1

            # Batch Normalization with the running statistics (a batch of one
            # has no usable batch statistics)
            hpreact = bngain * (hpreact - bnmean_running) / bn_std + bnbias

            # Activation
            h = torch.tanh(hpreact)

            # Output Layer
            logits = h @ W2 + b2

            # Sampling: draw the next character index from the softmax distribution
            probs = F.softmax(logits, dim=1)
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()

            # Shift context
            context = context[1:] + [ix]
            out.append(ix)

            # Index 0 is '.', which ends the name
            if ix == 0:
                break

        print(''.join(itos[i] for i in out))

carmah.
amelle.
khi.
mili.
taty.
salaysleer.
hubeddelynn.
jareei.
ner.
kia.
chaiivon.
leigh.
ham.
jord.
quinn.
shoilea.
jadiquin.
ell.
dearisia.
kael.


# **2. The OOP Approach (Modular Architecture)**

This section refactors the code into modular classes mimicking torch.nn to support deep, scalable architectures. It automates parameter management and handles critical state-switching for Batch Normalization, ensuring correct statistics usage during both training and inference.

In [122]:
class Linear:
  """Fully connected layer computing `x @ weight`, plus `bias` when enabled."""

  def __init__(self, fan_in, fan_out, bias=True):
    # Unit-normal weights scaled by 1/sqrt(fan_in), drawn from the
    # module-level generator `g` and wrapped in torch.nn.Parameter.
    init_scale = fan_in ** -0.5
    raw_weight = torch.randn((fan_in, fan_out), generator=g)
    self.weight = torch.nn.Parameter(raw_weight * init_scale)
    # Optional zero-initialised bias
    if bias:
      self.bias = torch.nn.Parameter(torch.zeros(fan_out))
    else:
      self.bias = None

  def __call__(self, x):
    # The result is also kept on the instance as `out`.
    result = x @ self.weight
    if self.bias is not None:
      result = result + self.bias
    self.out = result
    return result

  def parameters(self):
    # Weight first, then the bias if the layer has one.
    if self.bias is None:
      return [self.weight]
    return [self.weight, self.bias]

class BatchNorm1d:
  """Batch normalisation over dimension 0 with a learnable scale and shift.

  While `training` is True the layer normalises with the statistics of the
  current input and folds them into running estimates. While it is False the
  running estimates are used instead.

  Args:
    dim: number of features (length of gamma, beta and the running buffers).
    eps: constant added to the variance before the square root.
    momentum: weight given to the newest batch statistic in the running
      estimates.
  """

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.momentum = momentum
    self.training = True
    # Parameters trained with backprop, wrapped in torch.nn.Parameter
    self.gamma = torch.nn.Parameter(torch.ones(dim))
    self.beta = torch.nn.Parameter(torch.zeros(dim))
    # Buffers updated by a running average, not by backprop
    self.running_mean = torch.zeros(dim)
    self.running_var = torch.ones(dim)

  def __call__(self, x):
    if self.training:
      # Statistics of the current batch (biased variance)
      xmean = x.mean(0, keepdim=True)
      xvar = x.var(0, keepdim=True, unbiased=False)
    else:
      xmean = self.running_mean
      xvar = self.running_var
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
    self.out = self.gamma * xhat + self.beta
    if self.training:
      with torch.no_grad():
        # Drop the kept batch axis before averaging. Previously the (1, dim)
        # batch statistics broadcast against the (dim,) buffers, so the
        # buffers silently changed shape after the first training call.
        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean.squeeze(0)
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar.squeeze(0)
    return self.out

  def parameters(self):
    # Only gamma and beta are trained; the running buffers are excluded.
    return [self.gamma, self.beta]

class Tanh:
  """Parameter-free layer that applies tanh element-wise."""

  def __call__(self, x):
    # The activation is also kept on the instance as `out`.
    activated = x.tanh()
    self.out = activated
    return activated

  def parameters(self):
    # Nothing to train
    return list()

In [123]:
# Hyperparameters (just put for reference again) :)

torch.manual_seed(42) # Re-seeds PyTorch's global RNG
vocab_size = len(itos) # Number of distinct tokens, including the '.' marker
n_hidden = 100 # Number of units in each hidden layer (layer width, not layer count)
n_emb = 10 # Dimensionality of each character's embedding vector
# NOTE(review): C is assigned again at the top of the model-construction cell,
# so this tensor looks unused apart from advancing `g` - confirm before removing.
C = torch.randn((vocab_size, n_emb), generator=g) # Embedding table: one n_emb-dimensional row per token

In [124]:
# Trainable embedding table (autograd enabled at creation)
C = torch.randn((vocab_size, n_emb), generator=g, requires_grad=True)

# Five Linear -> BatchNorm1d -> Tanh stages followed by a Linear output layer.
# The hidden Linear layers carry no bias.
layers = [Linear(n_emb * block_size, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh()]
for _ in range(4):
    layers.extend([Linear(n_hidden, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh()])
layers.append(Linear(n_hidden, vocab_size))

with torch.no_grad():
    *hidden_layers, output_layer = layers
    # Hidden Linear layers: apply the 5/3 gain
    for layer in hidden_layers:
        if isinstance(layer, Linear):
            layer.weight *= 5/3
    # Output layer: shrink the weights so the initial predictions are less confident
    output_layer.weight *= 0.1

# Embedding table first, then every layer's parameters in layer order
parameters = [C, *(p for layer in layers for p in layer.parameters())]
total_parameters = sum(p.nelement() for p in parameters)
print("Number of Parameters:", total_parameters)


Number of Parameters: 46997


In [125]:
# same optimization as last time

lossi = []  # per-step log10(loss) history

for epoch in range(epochs):

    # Minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix]  # batch X, Y

    # Forward pass
    emb = C[Xb]  # embed the characters into vectors
    x = emb.view(emb.shape[0], -1)  # concatenate the vectors
    for layer in layers:
        x = layer(x)
    loss = F.cross_entropy(x, Yb)  # loss function
    # Record the loss. The original created `lossi` but never filled it,
    # unlike the manual training loop.
    lossi.append(loss.log10().item())

    # Backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # Update: plain SGD with a 10x learning-rate drop after the halfway point.
    # The in-place step runs under no_grad instead of going through `.data`,
    # matching the manual loop and keeping autograd's in-place checks active.
    learning_rate = 0.1 if epoch <= (epochs/2) else 0.01
    with torch.no_grad():
        for p in parameters:
            p -= learning_rate * p.grad

    # Print Loss (ten progress lines over the whole run)
    if epoch % (epochs/10) == 0:
      print(f"epoch: {epoch}, loss: {loss.item()}")

# Final Loss (loss of the last minibatch)
print(f"epoch: {epochs}, loss: {loss.item()}")

epoch: 0, loss: 3.3017003536224365
epoch: 10000, loss: 2.5791115760803223
epoch: 20000, loss: 2.43801212310791
epoch: 30000, loss: 2.0418097972869873
epoch: 40000, loss: 2.49560284614563
epoch: 50000, loss: 2.2687554359436035
epoch: 60000, loss: 2.0356671810150146
epoch: 70000, loss: 2.061049222946167
epoch: 80000, loss: 1.8533865213394165
epoch: 90000, loss: 1.455241084098816
epoch: 100000, loss: 1.8749793767929077


In [126]:
@torch.no_grad()
def split_loss_oop(split):
    """Print the layer-based model's loss over one entire dataset split.

    Every BatchNorm1d layer is switched to its running statistics for the
    evaluation and is put back in the mode it was in beforehand, even if the
    forward pass raises.

    Args:
        split: 'train', 'val' or 'test'.

    Raises:
        KeyError: if `split` is not one of the three names above.
    """
    x, y = {
        'train': (Xtr, Ytr),
        'val': (Xval, Yval),
        'test': (Xte, Yte),
    }[split]

    # Remember each batch-norm layer's mode so it can be restored exactly,
    # rather than forcing training mode on the way out.
    bn_layers = [layer for layer in layers if isinstance(layer, BatchNorm1d)]
    previous_modes = [layer.training for layer in bn_layers]

    # Non Training Mode to use running mean and var
    for layer in bn_layers:
        layer.training = False

    try:
        # Forward Pass
        emb = C[x]                      # (N, block_size, n_emb)
        x = emb.view(emb.shape[0], -1)  # (N, block_size * n_emb)

        for layer in layers:
            x = layer(x)

        loss = F.cross_entropy(x, y)
        print(split,":",loss.item())
    finally:
        # Switch back to whatever mode each layer was in before the call
        for layer, mode in zip(bn_layers, previous_modes):
            layer.training = mode

# Report the full-split loss on the training and validation data
for split_name in ('train', 'val'):
    split_loss_oop(split_name)

train : 2.042226552963257
val : 2.09619402885437


In [127]:
# Output Generation

# Inference mode: every batch-norm layer uses its running statistics, since a
# batch of one context has no usable batch statistics.
for layer in layers:
    if isinstance(layer, BatchNorm1d):
        layer.training = False

g = torch.Generator().manual_seed(2147483647 + 10)

# Sampling only needs forward passes. The parameters require grad, so without
# no_grad every generated character would record an autograd graph for nothing.
with torch.no_grad():
    for _ in range(20):

        out = []
        context = [0] * block_size # initialize with all ...

        while True:
            # Forward pass
            emb = C[torch.tensor([context])]
            x = emb.view(1, -1)

            for layer in layers:
                x = layer(x)

            logits = x
            probs = F.softmax(logits, dim=1)

            # Draw the next character index from the predicted distribution
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()

            # Shift context
            context = context[1:] + [ix]
            out.append(ix)

            # Index 0 is '.', which ends the name
            if ix == 0:
                break

        print(''.join(itos[i] for i in out))

carlah.
amelle.
khi.
milia.
atlannah.
sky.
mahnie.
deliah.
jareei.
nellara.
chaiiv.
kaleigh.
ham.
joce.
quintin.
lilea.
jadiquor.
elo.
dearixi.
jaxeenivraylen.
