## 📅 Day 3: Define Model with nn.Module

In [1]:
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(2, 4),
            nn.ReLU(),
            nn.Linear(4, 2),
            nn.ReLU(),
            nn.Linear(2, 1)
        )

    def forward(self, x):
        return self.layers(x)

model = MLP()
print(model)

MLP(
  (layers): Sequential(
    (0): Linear(in_features=2, out_features=4, bias=True)
    (1): ReLU()
    (2): Linear(in_features=4, out_features=2, bias=True)
    (3): ReLU()
    (4): Linear(in_features=2, out_features=1, bias=True)
  )
)


In [5]:
# - Create a deeper model with dropout and batchnorm
class deeperMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(256,128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(128,8)
        )

    def forward(self, x):
        return self.layers(x)

In [6]:
# - Print model parameters (weights & biases)
model = deeperMLP()
print(model.parameters)

<bound method Module.parameters of deeperMLP(
  (layers): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.5, inplace=False)
    (8): Linear(in_features=128, out_features=8, bias=True)
  )
)>


In [7]:
# - Forward random input and inspect output shape
import torch
x = torch.randn((5,512))
model.forward(x)

tensor([[ 0.5258,  0.0107,  0.1155,  0.5611, -0.9563,  0.4834,  0.0225,  0.9426],
        [-0.4578,  0.7084,  0.8889,  0.0994, -0.4184, -0.7783,  0.1204, -0.4687],
        [-0.4432, -0.2854,  0.4332,  1.2963, -1.6711,  0.6954,  0.7195, -0.8587],
        [ 0.0763,  0.1265,  0.1029,  0.1823, -0.9249,  0.7460,  0.0136, -0.1553],
        [ 0.2384,  0.1753, -0.1287,  0.4755, -1.0940,  0.0910, -0.3139,  0.8647]],
       grad_fn=<AddmmBackward0>)

In [8]:
# - Write a custom forward method with skip connection
# Pre-activation residual (common in modern nets/Transformers):
# Norm → Activation → Linear → Norm → Activation → Linear → (add skip)
# Pre-activation often trains a bit more smoothly.
class oneResidualBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(64, 128, bias=False),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(128, 64)
        )
    
    def forward(self, x):
        y = self.layers(x) + x
        return y

In [None]:
# ---------- Residual Block ----------
class ResidualBlock(nn.Module):
    def __init__(self, dim=64, hidden=128, p_drop=0.2, pre_norm=False, scale_residual=1.0):
        super().__init__()
        self.dim = dim
        self.hidden = hidden
        self.pre_norm = pre_norm
        self.scale = scale_residual
        # Post-norm: Linear → Norm → … → set bias=False. Pre-norm: Norm → Linear → … → set bias=True.
        # The norm layer handles the shift/scale
        self.linear_0 = nn.Linear(dim, hidden, bias=pre_norm)
        self.activation = nn.ReLU()
        self.drop_0 = nn.Dropout(p_drop)
        self.linear_1 = nn.Linear(hidden, dim)

        self.norm_0 = nn.LayerNorm(self.dim)
        self.norm_1 = nn.LayerNorm(self.hidden)

    def forward(self, x):
        # pre-norm: x → LN → fc1 → Activation → Dropout → LN → fc2 → + skip
        # post-noem x → fc1 → LN → Activation → Dropout → fc2 → + skip
        if self.pre_norm:
            h = self.norm_0(x)
            h = self.linear_0(h)
        else:
            h = self.linear_0(x)
            h = self.norm_1(h)
        h = self.activation(h)
        h = self.drop_1(h)
        y = self.linear_1(h)
        y = y * self.scale + x
        return y  
        

In [10]:
# ---------- Multi-Block Residual MLP ----------
class ResidualMLP(nn.Module):
    def __init__(self, in_dim=64, working_dim = 256, hidden=512, num_blocks=4, p_drop=0.2, pre_norm=False, out_dim=1):
        super().__init__()
        # Raw input (size in_dim) → Stem (projects to working_dim) → Residual blocks → Final head
        self.stem = nn.Linear(in_dim, working_dim, bias=False)
        self.stem_norm = nn.LayerNorm(working_dim)
        self.stem_act  = nn.ReLU()
        self.pre_norm = pre_norm
        if num_blocks>8:
            scale_residual=0.1
        else:
            scale_residual=1
        self.blocks = nn.ModuleList([ResidualBlock(working_dim, hidden, p_drop, pre_norm, scale_residual) for _ in range(num_blocks)])
        self.ln = nn.LayerNorm(working_dim)
        self.final = nn.Linear(working_dim, out_dim)

    def forward(self, x):
        h = self.stem(x)
        if not self.pre_norm:
            h = self.stem_norm(h)
            h = self.stem_act(h)
        for block in self.blocks:
            h = block(h)
        y = self.ln(h)
        y = self.final(y)
        return y