In [3]:
import torch
import torch.nn as nn

## Linear Layer

A linear layer applies an affine transformation to its input, $y = Wx + b$: a matrix multiplication by the weight matrix $W$ followed by the addition of a bias vector $b$:

In [4]:
# Feature sizes: the layer consumes d_in values and produces d_out values.
d_in = 3
d_out = 5

# A single, un-batched input vector with d_in entries.
x = torch.randn(d_in)

# Build the layer from the named sizes instead of repeating the literals,
# so changing d_in / d_out above keeps the input and the layer in sync.
layer = nn.Linear(d_in, d_out)

In [6]:
# Apply the layer to the single input vector; the result has one entry
# per output feature (d_out = 5).
y = layer(x)
y.shape

torch.Size([5])

In [16]:
# Show the output values. The grad_fn in the repr indicates the result is
# attached to the autograd graph.
y

tensor([-0.9730,  0.6827, -0.5636, -0.4266, -0.2361], grad_fn=<ViewBackward0>)

In [14]:
# The layer's weight matrix, stored as a Parameter with requires_grad=True.
layer.weight

Parameter containing:
tensor([[-0.5517,  0.2568,  0.0783],
        [ 0.2566, -0.1705,  0.1225],
        [-0.3060, -0.3898, -0.2540],
        [-0.2436, -0.0818,  0.2361],
        [-0.2218,  0.1268,  0.2490]], requires_grad=True)

In [17]:
# The weight is stored as (d_out, d_in) = (5, 3): one row per output feature.
layer.weight.shape

torch.Size([5, 3])

In [15]:
# The layer's bias vector, also a Parameter with requires_grad=True.
layer.bias

Parameter containing:
tensor([ 0.2336,  0.0629, -0.1942, -0.0534,  0.2297], requires_grad=True)

In [18]:
# One bias entry per output feature: shape (d_out,) = (5,).
layer.bias.shape

torch.Size([5])

In [20]:
# Reproduce the layer by hand: weight matrix times input, plus bias.
# The values should match `y = layer(x)` computed earlier.
layer.weight @ x + layer.bias

tensor([-0.9730,  0.6827, -0.5636, -0.4266, -0.2361], grad_fn=<AddBackward0>)

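Note that we can also pass a batch through the linear layer: the same transformation is applied to every row, so an input of shape `(batch_dim, d_in)` yields an output of shape `(batch_dim, d_out)`: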

In [7]:
# Number of samples in the batch.
batch_dim = 4

# A batch of batch_dim input vectors, one per row, each with d_in features.
X = torch.randn(batch_dim, d_in)
# The same layer accepts the 2-D batch directly.
Y = layer(X)

In [8]:
# The batch dimension is preserved; only the last dimension changes (d_in -> d_out).
Y.shape

torch.Size([4, 5])

## Activation Functions

In [70]:
# These are the activation module classes themselves, not instances:
# the cell only displays where they live in torch.nn.
nn.ReLU, nn.GELU

(torch.nn.modules.activation.ReLU, torch.nn.modules.activation.GELU)

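## Dropout Layer

During training, dropout zeroes each input element independently with probability `p` and rescales the remaining elements by `1 / (1 - p)`, so the expected value of each element is unchanged: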

In [22]:
# Dropout module with drop probability p = 0.5.
dropout_layer = nn.Dropout(p=0.5)

# NOTE: this rebinds `x`, replacing the vector used in the linear-layer
# section; re-running those earlier cells afterwards will give different results.
x = torch.randn(d_in)
x

tensor([ 2.9677, -2.7410,  0.4559])

In [24]:
# Dropped entries become (signed) zero; the surviving entry is scaled by
# 1 / (1 - p) = 2, e.g. 2.9677 -> 5.9355.
dropout_layer(x)

tensor([5.9355, -0.0000, 0.0000])

In [25]:
# A fresh random batch (this rebinds `X`) to show dropout on 2-D input.
X = torch.randn(batch_dim, d_in)
X

tensor([[-0.6034, -1.3383, -1.0745],
        [ 0.0970,  1.1263,  1.3107],
        [ 0.5935,  1.6124, -0.3863],
        [-0.2381,  0.3930,  0.3202]])

In [26]:
# Different positions are zeroed in each row, and every surviving entry is
# doubled (e.g. -1.0745 -> -2.1490).
dropout_layer(X)

tensor([[-0.0000, -0.0000, -2.1490],
        [ 0.1939,  2.2525,  2.6215],
        [ 1.1871,  0.0000, -0.0000],
        [-0.0000,  0.0000,  0.6404]])

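## Layer Normalization

Layer normalization standardizes each sample across its feature dimension (the last axis) to zero mean and unit variance, and then applies a learnable element-wise scale and shift. The cells below first do the standardization by hand and then compare it with a module implementation: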

In [27]:
# A new random batch (this rebinds `X` again) used for the normalization walk-through.
X = torch.randn(batch_dim, d_in)
X

tensor([[ 0.0213, -0.9436, -1.0795],
        [-0.0361,  1.1067, -0.6574],
        [ 1.0809,  0.1795, -0.6091],
        [-1.0005, -1.0290,  1.0055]])

In [31]:
# Per-row mean over the feature axis; keepdim=True keeps a trailing axis of
# size 1 so the result broadcasts against X.
mean = torch.mean(X, dim=-1, keepdim=True)
mean

tensor([[-0.6673],
        [ 0.1378],
        [ 0.2171],
        [-0.3414]])

In [32]:
# Per-row variance over the feature axis.
# NOTE: with no `unbiased` argument this is the unbiased estimator (divide by
# N - 1), whereas the LayerNorm class further down passes unbiased=False
# (divide by N). The hand-computed X_norm therefore differs from a layer-norm
# output by a constant factor.
var = torch.var(X, dim=-1, keepdim=True)
var

tensor([[0.3603],
        [0.8007],
        [0.7151],
        [1.3607]])

In [33]:
# Standardize each row: subtract its mean and divide by its standard deviation.
# No epsilon is added here, so a row with zero variance would divide by zero.
X_norm = (X - mean) / var.sqrt()

In [34]:
# Display the standardized batch.
X_norm

tensor([[ 1.1473, -0.4605, -0.6868],
        [-0.1943,  1.0829, -0.8886],
        [ 1.0215, -0.0444, -0.9770],
        [-0.5651, -0.5895,  1.1546]])

In [35]:
# Sanity check: every row mean is zero up to floating-point rounding error.
X_norm.mean(dim=-1, keepdim=True)

tensor([[ 0.0000e+00],
        [-1.9868e-08],
        [-1.9868e-08],
        [ 0.0000e+00]])

In [37]:
# Sanity check: every row variance is 1. This uses the same default variance
# estimator as the `var` that X_norm was divided by, so the result is exactly 1.
X_norm.var(dim=-1, keepdim=True)

tensor([[1.0000],
        [1.0000],
        [1.0000],
        [1.0000]])

In [64]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension of the input.

    Each slice along the last axis is standardized to zero mean and unit
    (biased) variance, then transformed element-wise by a learnable scale
    and shift.

    Args:
        emb_dim: Size of the last dimension of the inputs; also the length
            of the learnable ``scale`` and ``shift`` vectors.
        eps: Small constant added to the variance before taking the square
            root, to avoid division by zero. Defaults to ``1e-5``.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        # Start as the identity transform: scale of ones, shift of zeros.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift.

        Args:
            x: Tensor whose last dimension must broadcast against the
                ``emb_dim``-long ``scale`` and ``shift`` parameters.

        Returns:
            Tensor with the same shape as ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False divides by N rather than N - 1.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [67]:
# PyTorch's built-in layer normalization over the last d_in features.
# Its output differs from the hand-computed X_norm because X_norm was divided
# by the unbiased standard deviation (default X.var()); the next cell shows the
# built-in output has unit variance under the biased estimator instead.
layer_norm = nn.LayerNorm(d_in)
layer_norm(X)

tensor([[ 1.4051, -0.5639, -0.8412],
        [-0.2380,  1.3263, -1.0883],
        [ 1.2510, -0.0544, -1.1966],
        [-0.6921, -0.7220,  1.4141]], grad_fn=<NativeLayerNormBackward0>)

In [68]:
# With the biased estimator (unbiased=False), every row of the built-in
# output has variance 1 to the displayed precision.
layer_norm(X).var(dim=-1, unbiased=False, keepdim=True)

tensor([[1.0000],
        [1.0000],
        [1.0000],
        [1.0000]], grad_fn=<VarBackward0>)