In [1]:
import torch
import torch.nn as nn
import math

In [2]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension of the input.

    Each feature vector (the last axis of ``x``) is shifted to zero mean and
    scaled to unit variance, then passed through a learnable element-wise
    affine transform: ``para_mul * x_hat + para_bias``.

    The variance is the biased (population) estimate and ``eps`` is added
    inside the square root, i.e. ``x_hat = (x - mean) / sqrt(var + eps)``.
    This keeps the result finite even when the normalized dimension has a
    single element, where a Bessel-corrected std would be NaN.

    Args:
        feature_len: Size of the last dimension of the inputs; also the
            length of the learnable scale and bias vectors.
        eps: Small constant added to the variance for numerical stability.
    """

    def __init__(self, feature_len: int, eps: float=1e-6) -> None: 
        super().__init__()
        # Learnable scale (initialised to 1) and shift (initialised to 0),
        # so the layer starts out as a plain normalization.
        self.para_mul = nn.Parameter(torch.ones(feature_len))
        self.para_bias = nn.Parameter(torch.zeros(feature_len))
        self.eps = eps
    
    def forward(self, x: torch.Tensor) -> torch.Tensor: # x: [batch_size, seq_len, d_model]
        """Normalize ``x`` along its last dimension and apply scale and bias.

        Args:
            x: Tensor whose last dimension has length ``feature_len``,
                e.g. ``[batch_size, seq_len, d_model]``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        centered = x - mean
        # Biased variance: mean of squared deviations (no Bessel correction).
        var = centered.pow(2).mean(dim=-1, keepdim=True)
        # eps goes inside the square root so a zero variance never divides by zero.
        inv_std = torch.rsqrt(var + self.eps)
        return self.para_mul * centered * inv_std + self.para_bias # [batch_size, seq_len, d_model]

In [3]:
# Fix the RNG seed so the random demo input is reproducible across runs
torch.manual_seed(0)

# Demo dimensions: a mini-batch of 2 sequences, each with 4 words,
# each word represented by a 300-dimensional vector (d_model = 300)
batch_size, seq_len = 2, 4
feature_len = 300

# Create an instance of our LayerNorm class
layer_norm = LayerNorm(feature_len)

# Random input of shape [batch_size, seq_len, feature_len]; reusing
# feature_len keeps the input consistent with the layer's parameters
x = torch.rand(batch_size, seq_len, feature_len)

# Pass our sequences through the layer normalization
normalized_sequences = layer_norm(x)

# The layer must return a tensor with the same shape as its input
assert normalized_sequences.shape == x.shape

print(normalized_sequences.shape)  # Should output: torch.Size([2, 4, 300])

torch.Size([2, 4, 300])


In [4]:
# Display the layer-normalized tensor as this cell's output. It carries a
# grad_fn because the LayerNorm scale and bias are nn.Parameters.
normalized_sequences

tensor([[[ 0.0197, -1.0291,  0.9568,  ..., -0.0539,  0.2884,  0.7148],
         [ 0.2491,  1.4138,  1.1152,  ..., -0.5740, -0.6588,  0.0357],
         [ 0.3927, -1.2732,  0.9233,  ...,  0.9915,  0.7239,  0.4172],
         [ 0.4524, -0.2981, -1.0243,  ..., -0.5243, -1.6016,  0.9814]],

        [[-0.6347,  0.9279, -1.0341,  ..., -0.0521, -0.8354, -0.7375],
         [-1.5796,  0.8898,  0.2707,  ...,  0.0209, -0.7101,  1.1840],
         [-1.7730, -1.0018, -1.7518,  ...,  1.3988, -0.8080, -0.7552],
         [ 0.2187, -1.0349, -0.0763,  ...,  1.4860, -1.4960,  0.9116]]],
       grad_fn=<AddBackward0>)