In [1]:
import torch
import torch.nn as nn
import math

In [2]:
# Quick check: wrapping a tensor in nn.Parameter yields a tensor flagged
# requires_grad=True (see the echoed repr below this cell).
nn.Parameter(torch.ones(3))

Parameter containing:
tensor([1., 1., 1.], requires_grad=True)

In [3]:
import torch
from torch import nn

class LayerNormalization(nn.Module):
    """Normalize over the last dimension, then apply a learnable scale and shift.

    Each slice along the last axis is centered on its mean and divided by
    ``std + eps``; the result is multiplied element-wise by ``alpha`` and
    offset by ``bias``.

    Args:
        features: Size of the last dimension of the inputs; ``alpha`` and
            ``bias`` are created with this many elements.
        eps: Small constant added to the standard deviation (not to the
            variance) so the division stays finite when the std is zero.
    """

    def __init__(self, features: int, eps: float = 10**-6) -> None:
        super().__init__()
        self.eps = eps
        # Learnable per-feature scale, initialised to one (identity scaling).
        self.alpha = nn.Parameter(torch.ones(features))
        # Learnable per-feature shift, initialised to zero (no offset).
        self.bias = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias``.

        ``mean`` and ``std`` are taken over the last dimension with
        ``keepdim=True`` so they broadcast back against ``x``. ``torch.std``
        is called with its default settings.
        """
        mu = torch.mean(x, dim=-1, keepdim=True)
        sigma = torch.std(x, dim=-1, keepdim=True)

        # Same evaluation order as the one-line formula: scale the centered
        # values first, then divide, then shift.
        scaled = self.alpha * (x - mu)
        denominator = sigma + self.eps
        return scaled / denominator + self.bias

# Demo dimensions: 10 sequences per batch, 20 tokens per sequence,
# 512 features per token.
batch_size, seq_len, hidden_size = 10, 20, 512

# Build the layer so its parameter size matches the feature dimension.
layer_norm = LayerNormalization(features=hidden_size)

# Random input of shape (batch_size, seq_len, hidden_size).
x = torch.randn(batch_size, seq_len, hidden_size)

# Run the input through the layer.
output = layer_norm(x)

# The printed shape should equal the input shape: torch.Size([10, 20, 512]).
print(output.shape)

torch.Size([10, 20, 512])
