In [1]:
import torch
from torch import nn

In [2]:
# One sentence (batch of 1) containing two tokens, each a 3-dimensional
# embedding, so the starting layout is (B, S, E).
inputs = torch.tensor([[[0.2, 0.1, 0.3], [0.5, 0.1, 0.1]]])
B, S, E = inputs.size()  # B: batch size, S: sequence length, E: embedding length
# Move to a sequence-first (S, B, E) layout by swapping the first two axes.
# transpose() permutes the axes themselves; reshape() would only reinterpret
# the flat buffer, which happens to coincide here because B == 1 but would mix
# tokens from different sentences for any larger batch.
inputs = inputs.transpose(0, 1)
inputs.size()

torch.Size([2, 1, 3])

In [3]:
# The trailing two axes of `inputs` are the ones that get normalised; gamma
# (scale, initialised to 1) and beta (shift, initialised to 0) are the
# learnable parameters of the normalisation and share that shape.
parameter_shape = inputs.shape[-2:]
gamma = nn.Parameter(torch.ones(*parameter_shape))
beta = nn.Parameter(torch.zeros(*parameter_shape))

In [4]:
# Display the shapes of the scale and shift parameters side by side.
gamma.size(), beta.size()

(torch.Size([1, 3]), torch.Size([1, 3]))

In [5]:
# Negative indices of the axes to reduce over, counted from the last axis
# backwards: one index per entry of parameter_shape (-1, -2, ...).
dims = list(range(-1, -len(parameter_shape) - 1, -1))

In [6]:
# Display the list of axes that the mean and variance are reduced over.
dims

[-1, -2]

In [7]:
# Average over the normalised axes. keepdim=True keeps those axes with size 1
# so the result can be broadcast against `inputs` in the following cells.
mean = torch.mean(inputs, dim=dims, keepdim=True)
mean.shape

torch.Size([2, 1, 1])

In [8]:
# Display the mean values computed over the normalised axes.
mean

tensor([[[0.2000]],

        [[0.2333]]])

In [9]:
epsilon = 1e-5  # small constant added to the variance before the square root
# Population variance: the mean of the squared deviations over the same axes
# that were used for `mean`.
var = torch.mean((inputs - mean).pow(2), dim=dims, keepdim=True)
std = torch.sqrt(var + epsilon)
std

tensor([[[0.0817]],

        [[0.1886]]])

In [10]:
# Standardise: subtract the mean and divide by the (epsilon-stabilised)
# standard deviation; both broadcast across the reduced axes.
y = torch.div(inputs - mean, std)
y

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]])

In [11]:
# Apply the learnable scale (gamma) and then the learnable shift (beta).
out = torch.add(torch.mul(gamma, y), beta)

In [12]:
# Display the result; gamma is all ones and beta all zeros, so the values
# match `y`, but the tensor now carries a grad_fn because both are Parameters.
out

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]], grad_fn=<AddBackward0>)

## LayerNormalization class

In [13]:
import torch
from torch import nn

class LayerNormalization(nn.Module):
    """Layer normalisation over the trailing dimensions of a tensor.

    The last ``len(parameters_shape)`` dimensions of the input are normalised
    to zero mean and unit (population) variance, then scaled by ``gamma`` and
    shifted by ``beta``.

    Subclassing ``nn.Module`` registers ``gamma`` and ``beta`` as parameters,
    so they show up in ``.parameters()`` and follow ``.to(device)``.

    Args:
        parameters_shape: Shape of the trailing dimensions to normalise over,
            e.g. ``x.size()[-1:]``. Must support ``len()``.
        eps: Constant added to the variance before taking the square root.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))   # learnable scale
        self.beta = nn.Parameter(torch.zeros(parameters_shape))   # learnable shift

    def forward(self, input):
        """Normalise ``input`` and print every intermediate tensor.

        Args:
            input: Tensor whose trailing dimensions are broadcast-compatible
                with ``parameters_shape``.

        Returns:
            Tensor of the same shape as ``input``: ``gamma * y + beta`` where
            ``y`` is the standardised input.
        """
        # Reduce over the last len(parameters_shape) axes: [-1, -2, ...].
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        # Use the argument, not a notebook-level global, so the layer works on
        # whatever tensor the caller passes in.
        mean = input.mean(dim=dims, keepdim=True)
        print(f"Mean \n ({mean.size()}): \n {mean}")
        var = ((input - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        print(f"Standard Deviation \n ({std.size()}): \n {std}")
        y = (input - mean) / std
        print(f"y \n ({y.size()}) = \n {y}")
        out = self.gamma * y + self.beta
        print(f"out \n ({out.size()}) = \n {out}")
        return out

In [14]:
batch_size = 3
sentence_length = 5
embedding_dim = 8
# Seed the global RNG so that Restart & Run All regenerates the same batch.
torch.manual_seed(0)
# Random activations laid out as (sentence_length, batch_size, embedding_dim).
inputs = torch.randn(sentence_length, batch_size, embedding_dim)

print(f"input \n ({inputs.size()}) = \n {inputs}")

input 
 (torch.Size([5, 3, 8])) = 
 tensor([[[ 1.1932, -0.3584, -2.3048,  1.0391, -0.3112, -0.3776,  0.2972,
          -0.5690],
         [-1.0171, -1.2674,  0.0736, -1.0093, -1.0817,  0.7104, -0.0110,
           0.0963],
         [-0.4091,  0.3082,  0.4149, -0.4959,  0.2493,  1.1680,  0.0595,
           1.2665]],

        [[-0.5962,  0.2269, -0.7407, -2.0755,  1.3221, -0.6562,  0.6296,
           0.1039],
         [ 0.9871, -1.7799, -1.3770,  0.5051,  0.3637, -0.4608, -0.5525,
           0.5211],
         [ 2.0222,  0.7592,  0.9755,  0.0372,  0.4032,  1.4789, -1.7278,
           1.5370]],

        [[ 0.9787, -1.4767,  0.6266,  0.9696, -0.9668, -0.2342,  1.3896,
           0.3429],
         [-1.4336,  0.9728, -1.0355, -1.9432, -1.3162,  0.9092,  1.0335,
          -0.6664],
         [ 0.9723, -0.2787,  0.4769, -0.2136,  1.5746, -0.5242,  0.5870,
          -1.9245]],

        [[ 2.4720,  0.3176, -0.2830, -1.1147,  0.5534,  0.6577, -1.1523,
           0.1012],
         [-1.6620, -0.6744, 

In [15]:
# Normalise over the last axis of `inputs` only, so gamma and beta get one
# entry per element of that axis.
layer_norm = LayerNormalization(parameters_shape=inputs.shape[-1:])

In [16]:
# Run the layer on the random batch; forward() prints the mean, standard
# deviation, normalised tensor and final output as it goes.
out = layer_norm.forward(inputs)

Mean 
 (torch.Size([5, 3, 1])): 
 tensor([[[-0.1739],
         [-0.4383],
         [ 0.3202]],

        [[-0.2233],
         [-0.2242],
         [ 0.6857]],

        [[ 0.2037],
         [-0.4349],
         [ 0.0837]],

        [[ 0.1940],
         [-0.1200],
         [ 0.1681]],

        [[ 0.1838],
         [ 0.2186],
         [-0.0830]]])
Standard Deviation 
 (torch.Size([5, 3, 1])): 
 tensor([[[1.0219],
         [0.6903],
         [0.6015]],

        [[0.9650],
         [0.9228],
         [1.0929]],

        [[0.9472],
         [1.1406],
         [1.0027]],

        [[1.0788],
         [0.9348],
         [0.8256]],

        [[0.9951],
         [0.8764],
         [0.8718]]])
y 
 (torch.Size([5, 3, 8])) = 
 tensor([[[ 1.3379, -0.1805, -2.0853,  1.1870, -0.1344, -0.1993,  0.4610,
          -0.3866],
         [-0.8385, -1.2011,  0.7415, -0.8271, -0.9320,  1.6640,  0.6189,
           0.7744],
         [-1.2124, -0.0199,  0.1575, -1.3568, -0.1178,  1.4096, -0.4334,
           1.5732]],



In [17]:
# Sanity check on the first slice along axis 0 (shape: batch_size x embedding_dim).
# The mean is ~0 as expected. Tensor.std() defaults to the unbiased (N - 1)
# estimator, so over these 24 values it comes out near sqrt(24 / 23) ~= 1.0215
# rather than exactly 1.
out[0].mean(), out[0].std()

(tensor(2.9802e-08, grad_fn=<MeanBackward0>),
 tensor(1.0215, grad_fn=<StdBackward0>))