In [1]:
import torch
from torch import nn
     

In [2]:

# One sentence (batch of 1) holding 2 words, each with a 3-dimensional embedding.
inputs = torch.tensor([[[0.2, 0.1, 0.3], [0.5, 0.1, 0.1]]])
B, S, E = inputs.size()
# Move the sequence axis in front of the batch axis: (B, S, E) -> (S, B, E).
# transpose() actually swaps the axes; reshape() would only relabel the flat
# memory and would mix words from different sentences as soon as B > 1.
inputs = inputs.transpose(0, 1)
inputs.size()

torch.Size([2, 1, 3])

###### Above, 2 is the number of words, 1 is the batch size, and 3 is the embedding dimension.

In [6]:
# Shape of the trailing two axes (batch, embedding) of `inputs`; gamma and beta
# are created with this shape and the statistics are reduced over these axes.
# NOTE: the class demo further down passes inputs.size()[-1:] instead, i.e. it
# normalizes over the embedding axis only.
parameter_shape = inputs.shape[-2:]
parameter_shape

torch.Size([1, 3])

In [7]:
# Learnable affine parameters applied after normalization:
# gamma (scale) starts at 1 and beta (shift) starts at 0, so the layer is
# initially an identity on the normalized values.
gamma = nn.Parameter(data=torch.ones(parameter_shape), requires_grad=True)
beta = nn.Parameter(data=torch.zeros(parameter_shape), requires_grad=True)

In [10]:
# Display the scale parameter (initialised to ones).
gamma

Parameter containing:
tensor([[1., 1., 1.]], requires_grad=True)

### `requires_grad=True` means the tensor is a learnable parameter: gradients are tracked for it.

In [9]:
# gamma has the same shape as parameter_shape.
gamma.shape

torch.Size([1, 3])

In [11]:
# Negative indices of the axes to reduce over, counted from the last axis:
# one entry per dimension of parameter_shape, i.e. -1, -2, ...
dims = list(range(-1, -len(parameter_shape) - 1, -1))

In [13]:
# Display the axes that will be reduced over.
dims

[-1, -2]

In [16]:
# Per-word mean over the axes in `dims`; keepdim=True keeps the reduced axes
# as size 1 so the result broadcasts against `inputs`.
mean = torch.mean(inputs, dim=dims, keepdim=True)
print(mean.shape)
mean

torch.Size([2, 1, 1])


tensor([[[0.2000]],

        [[0.2333]]])

In [21]:
# Biased (population) variance: the mean of the squared deviations from `mean`,
# reduced over the same axes and kept broadcastable.
var = torch.mean(torch.square(inputs - mean), dim=dims, keepdim=True)
var

tensor([[[0.0067]],

        [[0.0356]]])

In [25]:
epsilon = 1e-5  # small constant added to the variance so the division below never hits zero
import math  # NOTE(review): not used in any visible cell - confirm before removing
# Standard deviation derived from the epsilon-stabilised variance.
std = torch.sqrt(var + epsilon)
std

tensor([[[0.0817]],

        [[0.1886]]])

In [27]:
# Normalize: centre each word on its mean and scale by its standard deviation.
y = torch.div(inputs - mean, std)
y

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]])

In [28]:

# Apply the learnable affine transform: shift (beta) plus scaled (gamma) normalized values.
out = beta + gamma * y

In [29]:
# Display the result; it carries a grad_fn because gamma and beta require gradients.
out

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]], grad_fn=<AddBackward0>)

## Class

In [36]:

import torch
from torch import nn

class LayerNormalization(nn.Module):
    """Layer normalization over the trailing dimensions of a tensor.

    The input is normalized to zero mean and unit (biased) variance over its
    last ``len(parameters_shape)`` axes, then scaled by ``gamma`` and shifted
    by ``beta``.

    Subclassing ``nn.Module`` registers ``gamma`` and ``beta`` as parameters,
    so they show up in ``.parameters()`` / ``state_dict()``, follow ``.to()``,
    and the layer can be called as ``layer(x)`` as well as ``layer.forward(x)``.

    Args:
        parameters_shape: Shape of the trailing axes to normalize over
            (a ``torch.Size``, tuple or list), or a single int for one axis.
        eps: Constant added to the variance before the square root to avoid
            division by zero.
        verbose: When True (the default), ``forward`` prints the intermediate
            mean, standard deviation, normalized values and output.
    """

    def __init__(self, parameters_shape, eps=1e-5, verbose=True):
        super().__init__()
        # Accept a bare int for the common "normalize the last axis" case.
        if isinstance(parameters_shape, int):
            parameters_shape = (parameters_shape,)
        self.parameters_shape = torch.Size(parameters_shape)
        self.eps = eps
        self.verbose = verbose
        # Scale starts at 1 and shift at 0, i.e. an identity affine transform.
        self.gamma = nn.Parameter(torch.ones(self.parameters_shape))
        self.beta = nn.Parameter(torch.zeros(self.parameters_shape))

    def forward(self, inputs):
        """Normalize ``inputs`` over its trailing axes and apply gamma/beta.

        Args:
            inputs: Tensor whose trailing axes broadcast against
                ``parameters_shape``.

        Returns:
            Tensor of the same shape as ``inputs``.
        """
        # Reduce over the last len(parameters_shape) axes: -1, -2, ...
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        mean = inputs.mean(dim=dims, keepdim=True)
        if self.verbose:
            print(f"Mean \n ({mean.size()}): \n {mean}")
        # Biased (population) variance, matching the usual layer-norm definition.
        var = ((inputs - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        if self.verbose:
            print(f"Standard Deviation \n ({std.size()}): \n {std}")
        y = (inputs - mean) / std
        if self.verbose:
            print(f"y \n ({y.size()}) = \n {y}")
        out = self.gamma * y + self.beta
        if self.verbose:
            print(f"out \n ({out.size()}) = \n {out}")
        return out

In [37]:
# Seed the global RNG so the random demo batch (and every statistic printed
# from it below) is identical on each Restart & Run All.
torch.manual_seed(0)
batch_size = 3
sentence_length = 5
embedding_dim = 8
# Random batch laid out as (sentence_length, batch_size, embedding_dim).
inputs = torch.randn(sentence_length, batch_size, embedding_dim)



In [38]:
# Show the shape and contents of the random input batch.
print(f"input \n ({inputs.size()}) = \n {inputs}")

input 
 (torch.Size([5, 3, 8])) = 
 tensor([[[-9.7133e-01,  3.4538e-01,  6.2232e-02, -7.3475e-01, -2.9677e-01,
          -6.9970e-01,  9.4477e-01,  7.8345e-01],
         [-2.7896e-01,  6.5391e-02,  9.4728e-01, -6.0492e-01, -6.1659e-01,
          -1.9093e+00, -7.7270e-03,  2.8011e-01],
         [-9.7236e-01,  3.4140e-02,  1.3527e+00, -1.0655e+00, -1.4385e+00,
          -9.7426e-01,  8.0860e-02, -1.7422e+00]],

        [[-7.8449e-01,  3.0231e-01,  5.7949e-01,  1.1428e+00,  5.1061e-01,
          -7.2614e-01, -6.0314e-01,  1.1902e-02],
         [ 1.0101e-01,  8.2658e-01,  1.3408e+00, -1.2632e+00, -2.7295e-01,
          -7.5048e-01, -7.4986e-01, -1.0533e-01],
         [-2.0456e+00,  7.5453e-01,  5.6586e-01, -6.8945e-01,  1.1970e+00,
          -4.1402e-01, -2.3778e-01, -7.9907e-01]],

        [[ 3.0690e-01, -1.3909e+00,  5.7740e-01, -2.5567e+00, -7.3442e-01,
          -3.7989e-01, -1.1128e-01,  7.1328e-01],
         [ 1.4228e+00, -8.3991e-02, -4.6483e-01, -1.3216e+00, -1.3080e+00,
          

In [39]:
# Normalize over the last axis only: parameters_shape is the embedding size.
layer_norm = LayerNormalization(inputs.shape[-1:])

In [40]:

# Run the layer on the random batch; forward() also prints the intermediate
# mean, standard deviation, normalized values and final output.
out = layer_norm.forward(inputs)

Mean 
 (torch.Size([5, 3, 1])): 
 tensor([[[-0.0708],
         [-0.2656],
         [-0.5906]],

        [[ 0.0542],
         [-0.1092],
         [-0.2086]],

        [[-0.4470],
         [-0.3479],
         [-0.0247]],

        [[-0.5034],
         [ 0.3674],
         [-0.0542]],

        [[ 0.1604],
         [-0.5473],
         [-0.2273]]])
Standard Deviation 
 (torch.Size([5, 3, 1])): 
 tensor([[[0.6755],
         [0.7814],
         [0.9474]],

        [[0.6593],
         [0.8064],
         [0.9683]],

        [[1.0311],
         [0.9551],
         [0.8892]],

        [[1.0125],
         [1.0753],
         [1.0369]],

        [[0.9410],
         [0.8212],
         [1.3969]]])
y 
 (torch.Size([5, 3, 8])) = 
 tensor([[[-1.3331e+00,  6.1616e-01,  1.9700e-01, -9.8284e-01, -3.3446e-01,
          -9.3094e-01,  1.5035e+00,  1.2647e+00],
         [-1.7116e-02,  4.2355e-01,  1.5521e+00, -4.3423e-01, -4.4916e-01,
          -2.1034e+00,  3.2998e-01,  6.9832e-01],
         [-4.0290e-01,  6.5943e

In [41]:
# Sanity check on the first sequence position (a 3 x 8 slice): the mean should be ~0.
# torch.std() applies Bessel's correction by default, while the layer divides by the
# biased (population) std of each row, so the pooled value is close to, but not exactly, 1.
out[0].mean(), out[0].std()

(tensor(-1.4901e-08, grad_fn=<MeanBackward0>),
 tensor(1.0215, grad_fn=<StdBackward0>))