In [1]:

import torch
from torch import nn
     

# Toy batch laid out as (B, S, E) = (1, 2, 3).
inputs = torch.tensor([[[0.2, 0.1, 0.3], [0.5, 0.1, 0.1]]])
B, S, E = inputs.size()
# Swap the first two axes to get (S, B, E). permute moves whole axes;
# reshape would only reinterpret the flat buffer and would interleave rows
# belonging to different batch items as soon as B > 1.
inputs = inputs.permute(1, 0, 2)
inputs.size()

torch.Size([2, 1, 3])

In [2]:
# Show the tensor in its (S, B, E) layout.
inputs

tensor([[[0.2000, 0.1000, 0.3000]],

        [[0.5000, 0.1000, 0.1000]]])

In [3]:
# The scale (gamma) and shift (beta) parameters cover the trailing two axes
# of `inputs`; gamma starts at one and beta at zero.
parameter_shape = inputs.shape[-2:]
gamma = nn.Parameter(torch.ones(*parameter_shape))
beta = nn.Parameter(torch.zeros(*parameter_shape))

In [5]:
# Inspect the shift parameter: zeros of shape `parameter_shape`.
beta

Parameter containing:
tensor([[0., 0., 0.]], requires_grad=True)

In [6]:
# Negative indices of the axes to normalize over: -1, -2, ... one per
# entry in `parameter_shape`.
dims = [-axis for axis in range(1, len(parameter_shape) + 1)]


In [7]:

# Mean over the normalized axes; keepdim=True keeps the reduced axes as
# size-1 so the result broadcasts against `inputs`.
mean = torch.mean(inputs, dim=dims, keepdim=True)
mean.shape

torch.Size([2, 1, 1])

In [8]:
epsilon = 1e-5  # added under the square root so std is never exactly zero
# Mean of squared deviations (divides by N, not N - 1) over the same axes.
var = (inputs - mean).pow(2).mean(dim=dims, keepdim=True)
std = torch.sqrt(var + epsilon)
std

tensor([[[0.0817]],

        [[0.1886]]])

In [9]:

# Standardize: subtract the mean and divide by the standard deviation.
centered = inputs - mean
y = centered / std
y

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]])

In [10]:
# Apply the learnable scale and shift to the standardized values.
out = beta + gamma * y


In [11]:

import torch
from torch import nn

class LayerNormalization(nn.Module):
    """Layer normalization over the trailing ``len(parameters_shape)`` axes.

    Each slice spanned by those trailing axes is standardized to zero mean
    and unit variance, then scaled by ``gamma`` and shifted by ``beta``.
    Subclassing ``nn.Module`` registers both parameters, so they show up in
    ``parameters()`` / ``state_dict()`` and follow ``.to(device)``.

    Args:
        parameters_shape: Shape of ``gamma`` and ``beta``. Its length decides
            how many trailing axes of the input are normalized, and it must
            broadcast against the input.
        eps: Small constant added to the variance before the square root.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, input):
        """Normalize ``input`` and return a tensor of the same shape.

        The intermediate mean, standard deviation, normalized values and
        final output are printed as the computation proceeds.
        """
        # Negative indices of the trailing axes to reduce over: -1, -2, ...
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        # Use the argument, not a notebook-level global, so the layer
        # normalizes whatever tensor it is actually given.
        mean = input.mean(dim=dims, keepdim=True)
        print(f"Mean \n ({mean.size()}): \n {mean}")
        # Mean of squared deviations (divides by N, not N - 1).
        var = ((input - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        print(f"Standard Deviation \n ({std.size()}): \n {std}")
        y = (input - mean) / std
        print(f"y \n ({y.size()}) = \n {y}")
        out = self.gamma * y + self.beta
        print(f"out \n ({out.size()}) = \n {out}")
        return out
     

# Random input of shape (sentence_length, batch_size, embedding_dim).
batch_size, sentence_length, embedding_dim = 3, 5, 8
inputs = torch.randn((sentence_length, batch_size, embedding_dim))

print(f"input \n ({inputs.size()}) = \n {inputs}")

input 
 (torch.Size([5, 3, 8])) = 
 tensor([[[ 0.6469,  1.3656,  1.3446,  0.3038, -0.5741,  2.0021, -0.2257,
           0.1254],
         [-0.2555, -0.3813,  1.4523,  0.7855,  0.6054,  0.3587, -1.0080,
           0.2108],
         [-0.1996, -0.9304,  0.8518,  0.7144, -0.6996, -0.1698, -0.4499,
          -0.1009]],

        [[-0.1454,  0.4202,  0.7579,  0.1820,  1.9422,  0.2353, -0.6983,
           0.3932],
         [ 0.4795,  0.1138,  0.6843, -0.9968, -0.8469,  0.8386, -0.6900,
           0.5337],
         [ 1.6088, -0.2448,  0.2011, -0.4285,  0.4154, -0.6001, -0.1717,
           0.2716]],

        [[-0.4426,  0.4266,  1.6828, -0.0876,  0.2802, -0.8874,  0.3211,
          -0.4766],
         [ 1.7027, -0.0258,  0.8888,  0.6342, -0.2795, -0.6697, -0.1828,
           1.0442],
         [ 1.2084, -0.6941,  1.0817, -0.1604,  0.2661, -0.4435,  0.7868,
          -0.0302]],

        [[ 1.0988, -0.0940,  0.4360, -0.6760,  0.1892,  0.2815, -0.8643,
          -1.5505],
         [ 0.7095, -0.6921, 

In [13]:

# Smaller random input of shape (sentence_length, batch_size, embedding_dim).
batch_size, sentence_length, embedding_dim = 3, 2, 3
inputs = torch.randn((sentence_length, batch_size, embedding_dim))

print(f"input \n ({inputs.size()}) = \n {inputs}")

input 
 (torch.Size([2, 3, 3])) = 
 tensor([[[-2.3107, -0.4618,  1.0862],
         [ 1.1488, -0.1019,  0.7588],
         [ 1.1869, -1.7824,  0.2725]],

        [[ 0.9561,  0.8370, -0.9578],
         [ 0.4475, -0.1893,  1.0462],
         [ 1.6932,  0.6806,  0.7444]]])
