In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [36]:
# Shared configuration for every example in this notebook.

# Seed the global RNG so the random tensors (and therefore every printed
# result below) are identical after "Restart Kernel -> Run All".
seed = 42
torch.manual_seed(seed)

batch_size = 3
seq_len = 4
feature_size = 5

# for 4D tensor like image with shape (batch_size, channels, height, width)
channels = 3
height = 4
width = 4

# Small constant added to the variance for numerical stability
# (same value as the default `eps` of the torch.nn normalization layers).
eps = 1e-5

### Batch Normalization (BatchNorm)

$$ \text{BN}(\mathbf{x}) = \mathbf{\gamma}\odot\frac{\mathbf{x}-\hat{\mathbf{\mu}}_\mathcal{B}}{\hat{\mathbf{\sigma}}_\mathcal{B}} + \mathbf{\beta} $$

$$ \hat{\mathbf{\mu}}_\mathcal{B} = \frac{1}{|\mathcal{B}|}\sum_{\mathbf{x}\in\mathcal{B}} \mathbf{x} $$
$$ \hat{\mathbf{\sigma}}_{\mathcal{B}}^{2} = \frac{1}{|\mathcal{B}|}\sum_{\mathbf{x}\in\mathcal{B}} (\mathbf{x}-\hat{\mathbf{\mu}}_\mathcal{B})^2 + \epsilon $$

#### batchnorm1d

In [18]:
# Random 2D input of shape (batch_size, feature_size).
# Besides the full tensor we show its first row (kept 2D as (1, feature_size))
# and its first column (kept 2D as (batch_size, 1)).
input_tensor2d = torch.randn((batch_size, feature_size))
print('\n'.join([
    f'The shape of tensor_2d: {input_tensor2d.shape}',
    f'The tensor_2d:\n {input_tensor2d}',
    f'The first batch of 2D tensor:\n {input_tensor2d[:1]}',
    f'The first dimension of 2D tensor:\n {input_tensor2d[:, :1]}',
]))

The shape of tensor_2d: torch.Size([3, 5])
The tensor_2d:
 tensor([[-1.1184, -1.0546, -1.2016,  0.7217, -0.6093],
        [ 0.1426, -0.0405, -1.0825, -1.2979, -0.3769],
        [ 1.0711,  0.6616, -0.3377,  1.0691,  0.2124]])
The first batch of 2D tensor:
 tensor([[-1.1184, -1.0546, -1.2016,  0.7217, -0.6093]])
The first dimension of 2D tensor:
 tensor([[-1.1184],
        [ 0.1426],
        [ 1.0711]])


In [23]:
# Manual BatchNorm for a (batch_size, feature_size) tensor: each feature column
# is normalized with statistics computed across the batch dimension (dim=0).
bn_gamma = torch.ones(feature_size)   # scale (gamma), identity at initialisation
bn_beta = torch.zeros(feature_size)   # shift (beta), zero at initialisation
bn_mean_iter = input_tensor2d.mean(dim=0, keepdim=True) # (1, feature_size)
print(f'The shape of bn_mean_iter: {bn_mean_iter.shape}')
print(f'The mean of 2D tensor:\n {bn_mean_iter}')
# nn.BatchNorm1d normalizes with the biased variance (unbiased=False, i.e. 1/n instead of
# Bessel's 1/(n-1)) and adds eps to the variance *inside* the square root:
# sigma = sqrt(var + eps), not std + eps.
bn_var_iter = input_tensor2d.var(dim=0, keepdim=True, unbiased=False) # (1, feature_size)
bn_std_iter = torch.sqrt(bn_var_iter + eps) # (1, feature_size)
print(f'The shape of bn_std_iter: {bn_std_iter.shape}')
print(f'The std of 2D tensor:\n {bn_std_iter}')
bn_input = bn_gamma * (input_tensor2d - bn_mean_iter) / bn_std_iter + bn_beta
print(f'The shape of bn_input: {bn_input.shape}')
print(f'The BN tensor:\n {bn_input}')

print(f'\n The first element of BN tensor:\n bn_gamma[0]*(input_tensor2d[0, 0]-bn_mean_iter[0, 0])/bn_std_iter[0, 0]+bn_beta[0]={bn_gamma[0] * (input_tensor2d[0, 0]-bn_mean_iter[0, 0])/bn_std_iter[0, 0] + bn_beta[0]}')

# Reference result: a freshly constructed module is in training mode, so it
# normalizes with the statistics of this very batch.
bn_input_torch = nn.BatchNorm1d(feature_size, eps=eps)(input_tensor2d)
print(f'\n Using nn.BatchNorm1d:\n {bn_input_torch}')

# The manual computation must agree with the reference implementation.
assert torch.allclose(bn_input, bn_input_torch, atol=1e-5)

The shape of bn_mean_iter: torch.Size([1, 5])
The mean of 2D tensor:
 tensor([[ 0.0318, -0.1445, -0.8739,  0.1643, -0.2579]])
The shape of bn_std_iter: torch.Size([1, 5])
The std of 2D tensor:
 tensor([[0.8973, 0.7045, 0.3823, 1.0436, 0.3458]])
The shape of bn_input: torch.Size([3, 5])
The BN tensor:
 tensor([[-1.2818, -1.2919, -0.8570,  0.5341, -1.0159],
        [ 0.1235,  0.1477, -0.5457, -1.4011, -0.3440],
        [ 1.1583,  1.1442,  1.4027,  0.8670,  1.3599]])

 The first element of BN tensor:
 bn_gamma[0]*(input_tensor2d[0, 0]-bn_mean_iter[0, 0])/(bn_std_iter[0, 0]+eps)+bn_beta[0]=-1.281819462776184

 Using nn.BatchNorm1d:
 tensor([[-1.2818, -1.2919, -0.8570,  0.5341, -1.0159],
        [ 0.1235,  0.1477, -0.5457, -1.4011, -0.3440],
        [ 1.1583,  1.1442,  1.4027,  0.8670,  1.3599]],
       grad_fn=<NativeBatchNormBackward0>)


In [27]:
# 3D tensor
input_tensor3d = torch.randn(batch_size, seq_len, feature_size)
print(f'The shape of tensor_3d: {input_tensor3d.shape}')
print(f'The tensor_3d:\n {input_tensor3d}')
print(f'The first batch of 3D tensor:\n {input_tensor3d[0, :, :].unsqueeze(0)}')
print(f'The first slice of second dimension:\n {input_tensor3d[:, 0, :].unsqueeze(1)}')

The shape of tensor_3d: torch.Size([3, 4, 5])
The tensor_3d:
 tensor([[[-0.0982, -1.0667,  0.4885, -0.5220,  0.9251],
         [-2.7599, -2.4701,  2.1923, -0.8618, -2.6623],
         [-0.4728, -1.0070, -0.7028, -0.2707, -0.5866],
         [-1.4460, -1.0373,  0.5397, -1.3195,  0.0590]],

        [[ 0.5668, -1.5206,  0.3681, -0.6688,  0.0589],
         [ 0.4510, -1.5160,  0.8270,  0.0939, -0.3892],
         [ 1.1518,  0.2065,  0.1341, -0.3625,  1.2381],
         [-0.2900, -1.9708,  2.0321, -0.7334,  0.6031]],

        [[-0.4486, -2.1239,  0.4556,  1.2954,  0.8084],
         [ 0.0639,  0.0935,  1.4150, -0.6641,  0.3924],
         [ 0.5734, -0.3665, -0.1075, -0.2191, -0.4738],
         [-1.2907, -0.2323,  0.8580,  0.4622, -0.5655]]])
The first batch of 3D tensor:
 tensor([[[-0.0982, -1.0667,  0.4885, -0.5220,  0.9251],
         [-2.7599, -2.4701,  2.1923, -0.8618, -2.6623],
         [-0.4728, -1.0070, -0.7028, -0.2707, -0.5866],
         [-1.4460, -1.0373,  0.5397, -1.3195,  0.0590]]])
The

In [32]:
# Manual BatchNorm for a (batch_size, seq_len, feature_size) tensor where every
# (position, feature) pair is treated as its own feature: statistics are taken
# across the batch dimension only (dim=0).
bn_gamma = torch.ones(seq_len, feature_size)   # scale (gamma), identity at initialisation
bn_beta = torch.zeros(seq_len, feature_size)   # shift (beta), zero at initialisation
bn_mean_iter = input_tensor3d.mean(dim=0, keepdim=True) # (1, seq_len, feature_size)
print(f'The shape of bn_mean_iter: {bn_mean_iter.shape}')
# Biased variance (1/n) with eps added *inside* the square root, exactly as
# nn.BatchNorm1d does: sigma = sqrt(var + eps), not std + eps.
bn_var_iter = input_tensor3d.var(dim=0, keepdim=True, unbiased=False) # (1, seq_len, feature_size)
bn_std_iter = torch.sqrt(bn_var_iter + eps) # (1, seq_len, feature_size)
print(f'The shape of bn_std_iter: {bn_std_iter.shape}')
bn_input = bn_gamma * (input_tensor3d - bn_mean_iter) / bn_std_iter + bn_beta
print(f'The shape of bn_input: {bn_input.shape}')
print(f'The BN tensor:\n {bn_input}')

print(f'\n The first element of BN tensor:\n bn_gamma[0, 0]*(input_tensor3d[0, 0, 0]-bn_mean_iter[0, 0, 0])/bn_std_iter[0, 0, 0]+bn_beta[0, 0]={bn_gamma[0, 0] * (input_tensor3d[0, 0, 0]-bn_mean_iter[0, 0, 0])/bn_std_iter[0, 0, 0] + bn_beta[0, 0]}')

# Reference result: flatten (seq_len, feature_size) into seq_len*feature_size
# independent features so nn.BatchNorm1d normalizes each one over the batch,
# then restore the original 3D shape.
bn_input_torch = nn.BatchNorm1d(seq_len*feature_size, eps=eps)(input_tensor3d.view(batch_size, -1)).view(batch_size, seq_len, feature_size)
print(f'\n Using nn.BatchNorm1d:\n {bn_input_torch[0, :, :].unsqueeze(0)}')

# The manual computation must agree with the reference implementation.
assert torch.allclose(bn_input, bn_input_torch, atol=1e-5)

The shape of bn_mean_iter: torch.Size([1, 4, 5])
The shape of bn_std_iter: torch.Size([1, 4, 5])
The shape of bn_input: torch.Size([3, 4, 5])
The BN tensor:
 tensor([[[-0.2490,  1.1632,  1.0051, -0.6233,  0.8536],
         [-1.4056, -1.1084,  1.2773, -0.9333, -1.3707],
         [-1.3241, -1.2468, -1.3574,  0.2261, -0.7736],
         [-0.8533,  0.0603, -0.9404, -1.0645,  0.0561]],

        [[ 1.3301,  0.1150, -1.3638, -0.7877, -1.4032],
         [ 0.8380, -0.2065, -1.1643,  1.3868,  0.3837],
         [ 1.0922,  1.2014,  1.0222, -1.3218,  1.4120],
         [ 1.4033, -1.2538,  1.3849, -0.2740,  1.1957]],

        [[-1.0811, -1.2782,  0.3587,  1.4110,  0.5496],
         [ 0.5675,  1.3148, -0.1129, -0.4534,  0.9869],
         [ 0.2319,  0.0453,  0.3352,  1.0957, -0.6384],
         [-0.5501,  1.1935, -0.4444,  1.3385, -1.2518]]])

 The first element of BN tensor:
 bn_gamma[0, 0]*(input_tensor3d[0, 0, 0]-bn_mean_iter[0, 0, 0])/(bn_std_iter[0, 0, 0]+eps)+bn_beta[0, 0]=-0.2489834427833557

 Usi

#### batchnorm2d

In [37]:
# Random image-like 4D input of shape (batch_size, channels, height, width);
# only the first sample is printed, kept 4D as (1, channels, height, width).
input_tensor4d = torch.randn((batch_size, channels, height, width))
print('\n'.join([
    f'The shape of tensor_4d: {input_tensor4d.shape}',
    f'The first batch of 4D tensor:\n {input_tensor4d[:1]}',
]))

The shape of tensor_4d: torch.Size([3, 3, 4, 4])
The first batch of 4D tensor:
 tensor([[[[ 0.9570,  1.2314, -0.9661,  0.5621],
          [ 0.5525,  0.6786,  0.2468,  0.3984],
          [-1.1513,  0.8751,  0.0982, -0.2322],
          [-1.7415, -1.0709,  0.2398, -0.2371]],

         [[ 0.2142,  0.1374, -0.1000, -1.1841],
          [ 1.2448,  0.9691,  1.2455, -0.3678],
          [-1.2782,  1.4360, -0.7283, -0.7314],
          [-0.5961,  0.3292, -0.0860,  0.8337]],

         [[ 0.4999,  0.1215, -0.8561, -1.0602],
          [ 0.6408, -0.8781,  0.4281,  0.5179],
          [ 0.2410,  0.6312,  1.5939, -1.7955],
          [-0.1183,  0.5526,  0.4169, -0.4531]]]])


In [41]:
# Manual BatchNorm for a (batch_size, channels, height, width) tensor: each
# channel is normalized with statistics computed over the batch and both
# spatial dimensions (dims 0, 2, 3).
bn_gamma = torch.ones(channels, 1, 1)   # per-channel scale, broadcast over height and width
bn_beta = torch.zeros(channels, 1, 1)   # per-channel shift, broadcast over height and width
bn_mean_iter = input_tensor4d.mean(dim=(0,2,3), keepdim=True) # (1, channels, 1, 1)
print(f'The shape of bn_mean_iter: {bn_mean_iter.shape}')
print(f'The mean of 4D tensor:\n {bn_mean_iter}')
# Biased variance (1/n) with the shared eps added *inside* the square root,
# exactly as nn.BatchNorm2d does: sigma = sqrt(var + eps), not std + eps.
bn_var_iter = input_tensor4d.var(dim=(0,2,3), keepdim=True, unbiased=False) # (1, channels, 1, 1)
bn_std_iter = torch.sqrt(bn_var_iter + eps) # (1, channels, 1, 1)
print(f'The shape of bn_std_iter: {bn_std_iter.shape}')
print(f'The std of 4D tensor:\n {bn_std_iter}')
bn_input = bn_gamma * (input_tensor4d - bn_mean_iter) / bn_std_iter + bn_beta
print(f'The shape of bn_input: {bn_input.shape}')
print(f'The first batch of BN tensor:\n {bn_input[0, :, :, :].unsqueeze(0)}')

# Reference result: a freshly constructed module is in training mode, so it
# normalizes with the statistics of this very batch.
bn_input_torch = nn.BatchNorm2d(channels, eps=eps)(input_tensor4d)
print(f'\n Using nn.BatchNorm2d:\n {bn_input_torch[0, :, :, :].unsqueeze(0)}')

# The manual computation must agree with the reference implementation.
assert torch.allclose(bn_input, bn_input_torch, atol=1e-5)

The shape of bn_mean_iter: torch.Size([1, 3, 1, 1])
The mean of 4D tensor:
 tensor([[[[ 0.0243]],

         [[-0.0798]],

         [[ 0.0080]]]])
The shape of bn_std_iter: torch.Size([1, 3, 1, 1])
The std of 4D tensor:
 tensor([[[[0.9206]],

         [[0.8925]],

         [[0.9796]]]])
The shape of bn_input: torch.Size([3, 3, 4, 4])
The first batch of BN tensor:
 tensor([[[[ 1.0132,  1.3111, -1.0758,  0.5841],
          [ 0.5737,  0.7107,  0.2417,  0.4064],
          [-1.2769,  0.9241,  0.0803, -0.2786],
          [-1.9181, -1.1896,  0.2341, -0.2840]],

         [[ 0.3295,  0.2434, -0.0226, -1.2373],
          [ 1.4843,  1.1753,  1.4850, -0.3226],
          [-1.3427,  1.6985, -0.7265, -0.7301],
          [-0.5785,  0.4583, -0.0070,  1.0236]],

         [[ 0.5021,  0.1158, -0.8821, -1.0905],
          [ 0.6459, -0.9046,  0.4288,  0.5205],
          [ 0.2378,  0.6361,  1.6188, -1.8411],
          [-0.1290,  0.5558,  0.4174, -0.4707]]]])

 Using nn.BatchNorm2d:
 tensor([[[[ 1.0132,  1.311

### Layer Normalization (LayerNorm)

### Root Mean Square Layer Normalization (RMSNorm)