In [1]:
import torch
import torch.nn as nn

# BatchNorm1d

In [2]:
#torch.nn.BatchNorm1d(num_features, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True, device=None, dtype=None)

Applies Batch Normalization over a 2D or 3D input as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift.

* Input: (N, C) or (N, C, L), where N is the batch size, C is the number of features or channels, and L is the sequence length

* Output: (N, C) or (N, C, L) (same shape as input)

## a) the inputy data is 2d

In [3]:
sample_size = 3
number_features = 4

x = torch.arange(sample_size*number_features).reshape(sample_size, number_features) * 1.0
print(x)

tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.]])


In [4]:
batch_norm_1d = nn.BatchNorm1d(num_features=number_features, momentum=1)

In [5]:
print("before process data:\n")
print(batch_norm_1d.running_mean)
print(batch_norm_1d.running_var)

before process data:

tensor([0., 0., 0., 0.])
tensor([1., 1., 1., 1.])


In [6]:
output = batch_norm_1d(x)
print(output)

tensor([[-1.2247e+00, -1.2247e+00, -1.2247e+00, -1.2247e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  1.1921e-07],
        [ 1.2247e+00,  1.2247e+00,  1.2247e+00,  1.2247e+00]],
       grad_fn=<NativeBatchNormBackward0>)


In [7]:
print("after process data:\n")
print(batch_norm_1d.running_mean)
print(batch_norm_1d.running_var)

after process data:

tensor([4., 5., 6., 7.])
tensor([16., 16., 16., 16.])


## how to reimplement it?

In [8]:
x.mean(0)

tensor([4., 5., 6., 7.])

In [9]:
x.var(0)

tensor([16., 16., 16., 16.])

In [10]:
expect_output = (x - x.mean(0))/torch.sqrt(x.var(0))
print(expect_output)

tensor([[-1., -1., -1., -1.],
        [ 0.,  0.,  0.,  0.],
        [ 1.,  1.,  1.,  1.]])


## b) the input is 3d data

In [11]:
sample_size = 3
number_feature = 4
sequence_length = 5

x = torch.arange(sample_size*number_features*sequence_length).reshape(sample_size, number_features, sequence_length) * 1.0
print(x)

tensor([[[ 0.,  1.,  2.,  3.,  4.],
         [ 5.,  6.,  7.,  8.,  9.],
         [10., 11., 12., 13., 14.],
         [15., 16., 17., 18., 19.]],

        [[20., 21., 22., 23., 24.],
         [25., 26., 27., 28., 29.],
         [30., 31., 32., 33., 34.],
         [35., 36., 37., 38., 39.]],

        [[40., 41., 42., 43., 44.],
         [45., 46., 47., 48., 49.],
         [50., 51., 52., 53., 54.],
         [55., 56., 57., 58., 59.]]])


In [12]:
batch_norm_1d = nn.BatchNorm1d(num_features=number_features, eps=0.0, momentum=1.0)

In [13]:
batch_norm_1d.running_mean

tensor([0., 0., 0., 0.])

In [14]:
batch_norm_1d.running_var

tensor([1., 1., 1., 1.])

In [15]:
output = batch_norm_1d(x)
print(output)

tensor([[[-1.3422e+00, -1.2812e+00, -1.2202e+00, -1.1592e+00, -1.0982e+00],
         [-1.3422e+00, -1.2812e+00, -1.2202e+00, -1.1592e+00, -1.0982e+00],
         [-1.3422e+00, -1.2812e+00, -1.2202e+00, -1.1592e+00, -1.0982e+00],
         [-1.3422e+00, -1.2812e+00, -1.2202e+00, -1.1592e+00, -1.0982e+00]],

        [[-1.2202e-01, -6.1009e-02,  5.2154e-08,  6.1009e-02,  1.2202e-01],
         [-1.2202e-01, -6.1009e-02, -3.3528e-08,  6.1009e-02,  1.2202e-01],
         [-1.2202e-01, -6.1009e-02,  0.0000e+00,  6.1009e-02,  1.2202e-01],
         [-1.2202e-01, -6.1009e-02, -8.5682e-08,  6.1009e-02,  1.2202e-01]],

        [[ 1.0982e+00,  1.1592e+00,  1.2202e+00,  1.2812e+00,  1.3422e+00],
         [ 1.0982e+00,  1.1592e+00,  1.2202e+00,  1.2812e+00,  1.3422e+00],
         [ 1.0982e+00,  1.1592e+00,  1.2202e+00,  1.2812e+00,  1.3422e+00],
         [ 1.0982e+00,  1.1592e+00,  1.2202e+00,  1.2812e+00,  1.3422e+00]]],
       grad_fn=<NativeBatchNormBackward0>)


In [16]:
batch_norm_1d.running_mean

tensor([22., 27., 32., 37.])

In [17]:
batch_norm_1d.running_var

tensor([287.8571, 287.8571, 287.8571, 287.8571])

## understand batchnorm1d on 3d data

In [18]:
x.select(dim=1,index=0)

tensor([[ 0.,  1.,  2.,  3.,  4.],
        [20., 21., 22., 23., 24.],
        [40., 41., 42., 43., 44.]])

In [19]:
x.select(dim=1,index=0).mean()

tensor(22.)

In [20]:
x.mean(dim=(0,2),keepdim=True) #== batch_norm_1d.running_mean

tensor([[[22.],
         [27.],
         [32.],
         [37.]]])

In [21]:
x.var(dim=(0,2),keepdim=True) #== batch_norm_1d.running_mean

tensor([[[287.8571],
         [287.8571],
         [287.8571],
         [287.8571]]])

In [22]:
expect_output = (x - x.mean(dim=(0,2),keepdim=True))/x.std(dim=(0,2),keepdim=True)
print(expect_output)

tensor([[[-1.2967, -1.2377, -1.1788, -1.1199, -1.0609],
         [-1.2967, -1.2377, -1.1788, -1.1199, -1.0609],
         [-1.2967, -1.2377, -1.1788, -1.1199, -1.0609],
         [-1.2967, -1.2377, -1.1788, -1.1199, -1.0609]],

        [[-0.1179, -0.0589,  0.0000,  0.0589,  0.1179],
         [-0.1179, -0.0589,  0.0000,  0.0589,  0.1179],
         [-0.1179, -0.0589,  0.0000,  0.0589,  0.1179],
         [-0.1179, -0.0589,  0.0000,  0.0589,  0.1179]],

        [[ 1.0609,  1.1199,  1.1788,  1.2377,  1.2967],
         [ 1.0609,  1.1199,  1.1788,  1.2377,  1.2967],
         [ 1.0609,  1.1199,  1.1788,  1.2377,  1.2967],
         [ 1.0609,  1.1199,  1.1788,  1.2377,  1.2967]]])


## what happens in eval mode?

In [23]:
batch_norm_1d.running_mean.data.fill_(10.0)
batch_norm_1d.running_var.data.fill_(1.0)

print(batch_norm_1d.running_mean)
print(batch_norm_1d.running_var)

batch_norm_1d.eval()

output = batch_norm_1d(x)
print(output)

tensor([10., 10., 10., 10.])
tensor([1., 1., 1., 1.])
tensor([[[-10.,  -9.,  -8.,  -7.,  -6.],
         [ -5.,  -4.,  -3.,  -2.,  -1.],
         [  0.,   1.,   2.,   3.,   4.],
         [  5.,   6.,   7.,   8.,   9.]],

        [[ 10.,  11.,  12.,  13.,  14.],
         [ 15.,  16.,  17.,  18.,  19.],
         [ 20.,  21.,  22.,  23.,  24.],
         [ 25.,  26.,  27.,  28.,  29.]],

        [[ 30.,  31.,  32.,  33.,  34.],
         [ 35.,  36.,  37.,  38.,  39.],
         [ 40.,  41.,  42.,  43.,  44.],
         [ 45.,  46.,  47.,  48.,  49.]]], grad_fn=<NativeBatchNormBackward0>)
