在卷积神经网络中，卷积层之后通常会添加BatchNorm2d对数据进行归一化处理，这使得数据在进入ReLU之前不会因为数值过大而导致网络性能不稳定。BatchNorm2d()函数的数学原理如下：

$y=\frac{x-\mathrm{mean}(x)}{\sqrt{\mathrm{Var}(x)+eps}}\cdot gamma+beta$

[1_网络搭建/3_BatchNorm](https://enzo-miman.github.io/#/1_%E7%BD%91%E7%BB%9C%E6%90%AD%E5%BB%BA/3_BatchNorm)

[Bilibili](https://www.bilibili.com/video/BV11s4y1c7pg/?spm_id_from=333.788)

[Batch Norm详解之原理及为什么神经网络需要它](https://zhuanlan.zhihu.com/p/441573901)

[【基础算法】六问透彻理解BN(Batch Normalization）](https://zhuanlan.zhihu.com/p/93643523)


![cal_pic](https://p.ipic.vip/pn83n9.png)

In [35]:
import numpy as np
import torch.nn as nn
import torch

def batch_norm(feature, statistic_mean, statistic_var, momentum=0.1, eps=1e-5):
    """Apply training-mode batch normalization by hand, one channel at a time.

    The computation mirrors what ``torch.nn.BatchNorm2d`` does in training
    mode with ``weight == 1`` and ``bias == 0`` (no affine step is applied
    here).

    Args:
        feature: 4-D NumPy array indexed as ``[:, channel, :, :]``. It is
            normalized IN PLACE, so pass a copy if the original values are
            still needed. NOTE(review): presumably a floating-point array;
            an integer array would truncate the normalized values on
            assignment.
        statistic_mean: Mutable sequence with one running mean per channel;
            updated in place.
        statistic_var: Mutable sequence with one running variance per
            channel; updated in place.
        momentum: Weight of the current batch statistic in the running
            update: ``new = (1 - momentum) * old + momentum * batch``.
        eps: Constant added to the variance inside the square root for
            numerical stability.

    Returns:
        None. Results are written into ``feature``, ``statistic_mean`` and
        ``statistic_var``; the updated running statistics are also printed.
    """
    for i in range(feature.shape[1]):
        channel = feature[:, i, :, :]
        mean = channel.mean()
        # Normalization uses the biased (population, ddof=0) variance ...
        biased_var = channel.var()
        # ... while the running variance accumulates the unbiased (sample,
        # ddof=1) estimate.
        unbiased_var = channel.var(ddof=1)

        # `channel - mean` allocates a new array, so writing the result back
        # into the view is safe.
        feature[:, i, :, :] = (channel - mean) / np.sqrt(biased_var + eps)

        # Exponential moving average of the per-channel statistics.
        statistic_mean[i] = statistic_mean[i] * (1 - momentum) + mean * momentum
        statistic_var[i] = statistic_var[i] * (1 - momentum) + unbiased_var * momentum

    print('statistic_mean : ', statistic_mean)
    print('statistic_var : ', statistic_var)

# Random 2x2x2x2 input shared by the manual and the PyTorch implementation.
feature_array = np.random.randn(2, 2, 2, 2)
# Independent float32 copy: batch_norm() overwrites feature_array in place.
feature_tensor = torch.tensor(feature_array.copy(), dtype=torch.float32)

# Running statistics, one entry per channel (mean starts at 0, variance at 1).
statistic_mean = [0.0] * 2
statistic_var = [1.0] * 2

# Manual batch normalization; prints the updated running mean / variance.
batch_norm(feature_array, statistic_mean, statistic_var)

# Same computation through torch.nn.BatchNorm2d for comparison.
bn = nn.BatchNorm2d(num_features=2, eps=1e-5)
output = bn(feature_tensor)

print('bn.running_mean : ', bn.running_mean)
print('bn.running_var : ', bn.running_var)
# Learnable scale (weight) and shift (bias) of the layer.
for learnable in (bn.weight, bn.bias):
    print(learnable)

statistic_mean :  [0.014776679358404921, 0.05635309876238154]
statistic_var :  [0.93756777599325, 0.9465390281782048]
bn.running_mean :  tensor([0.0148, 0.0564])
bn.running_var :  tensor([0.9376, 0.9465])
Parameter containing:
tensor([1., 1.], requires_grad=True)
Parameter containing:
tensor([0., 0.], requires_grad=True)


In [17]:
# Inspect channel 0 of feature_array (already normalized in place if the
# batch_norm() cell was executed before this one).
feature = feature_array
channel = feature[:, 0]
print(channel.view())

mean = channel.mean()         # mean
std_1 = channel.std(ddof=0)   # population standard deviation
std_t2 = channel.std(ddof=1)  # sample standard deviation
for stat in (mean, std_1, std_t2):
    print(stat)

[[[-0.59243467  0.93604603]
  [ 0.88346365  0.35842368]]

 [[ 0.8117685  -2.04183014]
  [ 0.51963757 -0.87507462]]]
2.7755575615628914e-17
0.9999971117586312
1.069041879989797


In [27]:
import torch
import torch.nn as nn
# nn.BatchNorm2d arguments:
#   num_features - number of channels of an expected input of size
#                  batch_size * num_features * height * width
#   eps          - value added to the denominator for numerical stability
#                  (default: 1e-5)
#   momentum     - factor used when computing running_mean and running_var
#                  (default: 0.1)
#   affine       - True means the learnable weight and bias are used
m = nn.BatchNorm2d(num_features=2, affine=True)
# NOTE(review): `input` shadows the builtin; the name is kept because other
# cells may refer to it.
input = torch.randn(1, 2, 3, 4)
output = m(input)

for item in (input, m.weight, m.bias, output, output.size()):
    print(item)

tensor([[[[-1.5798, -2.5201,  0.3904,  1.5037],
          [ 0.2580,  0.3442,  1.2225,  1.0226],
          [-0.4528, -0.3625,  1.8377, -0.0065]],

         [[ 1.2658, -0.5274, -1.7758, -1.0991],
          [ 0.6457,  1.5849,  0.0869, -0.3118],
          [ 0.3529, -0.3111,  0.2038,  1.2938]]]])
Parameter containing:
tensor([1., 1.], requires_grad=True)
Parameter containing:
tensor([0., 0.], requires_grad=True)
tensor([[[[-1.4238, -2.2031,  0.2091,  1.1317],
          [ 0.0994,  0.1708,  0.8987,  0.7331],
          [-0.4897, -0.4149,  1.4086, -0.1199]],

         [[ 1.1970, -0.6720, -1.9732, -1.2679],
          [ 0.5507,  1.5295, -0.0318, -0.4473],
          [ 0.2454, -0.4466,  0.0901,  1.2261]]]],
       grad_fn=<NativeBatchNormBackward0>)
torch.Size([1, 2, 3, 4])


In [42]:
# Sanity checks on the layer object `m`: its type and whether it exposes a
# `weight` attribute.
for check in (isinstance(m, nn.BatchNorm2d), hasattr(m, 'weight')):
    print(check)
# Further exploration (disabled): iterate m.named_parameters() and print each
# (name, param) pair, or call dir(bn).

True
True


In [21]:
import math

In [24]:
# Check std_population * sqrt(n) == std_sample * sqrt(n - 1) for the n = 8
# values of one channel, using the two standard deviations printed earlier;
# the difference should be ~0 up to floating-point rounding.
math.sqrt(8) * 0.9999971117586312 - math.sqrt(7) * 1.069041879989797

-4.440892098500626e-16