# Layer Normalization
层归一化（Layer Normalization）对每个样本单独进行归一化：在该样本自身的特征维度上计算均值和方差（例如，图像数据通常是 C、H、W 三个维度，文本数据通常是每个 token 的 embedding 维度），因此结果不依赖于 batch 中的其他样本。
## 计算方法
$$
\mu =\frac{1}{d}\sum _{i=1}^{d}x_{i}
$$

$$
\sigma ^{2}=\frac{1}{d}\sum _{i=1}^{d}(x_{i}-\mu )^{2}
$$
其中，$d$ 是样本的特征数量，$x_i$ 是样本的第 $i$ 个特征。
归一化（$\epsilon$ 是为避免除零而加入的一个很小的常数）：
$$
\widehat{x}_{i}=\frac{x_{i}-\mu }{\sqrt{\sigma ^{2}+\epsilon }}
$$
LayerNorm 还引入了两个可学习的参数 $\gamma$（缩放）和 $\beta$（偏移），可以对归一化后的结果进行线性变换：
$$
y_{i}=\gamma \widehat{x}_{i}+\beta
$$

## 输入格式
输入格式一般为：(batch_size, num_features)


图像数据是 (batch_size, channels, height, width)

文本数据是 (batch_size, seq_length, embed_size)


In [1]:
import torch
from torch import nn

class LayerNorm(nn.Module):
    """Layer normalization over the trailing ``normalized_shape`` dimensions.

    Each sample is normalized with the mean and (biased) variance computed
    over its last ``len(normalized_shape)`` dimensions, so the input may have
    any number of leading dimensions (e.g. ``(N, C, H, W)`` with
    ``normalized_shape=(C, H, W)`` or ``(N, L, D)`` with ``normalized_shape=D``).

    Args:
        normalized_shape: int or sequence of ints giving the shape of the
            trailing dimensions to normalize over.
        is_learn: when True, a learnable per-element scale (``gamma``) and
            shift (``beta``) are applied after normalization.
        eps: constant added to the variance for numerical stability.
    """

    def __init__(self, normalized_shape, is_learn = False, eps=1e-5):
        super(LayerNorm, self).__init__()
        # Kept under the original (misspelled) name so existing code that
        # reads this attribute continues to work.
        self.noralized_shape = normalized_shape
        if isinstance(normalized_shape, int):
            normalized_shape = (normalized_shape,)
        self.normalized_shape = tuple(normalized_shape)
        self.is_learn = is_learn
        self.eps = eps

        if self.is_learn:
            self.gamma = nn.Parameter(torch.ones(self.normalized_shape))
            self.beta = nn.Parameter(torch.zeros(self.normalized_shape))
        else:
            self.gamma = None
            self.beta = None

    def forward(self, x):
        """Normalize ``x`` over its trailing ``normalized_shape`` dimensions.

        Raises:
            ValueError: if the trailing dimensions of ``x`` do not match
                ``normalized_shape``.
        """
        n_dims = len(self.normalized_shape)
        if tuple(x.shape[-n_dims:]) != self.normalized_shape:
            raise ValueError(
                f"expected input with trailing shape {self.normalized_shape}, "
                f"got input of shape {tuple(x.shape)}"
            )

        # Reduce over the last n_dims axes instead of a hard-coded (1, 2, 3),
        # so inputs of any rank are supported.
        reduce_dims = tuple(range(-n_dims, 0))
        mean = torch.mean(x, dim=reduce_dims, keepdim=True)
        var = torch.var(x, dim=reduce_dims, unbiased=False, keepdim=True)

        layer_norm = (x - mean) / torch.sqrt(var + self.eps)

        if self.is_learn:
            layer_norm = layer_norm * self.gamma + self.beta
        return layer_norm

In [2]:
# Test: compare the custom LayerNorm against torch.nn.LayerNorm.
feature_shape = (3, 32, 32)
x = torch.randn(16, *feature_shape)  # a batch of 16 three-channel 32x32 images
layernorm = LayerNorm(normalized_shape=feature_shape)
layernorm_nn = nn.LayerNorm(
    normalized_shape=list(feature_shape),
    elementwise_affine=True,
)
output, output_nn = layernorm(x), layernorm_nn(x)
are_equal = torch.allclose(output, output_nn, atol=1e-5)
print("Output of custom LayerNorm:", are_equal)

Output of custom LayerNorm: True


# Batch Normalization
BatchNorm 对一个 batch 内所有样本的同一特征（通道）做归一化。从维度上看，对每个通道在 N、H、W 三个维度上计算均值和方差并归一化，通道维度 C 保持不变。

BN抹杀了不同特征之间的大小关系，但是保留了不同样本间的大小关系；LN抹杀了不同样本间的大小关系，但是保留了一个样本内不同特征之间的大小关系。

In [21]:
import torch
from torch import nn

class BatchNorm(nn.Module):
    """Batch normalization using the statistics of the current batch.

    For an input of shape ``(N, C, *)`` the mean and (biased) variance are
    computed per channel over the batch dimension and all trailing
    dimensions, then a per-channel scale and shift are applied. No running
    statistics are tracked, so the batch statistics are always used.

    Args:
        num_features: number of channels ``C`` (mirrors ``nn.BatchNorm2d``).
        gamma: per-channel scale tensor with ``C`` elements.
        beta: per-channel shift tensor with ``C`` elements.
        eps: constant added to the variance for numerical stability.
    """

    def __init__(self, num_features, gamma, beta, eps=1e-5):
        super(BatchNorm, self).__init__()
        self.num_features = num_features
        # Kept under the original (misspelled) name so existing code that
        # reads this attribute continues to work.
        self.noralized_shape = num_features
        self.eps = eps

        self.gamma = gamma
        self.beta = beta

    def forward(self, x):
        """Normalize ``x`` of shape ``(N, C, *)`` per channel.

        Raises:
            ValueError: if ``x`` has fewer than 2 dimensions.
        """
        if x.dim() < 2:
            raise ValueError(
                f"expected input with at least 2 dimensions (N, C, ...), "
                f"got {x.dim()}"
            )

        # Reduce over every axis except the channel axis (dim 1), so that
        # (N, C), (N, C, L), (N, C, H, W), ... are all supported.
        reduce_dims = (0,) + tuple(range(2, x.dim()))
        # Shape that broadcasts the per-channel parameters against x.
        param_shape = (1, -1) + (1,) * (x.dim() - 2)

        mean = torch.mean(x, dim=reduce_dims, keepdim=True)
        var = torch.var(x, dim=reduce_dims, unbiased=False, keepdim=True)
        # NOTE: computing the statistics with torch.mean / torch.var may lead
        # to small precision differences.

        layer_norm = (x - mean) / torch.sqrt(var + self.eps)

        layer_norm = layer_norm * self.gamma.view(param_shape) + self.beta.view(param_shape)
        return layer_norm

In [22]:
# Test: compare the custom BatchNorm against torch.nn.BatchNorm2d.
x = torch.randn(16, 3, 32, 32)  # a batch of 16 three-channel 32x32 images
gamma = nn.Parameter(torch.randn(x.size(1)))
beta = nn.Parameter(torch.randn(x.size(1)))
batchnorm = BatchNorm(3, gamma=gamma, beta=beta)
batchnorm_nn = nn.BatchNorm2d(num_features=3)
# Share the affine parameters so both modules compute the same function.
batchnorm_nn.weight = gamma
batchnorm_nn.bias = beta
# Run the BatchNorm modules built above; calling the LayerNorm objects from
# the earlier cell here would make the comparison trivially pass.
output = batchnorm(x)
output_nn = batchnorm_nn(x)
are_equal = torch.allclose(output, output_nn, atol=1e-5)
print("Output of custom BatchNorm:", are_equal)

Output of custom BatchNorm: True


# RMSnorm

去除了减去均值的操作，也就是没有去中心化的操作，只有缩放的操作：用均方根 $\mathrm{RMS}(x)=\sqrt{\frac{1}{d}\sum_{i=1}^{d}x_{i}^{2}+\epsilon}$ 代替标准差进行缩放。当输入的均值为 0 时，RMSNorm 与 LayerNorm 等价。

优点：没有了去中心化的操作，可以提升运行效率。（多用于大语言模型）

In [7]:
class RMSNorm(torch.nn.Module):
    """Root-mean-square normalization over the trailing ``normalized_shape`` dims.

    The input is divided by ``sqrt(mean(x ** 2) + eps)`` computed over its
    last ``len(normalized_shape)`` dimensions (no mean subtraction), then a
    learnable per-element scale (``gamma``) and shift (``beta``) are applied.

    Args:
        normalized_shape: int or sequence of ints giving the shape of the
            trailing dimensions to normalize over.
        eps: constant added to the mean square for numerical stability.
    """

    def __init__(self, normalized_shape, eps=1e-6):
        super(RMSNorm, self).__init__()
        self.normalized_shape = normalized_shape
        if isinstance(normalized_shape, int):
            shape = (normalized_shape,)
        else:
            shape = tuple(normalized_shape)
        # Canonical tuple form, used to validate inputs and pick the axes.
        self._shape = shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))

    def forward(self, x):
        """Apply RMS normalization to ``x``.

        Raises:
            ValueError: if the trailing dimensions of ``x`` do not match
                ``normalized_shape``.
        """
        n_dims = len(self._shape)
        if tuple(x.shape[-n_dims:]) != self._shape:
            raise ValueError(
                f"expected input with trailing shape {self._shape}, "
                f"got input of shape {tuple(x.shape)}"
            )

        # Reduce over the last n_dims axes instead of a hard-coded (1, 2, 3),
        # so inputs of any rank are supported.
        reduce_dims = tuple(range(-n_dims, 0))
        # Root mean square of the features.
        norm = torch.sqrt(torch.mean(x ** 2, dim=reduce_dims, keepdim=True) + self.eps)
        # Normalize.
        x_normalized = x / norm
        # Apply the learnable scale and shift.
        RMS_Norm = x_normalized * self.gamma + self.beta
        return RMS_Norm

In [26]:
# Test: compare the custom RMSNorm against torch.nn.RMSNorm.
x = torch.randn(16, 3, 32, 32)
rmsnorm = RMSNorm(normalized_shape=(3, 32, 32))
rmsnorm_nn = nn.RMSNorm(normalized_shape=[3, 32, 32])
output = rmsnorm(x)
output_nn = rmsnorm_nn(x)
are_equal = torch.allclose(output, output_nn, atol=1e-5)
print("Output of custom RMSNorm:", are_equal)


Output of custom RMSNorm: True


# InstanceNorm
统计量的形状：$(B, C, H, W) \rightarrow (B, C, 1, 1)$

相当于对每个样本的每个通道单独做归一化（只在 H、W 维度上计算均值和方差）。这样可以保留每张图像自身的信号而不与 batch 中的其他样本混杂，因此常用于风格迁移等工作。

In [31]:
class InstanceNorm(nn.Module):
    """Instance normalization with a per-channel scale and shift.

    For an input of shape ``(N, C, *)`` the mean and (biased) variance are
    computed separately for every sample and every channel over the trailing
    (spatial) dimensions.

    Args:
        num_features: number of channels ``C``.
        gamma: per-channel scale tensor with ``C`` elements.
        beta: per-channel shift tensor with ``C`` elements.
        eps: constant added to the variance for numerical stability.
    """

    def __init__(self, num_features, gamma, beta, eps=1e-5):
        super(InstanceNorm, self).__init__()
        self.num_features = num_features
        # Kept under the original (misspelled) name so existing code that
        # reads this attribute continues to work.
        self.noralized_shape = num_features
        self.eps = eps

        self.gamma = gamma
        self.beta = beta

    def forward(self, x):
        """Normalize ``x`` of shape ``(N, C, *)`` per sample and per channel.

        Raises:
            ValueError: if ``x`` has fewer than 3 dimensions.
        """
        if x.dim() < 3:
            raise ValueError(
                f"expected input with at least 3 dimensions (N, C, ...), "
                f"got {x.dim()}"
            )

        # Reduce over every axis after the channel axis, so (N, C, L),
        # (N, C, H, W), (N, C, D, H, W), ... are all supported.
        reduce_dims = tuple(range(2, x.dim()))
        # Shape that broadcasts the per-channel parameters against x.
        param_shape = (1, -1) + (1,) * (x.dim() - 2)

        mean = torch.mean(x, dim=reduce_dims, keepdim=True)
        var = torch.var(x, dim=reduce_dims, unbiased=False, keepdim=True)

        layer_norm = (x - mean) / torch.sqrt(var + self.eps)
        layer_norm = layer_norm * self.gamma.view(param_shape) + self.beta.view(param_shape)
        return layer_norm

In [34]:
# Test: compare the custom InstanceNorm against torch.nn.InstanceNorm2d.
x = torch.randn(16, 3, 32, 32)  # a batch of 16 three-channel 32x32 images
gamma = nn.Parameter(torch.randn(x.size(1)))
beta = nn.Parameter(torch.randn(x.size(1)))
Instancenorm = InstanceNorm(num_features=3, gamma=gamma, beta=beta)
# affine=True so the reference module is declared as using weight and bias,
# which are then tied to the same gamma / beta as the custom module.
Instancenorm_nn = nn.InstanceNorm2d(num_features=3, affine=True)
Instancenorm_nn.weight = gamma
Instancenorm_nn.bias = beta
output = Instancenorm(x)
output_nn = Instancenorm_nn(x)
are_equal = torch.allclose(output, output_nn, atol=1e-5)
# The label names the module under test (it previously said "LayerNorm").
print("Output of custom InstanceNorm:", are_equal)

Output of custom LayerNorm: True
