### 层归一化 LayerNorm
  - 是一种归一化（Normalization）方法
  - 提高模型的训练稳定性
  - 通过对每一层的输入进行标准化，使得网络中的每一层在训练过程中保持相对一致的分布
  - 加速收敛
  - 缓解梯度消失问题
  - 在特征维度上进行标准化
  - 使用可学习的缩放参数和偏置参数进行调整
  - 比 BatchNorm 更适合序列建模任务和小批量数据训练

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

torch.manual_seed(42) # Fix the random seed so that every run produces the same results

<torch._C.Generator at 0x7cd90cd4ef10>

In [6]:
# A network layer that normalises its input before the linear transform
class LayerNormBlock(nn.Module):
    """LayerNorm -> Linear -> ReLU block that keeps the feature width unchanged.

    Args:
        embedding_size: Size of the last input dimension; the linear layer
            maps it back to the same width.
    """

    def __init__(self, embedding_size):
        super().__init__()
        # Sub-modules are registered in the order they are applied.
        self.layer_norm = nn.LayerNorm(embedding_size)
        self.fc = nn.Linear(embedding_size, embedding_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(fc(layer_norm(x)))``."""
        normalized = self.layer_norm(x)
        return self.relu(self.fc(normalized))

In [7]:
# Simple classifier built from stacked layer-normalised blocks
class SimpleLayerNormModel(nn.Module):
    """Three ``LayerNormBlock`` stages followed by a linear classification head.

    ``layer1`` operates at ``input_dim``; ``layer2`` and ``layer3`` operate at
    ``hidden_dim``.  A bridge between them maps ``input_dim`` to ``hidden_dim``
    so the model also works when the two widths differ.  When they are equal
    the bridge is an ``nn.Identity``, which has no parameters, so the set of
    parameters and the computed outputs are the same as without it.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        hidden_dim: Feature width used by the second and third blocks.
        num_classes: Number of output logits produced by ``fc_out``.
    """

    def __init__(self, input_dim: int, hidden_dim: int, num_classes: int = 10):
        super().__init__()
        self.layer1 = LayerNormBlock(input_dim)
        # layer1 preserves its input width, so its output must be projected
        # before layer2 whenever hidden_dim is a different size.
        if input_dim == hidden_dim:
            self.input_proj = nn.Identity()
        else:
            self.input_proj = nn.Linear(input_dim, hidden_dim)
        self.layer2 = LayerNormBlock(hidden_dim)
        self.layer3 = LayerNormBlock(hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return the class logits for ``x`` (last dimension ``num_classes``)."""
        out = self.layer1(x)
        out = self.input_proj(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.fc_out(out)
        return out

In [8]:
# Random batch standing in for real data: 32 samples with 128 features each
input_data = torch.randn((32, 128))

# Instantiate the network (input and hidden widths are both 128, 10 classes)
model = SimpleLayerNormModel(128, 128, num_classes=10)

# Run the forward pass and show the shape of the result, expected (32, 10)
output = model(input_data)
print(output.size())

torch.Size([32, 10])


In [9]:
# Loss function and optimizer: cross-entropy loss, Adam over all model parameters
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [10]:
# Simulated training targets: one random class label in [0, 10) per sample, batch of 32
targets = torch.randint(low=0, high=10, size=(32,))

In [11]:
# One optimisation step.  The forward pass is run here instead of reusing a
# tensor produced by an earlier cell: backward() releases the graph's saved
# buffers, so backpropagating through a stale `output` a second time raises a
# RuntimeError, and a stale `output` would not reflect weights already changed
# by optimizer.step().
optimizer.zero_grad()
output = model(input_data)
loss = criterion(output, targets)
loss.backward()
optimizer.step()
print(f"Loss: {loss.item()}")

Loss: 2.318890333175659


In [12]:
# Show the mean and standard deviation of every LayerNorm parameter
for name, param in model.named_parameters():
    # Skip everything that does not belong to a layer_norm sub-module.
    if 'layer_norm' not in name:
        continue
    print(f"{name} - mean: {param.data.mean():.4f}, std: {param.data.std():.4f}")

layer1.layer_norm.weight - mean: 1.0000, std: 0.0010
layer1.layer_norm.bias - mean: 0.0001, std: 0.0010
layer2.layer_norm.weight - mean: 1.0000, std: 0.0010
layer2.layer_norm.bias - mean: -0.0002, std: 0.0010
layer3.layer_norm.weight - mean: 0.9999, std: 0.0010
layer3.layer_norm.bias - mean: -0.0001, std: 0.0010


In [14]:
# Run the model on inputs drawn from a different distribution
test_input = torch.randn(32, 128).mul(10).add(50)  # scale by 10, then shift by 50
test_output = model(test_input)
print("测试数据的网络输出:", test_output.shape)

测试数据的网络输出: torch.Size([32, 10])
