In [1]:
import torch
from torch import nn
from torch.nn import functional as F

1. **均值**
$$
\mu = \frac{1}{d} \sum_{i=1}^{d} x_i
$$

2. **方差**
$$
\sigma^2 = \frac{1}{d} \sum_{i=1}^{d} (x_i - \mu)^2
$$

3. **标准化**
$$
\hat{x}_i = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}}
$$

4. **仿射变换（可学习参数）**
$$
y_i = \gamma_i \hat{x}_i + \beta_i
$$

5. **最终形式**
$$
\text{LayerNorm}(x) = \gamma \odot \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta
$$

In [None]:
class LayerNorm(nn.Module):
    """Layer normalization applied along the last dimension of the input.

    Every slice along the last dimension is shifted to zero mean and scaled
    by the square root of its biased variance plus ``eps``. The result is
    then multiplied element-wise by the learnable ``gamma`` and offset by the
    learnable ``beta``.

    Args:
        d_model: Size passed to ``torch.ones`` / ``torch.zeros`` to create
            ``gamma`` and ``beta``; they are broadcast against the input.
        eps: Constant added to the variance before taking the square root,
            guarding against division by zero.
    """

    def __init__(self, d_model, eps=1e-12):
        super().__init__()
        self.d_model = d_model
        self.eps = eps
        # gamma starts at one and beta at zero, so the affine step is the
        # identity until the parameters are trained.
        self.gamma = nn.Parameter(data=torch.ones(self.d_model))
        self.beta = nn.Parameter(data=torch.zeros(self.d_model))

    def forward(self, x: torch.Tensor):
        """Normalize ``x`` over its last dimension and apply the affine map.

        Args:
            x: Input tensor; statistics are taken over ``dim=-1``.

        Returns:
            ``gamma * (x - mean) / sqrt(var + eps) + beta``.
        """
        # Per-slice mean, kept as a size-1 trailing dim so it broadcasts.
        mu = x.mean(dim=-1, keepdim=True)
        # Biased (population) variance: divides by d, not d - 1.
        sigma_sq = x.var(dim=-1, keepdim=True, unbiased=False)
        denom = torch.sqrt(sigma_sq + self.eps)
        x_hat = (x - mu) / denom
        return self.gamma * x_hat + self.beta