## 4.1 编码一个LLM架构
本章我们将会编写GPT-2模型架构。之所以选择该架构，是因为GPT-2的参数规模较小，可以直接在消费级GPU上训练，并且OpenAI已经公开了其预训练权重。GPT-3与GPT-2架构类似，只是参数规模从1.5B提升到了175B，因此无法在本地训练。

In [1]:
# Hyperparameters for the smallest GPT-2 variant; every model class in this
# chapter receives this mapping as its `cfg` argument.
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # rows of the token-embedding table / size of the output layer
    context_length=1024,   # rows of the positional-embedding table
    emb_dim=768,           # width of every embedding vector
    n_heads=12,            # forwarded to the attention module as `num_heads`
    n_layers=12,           # number of stacked transformer blocks
    drop_rate=0.1,         # probability used by every nn.Dropout
    qkv_bias=False,        # forwarded to the attention module as `qkv_bias`
)

In [2]:
import torch
import torch.nn as nn

class DummyGPTModel(nn.Module):
    """Skeleton of the GPT model assembled from placeholder sub-modules.

    The embeddings, dropout and output projection are real layers; the
    transformer blocks and the final normalization are the Dummy* classes.

    Args:
        cfg: Mapping with the keys 'vocab_size', 'context_length', 'emb_dim',
            'drop_rate' and 'n_layers'.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size = cfg['vocab_size']
        emb_dim = cfg['emb_dim']

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg['context_length'], emb_dim)
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        blocks = [DummyTransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = DummyLayerNorm(cfg)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map token ids of shape [batch_size, seq_len] to per-position logits."""
        _, num_tokens = in_idx.shape
        # Position ids are created on the same device as the token ids.
        positions = torch.arange(num_tokens, device=in_idx.device)

        # [batch_size, seq_len, emb_dim] + [seq_len, emb_dim] (broadcast over batch)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

class DummyTransformerBlock(nn.Module):
    """Placeholder transformer block that returns its input unchanged.

    Args:
        cfg: Accepted so the constructor can be called with a config mapping;
            it is not used.
    """

    def __init__(self, cfg):
        super().__init__()

    def forward(self, x):
        # Must accept and return a tensor: nn.Sequential calls every module
        # with the previous module's output.
        return x
    
class DummyLayerNorm(nn.Module):
    """Placeholder normalization layer that returns its input unchanged.

    Args:
        cfg: Accepted so the constructor can be called with a config mapping;
            it is not used.
    """

    def __init__(self, cfg):
        super().__init__()

    def forward(self, x):
        # Identity: the caller passes the hidden states and uses the result,
        # so a tensor has to be returned.
        return x

## 4.2 实现归一化层

In [3]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable affine map.

    Args:
        embed_dim: Size of the last dimension of the inputs.
        eps: Constant added to the variance before the square root to avoid
            division by zero.
    """

    def __init__(self, embed_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(embed_dim))
        self.shift = nn.Parameter(torch.zeros(embed_dim))

    def forward(self, x):
        """Normalize `x` along its last dimension; leading dimensions are free."""
        # Mean and variance are computed independently for every vector along
        # the last dimension.
        mean = torch.mean(x, dim=-1, keepdim=True)
        # Biased variance (unbiased=False) is used to stay aligned with the
        # GPT-2 source, which makes importing its parameters later easier.
        var = torch.var(x, dim=-1, keepdim=True, unbiased=False)
        x = (x - mean) / torch.sqrt(var + self.eps)
        # Scale and shift are per feature, since different features may need
        # different ranges.
        return self.scale * x + self.shift

* 为什么不省略归一化，直接学 γ 和 β？——因为归一化提供了训练稳定性，而直接学习仿射变换容易受输入分布的影响。
* 两者在数学上等价吗？——不等价。LayerNorm 是非线性操作，并且解耦了优化过程。
* γ 和 β 是多余的吗？——不是！它们让网络可以有选择地恢复所需的分布。
* LayerNorm 的核心价值是什么？——解耦 + 稳定 + 加速收敛，而不是单纯的“变换”。

Layer Norm VS. Batch Norm
如果您熟悉批量归一化（神经网络中一种常见且传统的归一化方法），您可能想知道它与层归一化相比如何。两者的区别在于归一化的维度：批量归一化在批次维度上归一化，而层归一化在特征维度上归一化。

训练或推理期间的批量大小往往由可用的硬件或特定用例决定。由于层归一化对每个输入独立进行归一化，不依赖于批量大小，因此它在这些场景中提供了更大的灵活性和稳定性。这对于分布式训练或在资源受限的环境中部署模型特别有利。

下面先编写GPT-2中用到的激活函数GELU，然后用其实现前馈层。

## 4.3 用GELU激活函数实现前馈层

In [4]:
class GELU(nn.Module):
    """GELU activation computed with the tanh approximation.

    Evaluates 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
    element-wise.
    """

    def forward(self, x):
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(coeff * cubic)
        return 0.5 * x * gate
    
class FeedForward(nn.Module):
    """Two-layer feed-forward network with a GELU between the linear layers.

    The first linear layer widens the input from cfg['emb_dim'] to four times
    that size; the second projects it back to cfg['emb_dim'].

    Args:
        cfg: Mapping that provides the key 'emb_dim'.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg['emb_dim']
        hidden_dim = 4 * emb_dim
        expand = nn.Linear(emb_dim, hidden_dim)
        project = nn.Linear(hidden_dim, emb_dim)
        self.layers = nn.Sequential(expand, GELU(), project)

    def forward(self, x):
        out = self.layers(x)
        return out

GELU 的平滑性可以在训练过程中带来更好的优化性质，因为它允许对模型参数进行更细致的调整。相比之下，ReLU 在零处有一个尖角，这有时会使优化变得更加困难，尤其是在非常深或结构复杂的网络中。此外，ReLU 对任何负输入都输出零，而 GELU 对负输入会给出较小的非零输出。这意味着在训练过程中，接收到负输入的神经元仍然可以为学习过程做出贡献，尽管贡献程度低于正输入。

## 4.4 添加残差连接

残差连接是为了解决深度神经网络中的梯度消失问题。当网络层数过多时，由于反向传播的链式法则，梯度在逐层回传的过程中会因为小于1的系数不断累乘而变得很小，使得靠近输入的层权重更新缓慢。为了解决这个问题，我们可以把某一层的输入直接加到它的输出上，为梯度提供一条绕过该层、直接回传的路径。

In [5]:
# 用一个例子演示
class ExampleDeepNeuralNetwork(nn.Module):
    """Stack of Linear + GELU layers with optional shortcut connections.

    Used to demonstrate how shortcut connections affect gradient magnitudes.

    Args:
        layer_sizes: Sequence of layer widths. One Linear + GELU layer is
            built for every consecutive pair, so six sizes give five layers.
        use_shortcut: If True, a layer's input is added to its output
            whenever the two have the same shape.
    """

    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        # Build one layer per consecutive (in, out) pair instead of
        # hard-coding exactly five layers.
        self.layers = nn.ModuleList(
            [
                nn.Sequential(nn.Linear(n_in, n_out), GELU())
                for n_in, n_out in zip(layer_sizes, layer_sizes[1:])
            ]
        )

    def forward(self, x):
        for layer in self.layers:
            layer_out = layer(x)
            # A shortcut is only possible when input and output shapes match.
            if self.use_shortcut and x.shape == layer_out.shape:
                x = x + layer_out
            else:
                x = layer_out
        return x

In [6]:
# Layer widths: four 3 -> 3 layers followed by one 3 -> 1 layer.
layer_size = [3, 3, 3, 3, 3, 1]
sample_input = torch.tensor([[-1., 0., 1.]])
# Seed before building the model so its initial weights are reproducible.
torch.manual_seed(123)
model_without_shorcut = ExampleDeepNeuralNetwork(layer_sizes=layer_size, use_shortcut=False)


In [7]:
def print_gradients(model, x):
    """Run one forward/backward pass and print each weight's mean absolute gradient.

    The loss is the mean squared error between the model output and an
    all-zero target of the same shape.

    Args:
        model: Module to inspect; it is called as ``model(x)``.
        x: Input tensor passed to the model.
    """
    # backward() accumulates into .grad, so clear leftovers from earlier
    # calls; otherwise repeated calls report inflated values.
    model.zero_grad()
    output = model(x)
    target = torch.zeros_like(output)
    # nn.MSELoss expects (input, target) in that order.
    loss = nn.MSELoss()(output, target)
    loss.backward()
    for name, p in model.named_parameters():
        # Only weights are reported (not biases); parameters that received no
        # gradient are skipped.
        if 'weight' in name and p.grad is not None:
            print(f'{name} has gradient mean of {p.grad.abs().mean().item()}')

In [8]:
# Without shortcuts the gradients shrink toward the early layers (vanishing gradients).
print_gradients(model_without_shorcut, sample_input)

layers.0.0.weight has gradient mean of 0.0002546171599533409
layers.1.0.weight has gradient mean of 8.308066026074812e-05
layers.2.0.weight has gradient mean of 0.0007468174444511533
layers.3.0.weight has gradient mean of 0.001237659016624093
layers.4.0.weight has gradient mean of 0.004640596453100443


In [9]:
# Same layer widths, input and seed as the shortcut-free model, so the only
# difference between the two runs is use_shortcut=True.
layer_size = [3, 3, 3, 3, 3, 1]
sample_input = torch.tensor([[-1., 0., 1.]])
torch.manual_seed(123)
model_with_shorcut = ExampleDeepNeuralNetwork(layer_sizes=layer_size, use_shortcut=True)

In [10]:
# With shortcuts the gradients no longer vanish in the early layers.
print_gradients(model_with_shorcut, sample_input)

layers.0.0.weight has gradient mean of 0.022350559011101723
layers.1.0.weight has gradient mean of 0.04712466523051262
layers.2.0.weight has gradient mean of 0.02791227586567402
layers.3.0.weight has gradient mean of 0.013673197478055954
layers.4.0.weight has gradient mean of 0.23397451639175415


## 4.5 将注意力和线性层组合成transformer block

In [11]:
from ch3 import MultiHeadAttention

# GPT_CONFIG_124M = {
#     "vocab_size": 50257,
#     "context_length": 1024,
#     "emb_dim": 768,
#     "n_heads": 12,
#     "n_layers": 12,
#     "drop_rate": 0.1,
#     "qkv_bias": False
# }

class TransformerBlock(nn.Module):
    """Transformer block: attention and feed-forward sub-layers with shortcuts.

    Each sub-layer is applied as ``x + dropout(sublayer(norm(x)))``, i.e. the
    normalization comes before the sub-layer and the shortcut carries the
    un-normalized input.

    Args:
        cfg: Mapping with the keys 'emb_dim', 'n_heads', 'context_length',
            'drop_rate' and 'qkv_bias'.
    """

    def __init__(self, cfg):
        super().__init__()
        self.attn = MultiHeadAttention(
            d_in=cfg['emb_dim'],
            d_out=cfg['emb_dim'],
            num_heads=cfg['n_heads'],
            context_len=cfg['context_length'],
            dropout=cfg['drop_rate'],
            qkv_bias=cfg['qkv_bias'],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg['emb_dim'])
        self.norm2 = LayerNorm(cfg['emb_dim'])
        # One Dropout module serves both sub-layers; it holds no parameters.
        self.dropout_shorcut = nn.Dropout(cfg['drop_rate'])

    def forward(self, x):
        """Apply both sub-layers; the output has the same shape as `x`."""
        # Attention sub-layer with shortcut.
        shortcut = x
        x = self.norm1(x)
        x = self.attn(x)
        x = self.dropout_shorcut(x)
        x = x + shortcut

        # Feed-forward sub-layer with shortcut.
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.dropout_shorcut(x)
        x = x + shortcut

        return x

tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3300],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8000, 0.5500]])
context vector v1:
 tensor([[0.9173, 0.4695, 0.9805, 0.6120],
        [0.9572, 0.4880, 1.0261, 0.6365],
        [0.9564, 0.4877, 1.0250, 0.6361],
        [0.9019, 0.4629, 0.9625, 0.6030],
        [0.9080, 0.4668, 0.9653, 0.6084],
        [0.9150, 0.4685, 0.9790, 0.6104]], grad_fn=<MmBackward0>)
context_vector v2:
  tensor([[-0.1180, -0.0476, -0.1710,  0.2517],
        [-0.1190, -0.0470, -0.1756,  0.2561],
        [-0.1192, -0.0469, -0.1758,  0.2563],
        [-0.1195, -0.0466, -0.1756,  0.2562],
        [-0.1214, -0.0458, -0.1777,  0.2592],
        [-0.1184, -0.0472, -0.1744,  0.2546]], grad_fn=<MmBackward0>)
context vector v1:
 tensor([[-0.1180, -0.0476, -0.1710,  0.2517],
        [-0.1190, -0.0470, -0.1756,  0.2561],
        [-0.1192, -0.0469, -0.1758,  0.2563],
        [-0.1195, -0

In [12]:
# Smoke test: a TransformerBlock should return a tensor shaped like its input.
# NOTE(review): x is sampled before the seed is set, so only the block's
# weights (not x) are reproducible here.
x = torch.rand(2, 4, 768)
torch.manual_seed(42)
trf_block = TransformerBlock(GPT_CONFIG_124M)
output = trf_block(x)

print('Input Shape:', x.shape)
print('Output Shape:', output.shape)

Input Shape: torch.Size([2, 4, 768])
Output Shape: torch.Size([2, 4, 768])


## 4.6 编写GPT模型

我们已经实现了TransformerBlock和LayerNorm，只需要将上面写的DummyGPTModel中的模块替换成写好的模块就完成了GPT模型的代码。

In [14]:
class GPTModel(nn.Module):
    """GPT model: embeddings, a stack of transformer blocks, and an output head.

    Args:
        cfg: Mapping with the keys 'vocab_size', 'context_length', 'emb_dim',
            'drop_rate' and 'n_layers', plus whatever TransformerBlock reads.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size = cfg['vocab_size']
        emb_dim = cfg['emb_dim']

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg['context_length'], emb_dim)
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        blocks = [TransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map token ids of shape [batch_size, seq_len] to logits over the vocabulary."""
        _, num_tokens = in_idx.shape
        # Position ids are created on the same device as the token ids.
        positions = torch.arange(num_tokens, device=in_idx.device)

        # [batch_size, seq_len, emb_dim] + [seq_len, emb_dim] (broadcast over batch)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

In [15]:
# Smoke test: push a random batch of token ids through the full model.
torch.manual_seed(42)
model = GPTModel(GPT_CONFIG_124M)
# Named `in_idx` rather than `input` so the builtin input() is not shadowed.
# randint's upper bound is exclusive, so vocab_size covers every valid token id.
in_idx = torch.randint(0, GPT_CONFIG_124M['vocab_size'], (2, 4))
output = model(in_idx)
print(in_idx.shape, output.shape)

torch.Size([2, 4]) torch.Size([2, 4, 50257])


In [19]:
# Sum the element counts of every parameter tensor registered on the model.
total_params = sum(p.numel() for p in model.parameters())
print(f'Total number of parameters: {total_params:,}')

Total number of parameters: 163,009,536


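这里的参数量是163M而不是我们期待的124M，是因为原始的GPT-2采用了权重共享（weight tying），即out_head复用tok_emb的权重矩阵。本实现中out_head是独立的一层，多出了 50257 × 768 = 38,597,376 个参数；减去后为 163,009,536 − 38,597,376 = 124,412,160，即124M。权重共享可以显著减少参数量，但有观点认为使用独立的输出层往往能获得更好的训练效果和模型性能。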

In [24]:
# Rough memory footprint, assuming every parameter is stored as float32.
total_size_bytes = total_params * 4 # float32 = 4 bytes
# Divides by 1024 * 1024, so the printed "MB" value is in units of 2**20 bytes.
total_size_mb = total_size_bytes / ( 1024 * 1024)
print(f'Total size of the model {total_size_mb:.2f} MB')

Total size of the model 621.83 MB


## 4.7 生成文本

In [29]:
# 编写一个函数用迭代的方法逐步生成新文本
def generate_text_simple(model, idx, max_new_tokens, context_len):
    """Greedily extend a batch of token sequences one token at a time.

    Args:
        model: Callable mapping token ids of shape [bs, seq_len] to logits of
            shape [bs, seq_len, vocab_size].
        idx: Integer tensor of shape [bs, seq_len] holding the prompt tokens.
        max_new_tokens: Number of tokens appended to every sequence.
        context_len: Maximum number of most recent tokens fed to the model.

    Returns:
        Tensor of shape [bs, seq_len + max_new_tokens].
    """
    for _ in range(max_new_tokens):
        # Crop along the sequence axis (dim 1); slicing dim 0 would drop
        # batch rows instead of old tokens.
        context = idx[:, -context_len:]
        with torch.no_grad():
            logits = model(context)  # [bs, seq_len, vocab_size]
        # Only the prediction at the last position is needed.
        last_logits = logits[:, -1, :]
        probs = torch.softmax(last_logits, dim=-1)
        # Greedy decoding: pick the most probable token, keeping a [bs, 1] shape.
        next_idx = torch.argmax(probs, dim=-1, keepdim=True)
        idx = torch.cat([idx, next_idx], dim=1)

    return idx

In [31]:
import tiktoken

# Encode a prompt with the 'gpt2' encoding and add a leading batch dimension.
tokenizer = tiktoken.get_encoding('gpt2')
started_context = 'Hello, I am'
encoded = tokenizer.encode(started_context)
print(f'Encoded input:{encoded}')
# unsqueeze(0) turns the 1-D list of token ids into a [1, num_tokens] batch.
encoded_tensor = torch.tensor(encoded).unsqueeze(0)
print(f'Encoded tensor shape:{encoded_tensor.shape}')

Encoded input:[15496, 11, 314, 716]
Encoded tensor shape:torch.Size([1, 4])


In [32]:
# Switch to evaluation mode before generating, which disables dropout.
model.eval()
# Append 6 new tokens, feeding the model at most context_length tokens per step.
output = generate_text_simple(
    model=model,
    idx=encoded_tensor,
    max_new_tokens=6,
    context_len=GPT_CONFIG_124M['context_length']
)
print(f'Output tensor:{output}')
print(f'Output shape:{output.shape}')

Output tensor:tensor([[15496,    11,   314,   716,  4754, 22091, 43072, 19101, 14187, 41501]])
Output shape:torch.Size([1, 10])


In [None]:
# Drop the batch dimension and turn the token ids back into text.
# The model above is freshly initialized and has not been trained, so the
# continuation is not expected to be meaningful.
decoded = tokenizer.decode(output.squeeze(0).tolist())
print(f'Decoded output: {decoded}')

Decoded output:Hello, I amulf Kai cog Portugal paStudio
