# 训练一个电子鹦鹉

In [1]:
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch import optim
from tqdm.auto import tqdm

In [12]:
# Read the whole training corpus into memory as a single string.
with open('trainingdata/三国演义.txt', mode='r', encoding='utf-8') as f:
    text = f.read()
# Report the corpus size once the file has been closed.
print(f"total characters: {len(text)}")

total characters: 604898


## 模型定义

In [13]:
# Hand-built (non-learned) sinusoidal position vectors.
class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal position table to a batch of embeddings.

    Even feature columns hold ``sin(pos * w_i)`` and odd columns hold
    ``cos(pos * w_i)`` with ``w_i = 10000 ** (-2i / embed_size)``.

    Args:
        embed_size: width of the embeddings the table is added to.
        max_len: number of positions precomputed in the table.
    """

    def __init__(self, embed_size, max_len=4096):
        super().__init__()
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, embed_size, 2).float() * -(math.log(10000.0) / embed_size))
        angles = position * div_term

        encoding = torch.zeros(max_len, embed_size)
        encoding[:, 0::2] = torch.sin(angles)
        # With an odd embed_size there is one fewer cosine column than sine
        # column, so trim the cosine values to the columns that exist.
        encoding[:, 1::2] = torch.cos(angles)[:, : embed_size // 2]

        # A buffer follows the module through .to()/.cuda(), so the table ends
        # up on the same device as the inputs. persistent=False keeps it out
        # of state_dict, exactly as when it was a plain attribute.
        self.register_buffer("encoding", encoding.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the first ``x.size(1)`` rows of the position table.

        ``x`` is indexed as ``[batch, position, feature]``.
        """
        return x + self.encoding[:, :x.size(1)]

# Multi-head attention.
# Shape legend used in the comments below:
#   B = batch size
#   T = sequence length (Tq for queries, Tk for keys/values)
#   C = embed_size (model-wide hidden width)
#   h = number of heads, d = C / h = width of one head
class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention split across ``heads`` heads.

    Args:
        embed_size: model width C; must be divisible by ``heads``.
        heads: number of attention heads h.
    """

    def __init__(self, embed_size, heads):
        super().__init__()
        # Parameter check: the head count must divide the hidden size.
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads
        assert (self.head_dim * heads == embed_size), "Embedding size needs to be divisible by heads"

        # Learnable Q, K, V projections, plus the output projection that mixes
        # the concatenated heads back into one hidden state:
        # [h1, h2, h3, h4] --linear--> [hidden_state]
        self.values = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.keys = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.queries = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, value, key, query, mask):
        """Attend from ``query`` over ``key``/``value``.

        Args:
            value: tensor indexed as [B, Tk, C].
            key: tensor indexed as [B, Tk, C].
            query: tensor indexed as [B, Tq, C].
            mask: ``None``, or a tensor broadcastable to [B, h, Tq, Tk] in
                which entries equal to 1 mark positions that must not be
                attended to.

        Returns:
            Tensor of shape [B, Tq, C].
        """
        B = query.shape[0]
        # In self-attention the three lengths are equal; in cross-attention the
        # query length may differ from the key/value length.
        value_len, key_len, query_len = value.shape[1], key.shape[1], query.shape[1]

        # [B, T, C] -> [B, T, h, d] -> [B, h, T, d]
        value = self.values(value).view(B, value_len, self.heads, self.head_dim).transpose(1, 2)
        key = self.keys(key).view(B, key_len, self.heads, self.head_dim).transpose(1, 2)
        query = self.queries(query).view(B, query_len, self.heads, self.head_dim).transpose(1, 2)

        # [B, h, Tq, d] @ [B, h, d, Tk] -> [B, h, Tq, Tk]
        energy = torch.matmul(query, key.transpose(2, 3))
        if mask is not None:
            # Accept a mask built on another device (e.g. on the CPU); .to()
            # is a no-op when the devices already match.
            energy = energy.masked_fill(mask.to(energy.device) == 1, float("-inf"))
        # Scale by sqrt(d), the width of the vectors actually being dotted,
        # not by sqrt(C): dividing by the full model width flattens the
        # softmax more and more as the head count grows.
        score = energy / (self.head_dim ** 0.5)
        attention = torch.softmax(score, dim=-1)

        # [B, h, Tq, Tk] @ [B, h, Tk, d] -> [B, h, Tq, d] -> [B, Tq, h, d] -> [B, Tq, C]
        # The result has one row per *query* position, so reshape with
        # query_len (value_len is only correct for self-attention).
        out = torch.matmul(attention, value).transpose(1, 2).reshape(B, query_len, self.heads * self.head_dim)
        out = self.fc_out(out)
        return out

# A single decoder layer.
class TransformerDecoderLayer(nn.Module):
    """Self-attention followed by a two-layer feed-forward network.

    Each sub-layer output is added to its input, layer-normalised and then
    passed through dropout.

    Args:
        embed_size: width of the hidden state.
        heads: number of attention heads.
        forward_expansion: the feed-forward hidden width is
            ``forward_expansion * embed_size``.
        dropout: dropout probability.
    """

    def __init__(self, embed_size, heads, forward_expansion, dropout):
        super().__init__()
        hidden_size = forward_expansion * embed_size
        self.attention = MultiHeadAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embed_size),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, src_mask):
        """Run one decoder layer on ``x`` using ``src_mask`` as the attention mask."""
        # Self-attention: x serves as value, key and query.
        attended = self.attention(x, x, x, src_mask)
        x = self.norm1(attended + x)
        x = self.dropout(x)

        # Position-wise feed-forward with its own residual connection.
        expanded = self.feed_forward(x)
        normed = self.norm2(expanded + x)
        return self.dropout(normed)

# The complete decoder.
class TransformerDecoder(nn.Module):
    """Token embedding + positional encoding + stacked decoder layers + vocabulary projection.

    Args:
        embed_size: width of the hidden state.
        heads: number of attention heads in every layer.
        forward_expansion: feed-forward expansion factor of every layer.
        dropout: dropout probability of every layer.
        vocab_size: number of distinct tokens (embedding rows and output logits).
        num_layers: number of decoder layers to stack.
        max_len: number of positions in the positional-encoding table.
    """

    def __init__(self, embed_size, heads, forward_expansion, dropout, vocab_size, num_layers, max_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.pos_encoding = PositionalEncoding(embed_size, max_len)

        stack = []
        for _ in range(num_layers):
            stack.append(TransformerDecoderLayer(embed_size, heads, forward_expansion, dropout))
        self.layers = nn.ModuleList(stack)

        self.fc_out = nn.Linear(embed_size, vocab_size)

    def forward(self, x, src_mask):
        """Map token indices ``x`` to per-position logits over the vocabulary."""
        hidden = self.pos_encoding(self.embedding(x))
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, src_mask)
        return self.fc_out(hidden)


In [14]:
class CharDataset(Dataset):
    """Character-level next-token dataset over one text.

    Sample ``i`` is the pair ``(data[i : i + L], data[i + 1 : i + L + 1])``
    encoded as ``torch.long`` index tensors, where ``L`` is
    ``sequence_length``; the target is the input shifted left by one.

    Args:
        data: the text to sample windows from.
        sequence_length: number of characters in every input/target window.
    """

    def __init__(self, data, sequence_length):
        self.chars = sorted(set(data))
        self.char_to_index = {ch: i for i, ch in enumerate(self.chars)}
        self.index_to_char = dict(enumerate(self.chars))
        self.data = data
        self.sequence_length = sequence_length
        # Encode the text once up front; __getitem__ then only slices this
        # tensor instead of running a per-character Python lookup per sample.
        self._encoded = torch.tensor([self.char_to_index[ch] for ch in data], dtype=torch.long)

    def __len__(self):
        # Clamp at zero: a text shorter than one window has no samples, and a
        # negative __len__ makes len() raise.
        return max(0, len(self.data) - self.sequence_length)

    def __getitem__(self, index):
        """Return the ``(inputs, targets)`` tensors of sample ``index``.

        Raises:
            IndexError: if ``index`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if index < 0:
            index += size
        # Raising IndexError lets plain ``for sample in dataset`` terminate.
        if not 0 <= index < size:
            raise IndexError("CharDataset index out of range")
        end = index + self.sequence_length
        # clone() so callers get tensors that do not alias the shared buffer.
        return self._encoded[index:end].clone(), self._encoded[index + 1:end + 1].clone()

## 模型架构设置

- seq_len: 训练时上下文长度，T
- embed_size：隐含状态维度：C
- head：注意力头数：h
- forward_expansion：FFN层膨胀系数n，C -> nC -> C

In [15]:
# Model architecture settings.
seq_len = 128  # T: number of characters in each training window
embed_size = 256  # C: width of the hidden state
heads = 4  # h: number of attention heads
forward_expansion = 1  # feed-forward hidden width as a multiple of embed_size
decoder_layers = 2  # number of stacked decoder layers
max_position_embeddings = 4096  # number of positions in the positional-encoding table

## 训练超参数设置

- dropout：训练时随机将一部分激活值（而不是参数）置为0，并把保留下来的值乘以归一化系数 1/(1-dropout)；推理时启用全部神经元，不再丢弃也不再缩放（PyTorch 的 nn.Dropout 即如此实现）
- learning_rate: 每一步更新的大小
- EPOCH：所有数据看一遍叫做一个epoch，在大模型场景下我们一般只看一遍
- BATCH_SIZE：训练批次大小，一次性喂入GPU

In [23]:
# Training hyperparameters.
DROPOUT = 0.1  # dropout probability handed to the model
LEARNING_RATE = 0.001  # learning rate handed to the Adam optimizer
EPOCH = 1  # number of passes over the dataset made by train()
BATCH_SIZE = 8  # number of samples per DataLoader batch

## 初始化

- dataset & dataloader：PyTorch提供，shuffle，batch，str2idx
- model：传入模型
- device：CPU or GPU
- criterion：分类问题，使用交叉熵损失函数
- optimizer：使用Adam，比朴素SGD更好，恒定lr应该不是最好的策略 -_-

In [24]:
# Data pipeline: character windows over the corpus, shuffled into batches.
dataset = CharDataset(text, seq_len)
vocab_size = len(dataset.chars)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model, loss and optimizer.
model = TransformerDecoder(
    embed_size=embed_size,
    heads=heads,
    forward_expansion=forward_expansion,
    dropout=DROPOUT,
    vocab_size=vocab_size,
    num_layers=decoder_layers,
    max_len=max_position_embeddings,
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

print(f"vocab size: {vocab_size}")
print(f"embedding lookup table: {dataset.char_to_index}")

vocab size: 3951
embedding lookup table: {'\n': 0, ' ': 1, '*': 2, '?': 3, '[': 4, ']': 5, '—': 6, '‘': 7, '’': 8, '“': 9, '”': 10, '…': 11, '□': 12, '\u3000': 13, '、': 14, '。': 15, '《': 16, '》': 17, '【': 18, '】': 19, '一': 20, '丁': 21, '七': 22, '万': 23, '丈': 24, '三': 25, '上': 26, '下': 27, '不': 28, '与': 29, '丐': 30, '丑': 31, '专': 32, '且': 33, '丕': 34, '世': 35, '丘': 36, '丙': 37, '业': 38, '丛': 39, '东': 40, '丝': 41, '丞': 42, '丢': 43, '两': 44, '严': 45, '丧': 46, '个': 47, '中': 48, '丰': 49, '临': 50, '丸': 51, '丹': 52, '为': 53, '主': 54, '丽': 55, '举': 56, '乂': 57, '乃': 58, '久': 59, '么': 60, '义': 61, '之': 62, '乌': 63, '乎': 64, '乏': 65, '乐': 66, '乔': 67, '乖': 68, '乘': 69, '乙': 70, '九': 71, '乞': 72, '也': 73, '习': 74, '乡': 75, '书': 76, '买': 77, '乱': 78, '乳': 79, '乾': 80, '了': 81, '予': 82, '争': 83, '事': 84, '二': 85, '于': 86, '亏': 87, '云': 88, '互': 89, '五': 90, '井': 91, '亘': 92, '亚': 93, '些': 94, '亟': 95, '亡': 96, '亢': 97, '交': 98, '亥': 99, '亦': 100, '产': 101, '亨': 102, '亩': 103, '享': 104, '京': 105, '亭

## 训练之前的准备

- 创建mask：在训练的时候，我们希望输出第i个位置时只能看到第i个及之前的位置，即注意力权重在i+1,i+2,...的位置都是0
- 训练函数：两层for循环，每次输入的inputs是str2idx后的一个数字组成的矩阵[B, T]，在模型侧进行embedding

In [36]:
def create_look_ahead_mask(size, device=None):
    """Return a ``size`` x ``size`` causal mask with ones strictly above the diagonal.

    Entry ``[i, j]`` is 1 when ``j > i`` (position ``j`` lies in the future of
    position ``i``) and 0 otherwise, so every position may look at itself and
    at everything before it.

    Args:
        size: sequence length; the mask is square.
        device: optional device to allocate the mask on. ``None`` keeps
            PyTorch's default device.

    Returns:
        A float tensor of shape ``[size, size]``.
    """
    return torch.triu(torch.ones(size, size, device=device), diagonal=1)


def train(dataloader, model, optimizer, criterion, device):
    """Train ``model`` on ``dataloader`` for ``EPOCH`` passes.

    Every batch is a pair of index tensors ``(inputs, targets)`` indexed as
    [B, T]; embedding happens inside the model. The loss is printed every ten
    steps, starting at step 1.

    Args:
        dataloader: iterable of ``(inputs, targets)`` batches.
        model: called as ``model(inputs, mask)``; returns per-position logits.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking ``(logits [N, V], targets [N])``.
        device: device the batches and the mask are moved to.
    """
    model.train()
    step = 0
    for epoch in range(EPOCH):
        for inputs, targets in tqdm(dataloader):
            inputs, targets = inputs.to(device), targets.to(device)
            mask = create_look_ahead_mask(inputs.size(1)).to(device)
            optimizer.zero_grad()
            output = model(inputs, mask)
            # Flatten with the model's own output width rather than the
            # notebook-global ``vocab_size``: a later cell rebinds that name,
            # which would break re-running this function.
            loss = criterion(output.reshape(-1, output.size(-1)), targets.reshape(-1))
            if step % 10 == 1:
                print(f"Epoch {epoch} step {step}, Loss: {loss.item()}")
            loss.backward()
            optimizer.step()
            step += 1

def generate_sequence_topk(model, start_sequence, length, dataset, device, k=5):
    """Extend ``start_sequence`` by ``length`` characters using top-k sampling.

    At every step the whole sequence so far is fed through the model, the
    ``k`` most probable next characters are kept, and one of them is sampled
    in proportion to its probability. The model is left in eval mode.

    Args:
        model: called as ``model(inputs, mask)``; returns logits indexed as
            [B, T, vocab].
        start_sequence: prompt string; every character must be a key of
            ``dataset.char_to_index``.
        length: number of characters to generate.
        dataset: provides ``char_to_index`` and ``index_to_char``.
        device: device the model lives on; inputs and mask are placed there.
        k: number of candidates sampled from; ``k=1`` always picks the most
            probable character.

    Returns:
        ``start_sequence`` followed by the generated characters.
    """
    model.eval()
    # Collect the pieces in a list and join once at the end.
    pieces = [start_sequence]

    # Convert the prompt to a [1, T] tensor of indices.
    prompt_indices = [dataset.char_to_index[ch] for ch in start_sequence]
    inputs = torch.tensor(prompt_indices, dtype=torch.long).unsqueeze(0).to(device)

    # No gradients are needed at inference time.
    with torch.no_grad():
        for _ in range(length):
            # The mask must live on the same device as the attention scores;
            # leaving it on the CPU fails when the model runs on a GPU.
            mask = create_look_ahead_mask(inputs.size(dim=1)).to(device)
            output = model(inputs, mask)
            # [B, T, vocab]: only the last position predicts the next token.
            last_output = output[:, -1, :]
            probabilities = F.softmax(last_output, dim=1)
            top_probabilities, top_indices = torch.topk(probabilities, k, dim=1)
            # Categorical renormalises the k kept probabilities before sampling.
            top_prob_distribution = torch.distributions.Categorical(top_probabilities)
            chosen_index = top_indices[0][top_prob_distribution.sample()].item()
            pieces.append(dataset.index_to_char[chosen_index])
            inputs = torch.cat([inputs, torch.tensor([[chosen_index]], device=device)], dim=1)

    return "".join(pieces)



In [37]:
# Run the training loop with the data loader, model, optimizer, loss and device built above.
train(dataloader, model, optimizer, criterion, device)

  0%|          | 0/75597 [00:00<?, ?it/s]

Epoch 0 step 1, Loss: 4.170106410980225
Epoch 0 step 11, Loss: 4.059938430786133
Epoch 0 step 21, Loss: 3.8189704418182373
Epoch 0 step 31, Loss: 4.006127834320068


KeyboardInterrupt: 

## 使用刚才训练的模型进行推理

In [40]:
# Generate text from the model trained above.
start_sequence = "玄德与孔明"  # prompt; each character is looked up in dataset.char_to_index
generated_length = 100  # number of characters to generate after the prompt
k = 1  # with k = 1 only the single most probable character can be sampled
generated_sequence = generate_sequence_topk(model, start_sequence, generated_length, dataset, device, k)
print(f"top-k 输出:\n {generated_sequence}")

top-k 输出:
 玄德与孔明同宗亲，请玄德与玄德同宗族，不可。”玄德曰：“吾与玄德同宗族，不可轻敌。”玄德曰：“吾与我同谋，何故不相见？”孔明曰：“吾与我同谋，何故不相见。”玄德曰：“吾与玄德同谋，不可轻敌。”玄德曰：“吾与汝等


## 模型参数展示：
FYI：
- 一个相对有用的GPT模型：GPT3：175B + 580GB data
- 主流大模型：7B, 13B, 33B, 70B  + 2～20TB data
- 常见的chatbot & 垂直领域大模型：33B / 70B  + 2～20TB data

In [ ]:
# Count every parameter of the model.
total_params = sum(map(torch.numel, model.parameters()))
print(f"Total number of parameters: {total_params}")

# Count only the parameters that receive gradients.
trainable_params = sum(map(torch.numel, filter(lambda p: p.requires_grad, model.parameters())))
print(f"Total number of trainable parameters: {trainable_params}")

# List every parameter tensor together with its shape.
for name, param in model.named_parameters():
    print(f"{name}: {param.size()}")

## 交叉熵误差

- 预测值（概率）：[0.7, 0.2, 0.1]
- 真实值：0
- 真实值（概率）：[1, 0, 0]
- CEL: -[ 1 * log(0.7) + 0 * log(0.2) + 0 * log(0.1) ] = -log(0.7) ~ 0.36

在下一个词预测任务中，真实值是一个one-hot vector，所以交叉熵就等于 -log(模型给正确词的概率)：
- 瞎猜（假设词表大小约20，平均下来概率为5%）：-log(0.05) ~ 3.00；本文词表大小为3951，瞎猜对应 -log(1/3951) ~ 8.28
- 假设某个大模型词表100k，每个词概率1e-5：-log(1e-5) ~ 11.5 （大模型初始误差，经过训练收敛后大概在1～2左右）
- 20%（有20%概率预测正确）：-log(0.2) ~ 1.61
- 50%（有50%概率预测正确）：-log(0.5) ~ 0.69
- 80% (有80%概率预测正确)：-log(0.8) ~ 0.22

In [158]:

# Toy cross-entropy example: 2 sequences x 3 positions x 4 classes, where every
# position puts a logit of 10 on class 0 and -20 on the other three classes.
# NOTE(review): this cell rebinds seq_len, vocab_size and criterion, names that
# the training cells also define; re-run those cells before training again.
batch_size = 2
seq_len = 3
vocab_size = 4

logits = torch.tensor([[[10, -20, -20, -20], [10, -20, -20, -20], [10, -20, -20, -20]],
                       [[10, -20, -20, -20], [10, -20, -20, -20], [10, -20, -20, -20]]], dtype=torch.float32)
# logits = torch.tensor([[[1,0,0,0], [1,0,0,0], [1,0,0,0]],[[1,0,0,0], [1,0,0,0], [1,0,0,0]]], dtype=torch.float32)
print(logits)
print(logits.shape)

# The correct class is 0 at every position.
targets = torch.tensor([[0,0,0], [0,0,0]], dtype=torch.long)

# CrossEntropyLoss is fed [N, classes] logits and [N] targets here, so the
# batch and sequence axes are merged into one.
logits_flattened = logits.view(-1, vocab_size)
targets_flattened = targets.view(-1)

criterion = nn.CrossEntropyLoss()
# A logit gap of 30 pushes the softmax probability of class 0 so close to 1
# that the loss prints as 0.0.
loss = criterion(logits_flattened, targets_flattened)

print(f"Logits (flattened): \n{logits_flattened}")
print(f"Targets (flattened): \n{targets_flattened}")
print(f"Loss: {loss.item()}")

tensor([[[ 10., -20., -20., -20.],
         [ 10., -20., -20., -20.],
         [ 10., -20., -20., -20.]],

        [[ 10., -20., -20., -20.],
         [ 10., -20., -20., -20.],
         [ 10., -20., -20., -20.]]])
torch.Size([2, 3, 4])
Logits (flattened): 
tensor([[ 10., -20., -20., -20.],
        [ 10., -20., -20., -20.],
        [ 10., -20., -20., -20.],
        [ 10., -20., -20., -20.],
        [ 10., -20., -20., -20.],
        [ 10., -20., -20., -20.]])
Targets (flattened): 
tensor([0, 0, 0, 0, 0, 0])
Loss: 0.0


## 大模型推理优化
- 训练时可以使用mask来进行并行训练
- 输出的时候是：auto regressive，不可能并行，如何加快？

### KV cache
回顾我们刚才的实现：每次推理的时候都需要调用模型进行完整的计算，包括：
1. input乘以3个矩阵得到k,q,v（实际上前t-1个词的k和v在上一步已经算过，可以缓存复用；每一步只需要为最新的一个词计算q、k、v）
2. 进行前向传播

### 其他策略
1. 量化处理，int4 / int8量化有望在手机上实现大模型部署：7B（FP32）-> 28GB，7B（int8）-> 7GB
2. kernel fusion & 高性能算子：flashAttention
3. 更合理的模型结构：MoE，增加总参数量，但是推理时每个token实际激活的参数量不变
4. 流式显示：提高用户体验-_-!

In [ ]:
# Sketch of the autoregressive decoding loop, repeated here to show that every
# step re-runs the model on the whole sequence generated so far.
# NOTE(review): this looks like an illustrative, non-runnable cell: `length` is
# a None placeholder, the placeholder is named `input` while the loop reads
# `inputs`, and `k` / `generated_sequence` come from earlier cells. Confirm the
# intent before trying to execute it.
length = None
input = None
for _ in range(length):
    mask = create_look_ahead_mask(inputs.size(dim=1))
    # Call the model on the full sequence.
    output = model(inputs, mask)
    # [B, T, vocab]: only the last token's output is of interest.
    last_output = output[:, -1, :]
    # Turn the logits into probabilities, then pick a character.
    probabilities = F.softmax(last_output, dim=1)
    top_probabilities, top_indices = torch.topk(probabilities, k, dim=1)
    top_prob_distribution = torch.distributions.Categorical(top_probabilities)
    chosen_index = top_indices[0][top_prob_distribution.sample()].item()
    # Append the chosen token so it becomes part of the next step's input.
    generated_sequence += dataset.index_to_char[chosen_index]
    inputs = torch.cat([inputs, torch.tensor([[chosen_index]], device=device)], dim=1)