导入所需的库

In [1]:
# 公共库
import os
import sys
from ast import literal_eval
import torch
from torch.utils.data.dataloader import DataLoader
# 词元化所需
from transformers import BertTokenizer
# 训练所需
import time
from collections import defaultdict
# 架构所需
import math
import torch.nn as nn
from torch.nn import functional as F
from transformers import GPT2LMHeadModel
# 配置所需
import random
import numpy as np
# 主函数所需
from torch.utils.data import Dataset
import tqdm.notebook as tq

基本函数和配置

In [2]:
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    The same ``seed`` is applied, in order, to Python's ``random`` module,
    NumPy's global generator, PyTorch's default generator and the
    generators of all CUDA devices.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

In [3]:
class CfgNode:
    """Lightweight configuration container with attribute access.

    Every keyword passed to the constructor becomes an attribute. Values
    may themselves be ``CfgNode`` instances, which gives a nested
    configuration tree.
    """

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    def __str__(self):
        return self._str_helper(0)

    def _str_helper(self, indent):
        """Render this node as text, indenting each line by ``indent * 4`` spaces."""
        parts = []
        for k, v in self.__dict__.items():
            if isinstance(v, CfgNode):
                parts.append("%s:\n" % k)
                parts.append(v._str_helper(indent + 1))
            else:
                parts.append("%s: %s\n" % (k, v))
        parts = [' ' * (indent * 4) + p for p in parts]
        return "".join(parts)

    def to_dict(self):
        """Return the configuration as a plain dict, converting nested nodes recursively."""
        return {k: v.to_dict() if isinstance(v, CfgNode) else v for k, v in self.__dict__.items()}

    def merge_from_dict(self, d):
        """Overwrite/add attributes from the mapping ``d`` (shallow update)."""
        self.__dict__.update(d)

    def merge_from_args(self, args):
        """Apply command-line style overrides to the configuration.

        Args:
            args: iterable of strings of the form ``--key=value`` or
                ``--outer.inner=value``. Dotted keys address nested nodes.

        The value is parsed with ``ast.literal_eval`` when it is a valid
        Python literal and is kept as the raw string otherwise.

        Raises:
            AssertionError: if an argument has no ``=``, does not start with
                ``--``, or names an attribute that does not exist yet.
            AttributeError: if an intermediate key of a dotted path is missing.
        """
        for arg in args:
            # Split on the first '=' only, so the value itself may contain '='.
            keyval = arg.split('=', 1)
            assert len(keyval) == 2, "expecting each override arg to be of form --arg=value, got %s" % arg
            key, val = keyval

            # literal_eval raises ValueError for bare names ("abc") but
            # SyntaxError for things like paths ("./out"); both simply mean
            # "not a literal", so the value is kept as a string.
            try:
                val = literal_eval(val)
            except (ValueError, SyntaxError):
                pass

            assert key[:2] == '--'
            key = key[2:]
            keys = key.split('.')

            # Walk down to the node that owns the leaf attribute.
            obj = self
            for k in keys[:-1]:
                obj = getattr(obj, k)
            leaf_key = keys[-1]

            # Only existing attributes may be overridden (guards against typos).
            assert hasattr(obj, leaf_key), f"{key} is not an attribute that exists in the config"

            print("command line overwriting config attribute %s with %s" % (key, val))
            setattr(obj, leaf_key, val)

GPT架构

In [4]:
class NewGELU(nn.Module):
    """Tanh approximation of the GELU activation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))``
    element-wise.
    """

    def forward(self, x):
        inner = x + 0.044715 * torch.pow(x, 3.0)
        gate = torch.tanh(math.sqrt(2.0 / math.pi) * inner)
        return 0.5 * x * (1.0 + gate)

In [5]:
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention in which a position only attends to itself and earlier positions.

    Reads ``n_embd``, ``n_head``, ``block_size``, ``attn_pdrop`` and
    ``resid_pdrop`` from ``config``.
    """

    def __init__(self, config):
        super().__init__()
        width, heads = config.n_embd, config.n_head
        # The embedding width is divided evenly among the heads.
        assert width % heads == 0

        # One linear layer produces queries, keys and values in a single pass
        # (hence the 3x output width); they are split apart in forward().
        self.c_attn = nn.Linear(width, 3 * width)
        # Output projection applied after the heads are merged again.
        self.c_proj = nn.Linear(width, width)

        # Dropout on the attention weights and on the projected output.
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)

        # Lower-triangular mask of ones, shaped (1, 1, block_size, block_size).
        # Registered as a buffer: it travels with the module (device moves,
        # state_dict) but is not a trainable parameter.
        span = config.block_size
        lower_triangle = torch.tril(torch.ones(span, span))
        self.register_buffer("bias", lower_triangle.view(1, 1, span, span))

        # Kept for reshaping in forward().
        self.n_head = heads
        self.n_embd = width

    def forward(self, x):
        """Apply causal self-attention to ``x`` of shape (B, T, C); returns the same shape."""
        batch, steps, width = x.size()
        per_head = width // self.n_head

        def to_heads(t):
            # (B, T, C) -> (B, n_head, T, head_size)
            return t.view(batch, steps, self.n_head, per_head).transpose(1, 2)

        q, k, v = (to_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2))

        # Scaled dot-product scores, shape (B, n_head, T, T).
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # Positions above the diagonal (the future) get -inf and therefore
        # zero weight after the softmax.
        visible = self.bias[:, :, :steps, :steps]
        scores = scores.masked_fill(visible == 0, float('-inf'))
        weights = self.attn_dropout(F.softmax(scores, dim=-1))

        # Weighted sum of values, then put the heads side by side again.
        merged = (weights @ v).transpose(1, 2).contiguous().view(batch, steps, width)

        # Output projection followed by residual dropout.
        return self.resid_dropout(self.c_proj(merged))

In [6]:
class Block(nn.Module):
    """One Transformer block: pre-LayerNorm causal self-attention followed by a
    pre-LayerNorm feed-forward network, each wrapped in a residual connection.

    Reads ``n_embd`` and ``resid_pdrop`` from ``config`` (plus whatever
    ``CausalSelfAttention`` needs).
    """

    def __init__(self, config):
        super().__init__()
        # LayerNorm applied before the attention sub-layer.
        self.ln_1 = nn.LayerNorm(config.n_embd)
        # Masked multi-head self-attention.
        self.attn = CausalSelfAttention(config)
        # LayerNorm applied before the feed-forward sub-layer.
        self.ln_2 = nn.LayerNorm(config.n_embd)
        # Feed-forward network: expand to 4 * n_embd, activate, project back, dropout.
        self.mlp = nn.ModuleDict(dict(
            c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd),
            c_proj  = nn.Linear(4 * config.n_embd, config.n_embd),
            act     = NewGELU(),
            dropout = nn.Dropout(config.resid_pdrop),
        ))

    def mlpf(self, x):
        """Run the feed-forward network: c_fc -> activation -> c_proj -> dropout.

        This is a regular method rather than a lambda stored on the instance.
        A stored lambda captures this particular ``self.mlp`` object, so a
        ``copy.deepcopy`` of the block would silently keep using the original
        block's weights, and the module could not be pickled.
        """
        m = self.mlp
        return m.dropout(m.c_proj(m.act(m.c_fc(x))))

    def forward(self, x):
        """Apply both residual sub-layers to ``x``; the shape is unchanged."""
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlpf(self.ln_2(x))
        return x

In [7]:
class GPT(nn.Module):
    """Decoder-only GPT language model.

    The model is a token embedding plus a learned position embedding,
    followed by a stack of ``Block`` layers, a final LayerNorm and a linear
    head that maps back to the vocabulary.
    """

    @staticmethod
    def get_default_config():
        """Return a ``CfgNode`` holding the default model configuration."""
        C = CfgNode()
        # Either model_type OR (n_layer, n_head, n_embd) must be given, not both.
        C.model_type = 'gpt'
        C.n_layer = None
        C.n_head = None
        C.n_embd =  None
        # These must be filled in by the caller.
        C.vocab_size = None
        C.block_size = None
        # Dropout hyper-parameters.
        C.embd_pdrop = 0.1
        C.resid_pdrop = 0.1
        C.attn_pdrop = 0.1
        return C

    def __init__(self, config):
        """Build the model from ``config``.

        Raises:
            AssertionError: if ``vocab_size``/``block_size`` are missing, or if
                the config specifies both (or neither of) ``model_type`` and
                the explicit ``n_layer``/``n_head``/``n_embd`` triple.
            KeyError: if ``model_type`` is not one of the known presets.
        """
        super().__init__()
        # vocab_size and block_size are mandatory; a failing assert raises AssertionError.
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.block_size = config.block_size

        # Was a preset name given?
        type_given = config.model_type is not None
        # Were all three explicit size parameters given?
        params_given = all([config.n_layer is not None, config.n_head is not None, config.n_embd is not None])
        # Exactly one of the two must hold (XOR): a preset and explicit sizes would conflict.
        assert type_given ^ params_given
        if type_given:
            # Translate the preset name into concrete sizes.
            config.merge_from_dict({
                # GPT-1
                'openai-gpt':   dict(n_layer=12, n_head=12, n_embd=768),  # 117M params
                # GPT-2 configs
                'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
                'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
                'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
                'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
                # Gophers
                'gopher-44m':   dict(n_layer=8, n_head=16, n_embd=512),
                # Very small models
                'gpt-mini':     dict(n_layer=6, n_head=6, n_embd=192),
                'gpt-micro':    dict(n_layer=4, n_head=4, n_embd=128),
                'gpt-nano':     dict(n_layer=3, n_head=3, n_embd=48),
            }[config.model_type])

        # Config fields used below:
        #   vocab_size  - size of the vocabulary
        #   n_embd      - model (embedding) width
        #   block_size  - maximum input length
        #   embd_pdrop  - dropout rate applied to the summed embeddings
        #   n_layer     - number of stacked blocks
        self.transformer = nn.ModuleDict(dict(
            # Token embedding.
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # Learned position embedding, one vector per position.
            wpe = nn.Embedding(config.block_size, config.n_embd),
            # Dropout on the summed embeddings.
            drop = nn.Dropout(config.embd_pdrop),
            # Stack of Transformer blocks.
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            # Final LayerNorm.
            ln_f = nn.LayerNorm(config.n_embd)))

        # Maps the final hidden states to one score per vocabulary entry.
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Initialise all weights, then re-initialise every `c_proj.weight`
        # (the last linear layer of the attention and of the feed-forward
        # sub-layer, i.e. the two projections that write into the residual
        # stream) with a std scaled by 1/sqrt(2 * n_layer), following the
        # GPT-2 paper.
        self.apply(self._init_weights)
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # Report the parameter count of self.transformer (lm_head is not included).
        n_params = sum(p.numel() for p in self.transformer.parameters())
        print("number of parameters: %.2fM" % (n_params/1e6,))

    def _init_weights(self, module):
        """Initialise one sub-module; meant to be used via ``self.apply``."""
        if isinstance(module, nn.Linear):
            # Linear: N(0, 0.02) weights, zero bias.
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            # Embedding: N(0, 0.02) weights.
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            # LayerNorm: identity transform (weight 1, bias 0).
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)

    @classmethod
    def from_pretrained(cls, model_type):
        """Create a GPT and copy into it the weights of a huggingface/transformers GPT-2 checkpoint.

        Args:
            model_type: one of 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'.

        Returns:
            The initialised model.
        """
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}

        # Build a freshly initialised model of the matching size.
        config = cls.get_default_config()
        config.model_type = model_type
        config.vocab_size = 50257 # openai's model vocabulary
        config.block_size = 1024  # openai's model block_size
        model = GPT(config)
        sd = model.state_dict()

        # Load the pretrained huggingface model and fetch its state dict.
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # Copy every entry except the `attn.masked_bias` buffers, checking
        # that names and shapes line up.
        keys = [k for k in sd_hf if not k.endswith('attn.masked_bias')]
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        for k in keys:
            if any(k.endswith(w) for w in transposed):
                # huggingface stores these layers as its own Conv1D module,
                # whose weight is the transpose of an nn.Linear weight.
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # Everything else is copied as is.
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

    def configure_optimizers(self, train_config):
        """Create an AdamW optimizer with weight decay applied to Linear weights only.

        Parameters are split into two groups: weights of ``nn.Linear``
        modules, which are decayed with ``train_config.weight_decay``, and all
        biases plus LayerNorm/Embedding weights, which are not decayed.

        Args:
            train_config: object providing ``weight_decay``, ``learning_rate``
                and ``betas``.

        Returns:
            The configured ``torch.optim.AdamW`` instance.

        Raises:
            AssertionError: if a parameter lands in both groups or in neither.
        """
        # Names of the parameters that will / will not be decayed.
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        # mn is the module name, m the module.
        for mn, m in self.named_modules():
            # pn is the parameter name relative to m.
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full parameter name
                if pn.endswith('bias'):
                    # Biases are never decayed.
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # Weights of whitelisted modules (Linear) are decayed.
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # Weights of blacklisted modules are not decayed.
                    no_decay.add(fpn)

        # Every parameter must be in exactly one of the two sets.
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # Build the optimizer.
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0}]
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when ``targets`` is given, the loss.

        Args:
            idx: integer tensor of token ids, shape (b, t) with t <= block_size.
            targets: optional integer tensor of target ids with as many
                elements as ``idx``; positions equal to -1 are ignored.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape (b, t, vocab_size)
            and ``loss`` is the cross-entropy, or ``None`` without targets.
        """
        device = idx.device
        b, t = idx.size()
        # The input must not be longer than the position table.
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"

        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)
        tok_emb = self.transformer.wte(idx) # token embeddings (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings (1, t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        # Unnormalised scores over the vocabulary; softmax turns them into probabilities.
        logits = self.lm_head(x)

        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, do_sample=False, top_k=None):
        """Autoregressively extend ``idx`` and return only the new tokens.

        Args:
            idx: integer tensor of prompt token ids, shape (b, t).
            max_new_tokens: number of tokens to generate.
            temperature: the logits are divided by this value before softmax.
            do_sample: if True, sample from the distribution; otherwise pick
                the most likely token.
            top_k: if given, only the ``top_k`` highest-scoring tokens remain
                eligible. Values larger than the vocabulary are clamped.

        Returns:
            Integer tensor of shape (b, max_new_tokens) with the generated
            tokens; the prompt is not included.
        """
        # Remember where the prompt ends so it can be stripped from the result.
        idx_len = idx.size(1)
        for _ in range(max_new_tokens):
            # Once the sequence outgrows the context window, keep only the last block_size tokens.
            idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
            # Calling self(...) runs forward(); the loss is not needed here.
            logits, _ = self(idx_cond)
            # Keep the scores of the last position and apply the temperature.
            logits = logits[:, -1, :] / temperature
            # Optionally mask everything outside the top-k scores.
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # Normalise into probabilities.
            probs = F.softmax(logits, dim=-1)
            if do_sample:
                # Draw the next token from the distribution.
                idx_next = torch.multinomial(probs, num_samples=1)
            else:
                # Greedy choice: the most likely token.
                _, idx_next = torch.topk(probs, k=1, dim=-1)
            # Append the new token and continue.
            idx = torch.cat((idx, idx_next), dim=1)

        return idx[:,idx_len:]

训练模块

In [8]:
class Trainer:
    """Minimal training loop: optimises ``model`` on ``train_loader`` and
    writes a log file plus one checkpoint per epoch.

    ``run`` reads two module-level names that must be defined before it is
    called: ``loggin_dir`` (path of the text log, opened in append mode) and
    ``save_dir`` (path prefix of the checkpoints, ``<save_dir><epoch>.pth``).
    """

    @staticmethod
    def get_default_config():
        """Return a ``CfgNode`` holding the default training configuration."""
        C = CfgNode()
        # 'auto' selects CUDA when available, otherwise the CPU.
        C.device = 'auto'
        C.num_epochs = 100
        C.learning_rate = 3e-4
        C.betas = (0.9, 0.95)
        C.weight_decay = 0.1
        C.grad_norm_clip = 1.0
        return C

    def __init__(self, config, model, train_loader):
        """Store the pieces and move ``model`` to the training device.

        Args:
            config: training configuration (see ``get_default_config``).
            model: module exposing ``configure_optimizers(config)`` and a
                forward pass ``model(x, y) -> (logits, loss)``.
            train_loader: sized iterable of ``(x, y)`` batches.
        """
        self.config = config
        self.model = model
        self.optimizer = None
        self.train_loader = train_loader

        if config.device == 'auto':
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        else:
            self.device = config.device
        self.model = self.model.to(self.device)
        print("running on device", self.device)

        # Running statistics, reset at the start of every epoch.
        self.count = 0        # samples seen in the current epoch
        self.iter_time = 0.0  # timestamp of the previous log line
        self.iter_dt = 0.0    # seconds between the last two log lines
        self.sum_loss = 0.0   # sample-weighted loss sum of the current epoch

    def run(self):
        """Train for ``config.num_epochs`` epochs.

        A line with the running average loss of the current epoch is printed
        and appended to ``loggin_dir`` every 1000 iterations and after the
        last batch of each epoch. The model's state dict is saved after every
        epoch.
        """
        model, config = self.model, self.config
        self.optimizer = model.configure_optimizers(config)
        # Use the loader this trainer was given, not a same-named global.
        num_batches = len(self.train_loader)

        # Make sure the checkpoint directory exists up front, so the first
        # torch.save does not fail after a whole epoch of training.
        ckpt_parent = os.path.dirname(save_dir)
        if ckpt_parent:
            os.makedirs(ckpt_parent, exist_ok=True)

        for epoch in range(1, config.num_epochs + 1):
            model.train()  # training mode: dropout etc. are active
            # Reset BOTH accumulators; otherwise the average reported from
            # the second epoch on is diluted by samples of earlier epochs.
            self.sum_loss = 0.0
            self.count = 0
            self.iter_time = time.time()

            for i, (x, y) in enumerate(tq.tqdm(self.train_loader)):
                samples = x.shape[0]
                x, y = x.to(self.device), y.to(self.device)
                logits, self.loss = model(x, y)
                model.zero_grad(set_to_none=True)
                self.loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
                self.optimizer.step()
                # Weight the batch loss by the batch size so that
                # sum_loss / count is the per-sample average.
                self.sum_loss += self.loss.item() * samples
                self.count += samples

                if (i + 1) % 1000 == 0 or i == num_batches - 1:
                    tnow = time.time()
                    self.iter_dt = tnow - self.iter_time
                    self.iter_time = tnow
                    message = "Epoch [{}][{}/{}]\tLoss: {:.5f}  time: {:.2f}".format(
                        epoch, i + 1, num_batches, self.sum_loss / self.count, self.iter_dt) + 's'
                    # The context manager closes the log even if writing fails.
                    with open(loggin_dir, "a") as f:
                        f.write(message + '\n')
                    print(message)

            torch.save(model.state_dict(), save_dir + str(epoch) + '.pth')

数据集制作

In [9]:
# The base class is spelled out in full because this class reuses the name
# `Dataset`: with `class Dataset(Dataset)`, re-running the cell would make the
# class inherit from its own previous definition instead of the torch base.
class Dataset(torch.utils.data.Dataset):
    """Question/reply pairs turned into next-token-prediction samples.

    Args:
        data: indexable collection of pairs; ``data[i][0]`` is the question
            and ``data[i][1]`` the reply, each a sequence of token ids.
            NOTE(review): the loss mask and ``get_block_size`` presumably
            expect both sequences to be padded to exactly ``max_len`` tokens;
            confirm against the preprocessing step.
        max_len: length used for the loss mask and the block size.
        vocab_size: vocabulary size, reported by ``get_vocab_size``.
    """

    def __init__(self, data, max_len, vocab_size):
        self.data = data
        self.max_len = max_len
        self.vocab_size = vocab_size

    def __len__(self):
        return len(self.data)

    def get_vocab_size(self):
        """Return the vocabulary size passed to the constructor."""
        return self.vocab_size

    def get_block_size(self):
        """Return the sequence length fed to the transformer.

        This is the concatenated question and reply minus one token, because
        inputs and targets are the same sequence shifted by one position.
        """
        return self.max_len * 2 - 1

    def __getitem__(self, i):
        """Return ``(x, y)`` for sample ``i``.

        ``x`` is the concatenated sequence without its last token and ``y``
        the same sequence without its first token. The first ``max_len - 1``
        targets are set to -1 so the loss ignores the question part.
        """
        question = torch.LongTensor(self.data[i][0])
        reply = torch.LongTensor(self.data[i][1])

        # Question followed by reply.
        cat = torch.cat((question, reply), dim=0)

        # Input and target are offset by one position.
        x = cat[:-1].clone()
        y = cat[1:].clone()
        # Only the reply positions are predicted; mask the loss elsewhere.
        y[:self.max_len-1] = -1
        return x, y

In [10]:
# Random seed (3407 or 42).
seed = 3407

# Paths.
# `data_dir` is handed directly to open() when the data is loaded, so it has
# to be the path of the preprocessed data file itself.
data_dir, bert_path = './', '../tokenizer'  # preprocessed data, tokenizer
# Checkpoint path prefix ("<save_dir><epoch>.pth") and training log file.
save_dir, loggin_dir = './checkpoint/checkpoint_', "training_data.txt"

# Dataset / DataLoader settings.
batch_size, num_workers, max_len = 16, 0, 200

# Model size.
n_layer, n_head, n_embd = 6, 8, 512

# Training settings.
num_epochs, learning_rate = 1000, 5e-4

In [11]:
# Each non-empty line of the data file holds one Python literal (a
# question/reply pair). The lines are parsed with ast.literal_eval rather than
# eval, so the file's content is never executed as code.
# NOTE(review): `data_dir` is opened as a file here, so it must point to the
# data file itself rather than to a directory.
pairs = []
with open(data_dir, 'r', encoding='gb18030') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue
        pairs.append(literal_eval(line))

In [12]:
# Load the BERT tokenizer stored under `bert_path`.
tokenizer = BertTokenizer.from_pretrained(bert_path)
# Vocabulary mapping returned by the tokenizer's get_vocab().
word_map = tokenizer.get_vocab()
vocab_total = len(word_map)
print("Total words are: {}".format(vocab_total))

Total words are: 21128


In [13]:
# Wrap the loaded pairs; the vocabulary size is the number of entries in `word_map`.
train_dataset = Dataset(pairs, max_len, len(word_map))

In [14]:
# Shuffled mini-batches over the training set, with pinned host memory.
loader_options = dict(
    shuffle=True,
    pin_memory=True,
    batch_size=batch_size,
    num_workers=num_workers,
)
train_loader = DataLoader(train_dataset, **loader_options)

模型参数设置

In [15]:
# Seed first so that the weight initialisation is reproducible.
set_seed(seed)
model_config = GPT.get_default_config()
# Explicit sizes are supplied, so the preset name has to be cleared
# (GPT accepts either a preset or the explicit triple, not both).
model_config.merge_from_dict(dict(
    model_type=None,
    n_layer=n_layer,
    n_head=n_head,
    n_embd=n_embd,
    vocab_size=train_dataset.get_vocab_size(),
    block_size=train_dataset.get_block_size(),
))
model = GPT(model_config)

number of parameters: 29.94M


超参数设置

In [16]:
# Start from the default training configuration and override the two
# values chosen in the settings cell.
train_config = Trainer.get_default_config()
train_config.merge_from_dict({
    'num_epochs': num_epochs,
    'learning_rate': learning_rate,
})
trainer = Trainer(train_config, model, train_loader)

running on device cuda


In [17]:
# Start training. Trainer.run appends progress lines to the log file named by
# `loggin_dir` and saves a checkpoint under the `save_dir` prefix after every epoch.
trainer.run()

  0%|          | 0/3225 [00:00<?, ?it/s]

Epoch [1][1000/3225]	Loss: 2.570  time: 305.08s


KeyboardInterrupt: 