接下来真真真真的要开始训练**Transformer**模型了。

为了更好地理解接下来要讲的训练过程，下面会大量用到前面几篇文章中的代码：部分会重新贴出，部分则直接引用。由于需要clone代码仓库中的代码才能够运行，建议clone完GitHub代码后，直接对照对应的notebook进行学习。

其实，当模型设计好，数据处理好，让模型训练起来这一步，反而是最简单的，下面的代码基本上也很容易看懂，无非就是把前面讲过的全部串联起来，开始训练。

# 导包 / Import packages
首先是导包，并加载tokenizer和词表。

In [1]:
from EncoderDecoder import make_model
from utils.LabelSmoothing import LabelSmoothing
from DataLoader import create_dataloaders
from Train.learning_rate import rate
from Train.train import TrainState
from utils.batch import Batch
from Train.SimpleLossCompute import SimpleLossCompute
from DataLoader import load_tokenizers, load_vocab
from conf.settings import DummyOptimizer, DummyScheduler

import torch.distributed as dist
import torch
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim.lr_scheduler import LambdaLR
import time, os
import GPUtil
# Load the tokenizers first: load_vocab() takes them as arguments and returns
# the source and target vocabularies used by the rest of the notebook.
spacy_de, spacy_en = load_tokenizers()
vocab_src, vocab_tgt = load_vocab(spacy_de, spacy_en) 

  from .autonotebook import tqdm as notebook_tqdm


Finished.
Vocabulary sizes:
8317
6384


# 训练一个epoch / Run one epoch
下面是训练一个epoch的代码，一个epoch的训练过程包括：**前向传播**，**计算Loss**，**反向传播**，**更新参数**，**更新学习率**。

In [2]:
def run_epoch(
    data_iter,
    model,
    loss_compute,
    optimizer,
    scheduler,
    mode="train",
    accum_iter=1,
    train_state=None,
):
    """Run one pass over ``data_iter`` and return the average loss per token.

    Args:
        data_iter: Iterable of batches exposing ``src``, ``tgt``, ``src_mask``,
            ``tgt_mask``, ``tgt_y`` and ``ntokens``.
        model: Object whose ``forward(src, tgt, src_mask, tgt_mask)`` output
            is handed to ``loss_compute``.
        loss_compute: Callable ``(out, tgt_y, ntokens) -> (loss, loss_node)``.
            ``loss`` is accumulated for reporting; ``loss_node`` is the value
            ``backward()`` is called on.
        optimizer: Needs ``step()``, ``zero_grad(set_to_none=True)`` and
            ``param_groups[0]["lr"]`` (the latter only for logging).
        scheduler: ``step()`` is called once per batch in training modes.
        mode: ``"train"`` or ``"train+log"`` enables backward, optimizer and
            scheduler updates plus periodic logging; any other value only
            accumulates the loss.
        accum_iter: The optimizer steps on batches whose index is a multiple
            of this value.
        train_state: Object with ``step``, ``samples``, ``tokens`` and
            ``accum_step`` counters, updated in place. When ``None`` a fresh
            ``TrainState()`` is created for this call, so separate calls no
            longer share one default instance.

    Returns:
        Tuple ``(total_loss / total_tokens, train_state)``.

    Raises:
        ZeroDivisionError: If ``data_iter`` yields no batches.
    """
    if train_state is None:
        train_state = TrainState()
    is_training = mode in ("train", "train+log")

    # perf_counter() is monotonic, so the throughput figure cannot be skewed
    # by wall-clock adjustments.
    start = time.perf_counter()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    n_accum = 0
    for i, batch in enumerate(data_iter):
        out = model.forward(
            batch.src, batch.tgt, batch.src_mask, batch.tgt_mask
        )
        loss, loss_node = loss_compute(out, batch.tgt_y, batch.ntokens)
        if is_training:
            loss_node.backward()
            train_state.step += 1
            train_state.samples += batch.src.shape[0]
            train_state.tokens += batch.ntokens
            # NOTE: this fires on the very first batch (i == 0) and then on
            # every accum_iter-th batch index after that.
            if i % accum_iter == 0:
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)
                n_accum += 1
                train_state.accum_step += 1
            # The learning rate is advanced every batch, not every
            # optimizer step.
            scheduler.step()

        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        if i % 40 == 1 and is_training:
            lr = optimizer.param_groups[0]["lr"]
            elapsed = time.perf_counter() - start
            print(
                (
                    "Epoch Step: %6d | Accumulation Step: %3d | Loss: %6.2f "
                    + "| Tokens / Sec: %7.1f | Learning Rate: %6.1e"
                )
                % (i, n_accum, loss / batch.ntokens, tokens / elapsed, lr)
            )
            # Restart the throughput window after each log line.
            start = time.perf_counter()
            tokens = 0
        # Drop the references before the next batch is processed.
        del loss
        del loss_node
    return total_loss / total_tokens, train_state

# 训练准备
训练任何模型都离不开以下这几个部分：**模型**，**分布式训练**，**损失函数**，**数据迭代器加载**，**学习率初始化**，**训练一个epoch**，**保存模型**，**模型评估**。

In [3]:
def train_worker(
    gpu,
    ngpus_per_node,
    vocab_src,
    vocab_tgt,
    spacy_de,
    spacy_en,
    config,
    is_distributed=False,
):
    """Build the model and run the train/validate loop on one GPU.

    Args:
        gpu: CUDA device index for this worker; also used as the process rank
            when ``is_distributed`` is true.
        ngpus_per_node: Used as the process-group world size and to divide
            ``config["batch_size"]`` between workers.
        vocab_src: Source vocabulary; its length sizes the model input.
        vocab_tgt: Target vocabulary; must contain ``"<blank>"``, whose index
            is used as the padding index.
        spacy_de: Tokenizer forwarded to ``create_dataloaders``.
        spacy_en: Tokenizer forwarded to ``create_dataloaders``.
        config: Mapping providing ``batch_size``, ``max_padding``, ``base_lr``,
            ``warmup``, ``num_epochs``, ``accum_iter`` and ``file_prefix``.
        is_distributed: When true, join an NCCL process group and wrap the
            model in ``DistributedDataParallel``.

    Side effects:
        Writes ``<file_prefix><epoch>.pt`` after every epoch and
        ``<file_prefix>final.pt`` at the end (main process only).
    """
    print(f"Train worker process using GPU: {gpu} for training", flush=True)
    torch.cuda.set_device(gpu)

    pad_idx = vocab_tgt["<blank>"]
    d_model = 512
    # Build the model and move it to this worker's GPU.
    model = make_model(len(vocab_src), len(vocab_tgt), N=6, d_model=d_model)
    model.cuda(gpu)
    # `module` always refers to the unwrapped model so that `.generator` and
    # `.state_dict()` are taken from it even when `model` is a DDP wrapper.
    module = model
    is_main_process = True

    # Set up distributed training.
    if is_distributed:
        # Join the process group; address and port come from the environment.
        dist.init_process_group(
            "nccl", init_method="env://", rank=gpu, world_size=ngpus_per_node
        )
        model = DDP(model, device_ids=[gpu])
        module = model.module
        # Only the worker on GPU 0 writes checkpoints.
        is_main_process = gpu == 0

    # Loss function.
    criterion = LabelSmoothing(
        size=len(vocab_tgt), padding_idx=pad_idx, smoothing=0.1
    )
    criterion.cuda(gpu)

    # Data loaders; the configured batch size is split between the workers.
    train_dataloader, valid_dataloader = create_dataloaders(
        gpu,
        vocab_src,
        vocab_tgt,
        spacy_de,
        spacy_en,
        batch_size=config["batch_size"] // ngpus_per_node,
        max_padding=config["max_padding"],
        is_distributed=is_distributed,
    )

    # Optimizer.
    optimizer = torch.optim.Adam(
        model.parameters(), lr=config["base_lr"], betas=(0.9, 0.98), eps=1e-9
    )
    # Learning-rate scheduler: LambdaLR multiplies base_lr by rate(step, ...).
    lr_scheduler = LambdaLR(
        optimizer=optimizer,
        lr_lambda=lambda step: rate(
            step, d_model, factor=1, warmup=config["warmup"]
        ),
    )
    train_state = TrainState()

    # Training loop.
    for epoch in range(config["num_epochs"]):
        if is_distributed:
            train_dataloader.sampler.set_epoch(epoch)
            valid_dataloader.sampler.set_epoch(epoch)

        model.train()  # switch to training mode
        print(f"[GPU{gpu}] Epoch {epoch} Training ====", flush=True)

        # Train for one epoch, carrying the counters across epochs.
        _, train_state = run_epoch(
            (Batch(b[0], b[1], pad_idx) for b in train_dataloader),
            model,
            SimpleLossCompute(module.generator, criterion),
            optimizer,
            lr_scheduler,
            mode="train+log",
            accum_iter=config["accum_iter"],
            train_state=train_state,
        )

        GPUtil.showUtilization()  # report GPU utilisation
        if is_main_process:
            # Per-epoch checkpoint of the unwrapped model's weights.
            file_path = "%s%.2d.pt" % (config["file_prefix"], epoch)
            torch.save(module.state_dict(), file_path)
        torch.cuda.empty_cache()  # release cached GPU memory

        print(f"[GPU{gpu}] Epoch {epoch} Validation ====", flush=True)
        model.eval()  # switch to evaluation mode
        # No backward pass happens in "eval" mode, so disable autograd to
        # avoid building graphs that would only consume GPU memory.
        with torch.no_grad():
            sloss = run_epoch(
                (Batch(b[0], b[1], pad_idx) for b in valid_dataloader),
                model,
                SimpleLossCompute(module.generator, criterion),
                DummyOptimizer(),
                DummyScheduler(),
                mode="eval",
            )
        # NOTE: run_epoch returns a (loss, train_state) tuple, so the whole
        # tuple is printed here.
        print(sloss)
        torch.cuda.empty_cache()

    if is_main_process:
        file_path = "%sfinal.pt" % config["file_prefix"]
        torch.save(module.state_dict(), file_path)

    # Tear down the process group created above once training is finished.
    if is_distributed:
        dist.destroy_process_group()

In [None]:
def train_distributed_model(vocab_src, vocab_tgt, spacy_de, spacy_en, config):
    """Launch one ``train_worker`` process per CUDA device reported by torch.

    Sets ``MASTER_ADDR`` / ``MASTER_PORT`` in the environment, then spawns
    ``torch.cuda.device_count()`` processes with ``is_distributed=True``.
    """
    gpu_count = torch.cuda.device_count()
    # Rendezvous settings for the workers (overwrites any existing values).
    os.environ.update({"MASTER_ADDR": "localhost", "MASTER_PORT": "12356"})
    print(f"Number of GPUs detected: {gpu_count}")
    print("Spawning training processes ...")

    # train_worker's leading `gpu` argument is not part of this tuple; it is
    # expected to be supplied by mp.spawn -- confirm against the torch docs.
    worker_args = (
        gpu_count,
        vocab_src,
        vocab_tgt,
        spacy_de,
        spacy_en,
        config,
        True,
    )
    mp.spawn(train_worker, args=worker_args, nprocs=gpu_count)


def train_model(vocab_src, vocab_tgt, spacy_de, spacy_en, config):
    """Train in-process or via spawned workers, per ``config["distributed"]``."""
    if not config["distributed"]:
        # Single-process path: GPU 0, one GPU per node, not distributed.
        train_worker(
            0, 1, vocab_src, vocab_tgt, spacy_de, spacy_en, config, False
        )
        return

    train_distributed_model(vocab_src, vocab_tgt, spacy_de, spacy_en, config)


def load_trained_model():
    """Return a model loaded from the final checkpoint, training it if absent.

    Uses the module-level ``vocab_src``, ``vocab_tgt``, ``spacy_de`` and
    ``spacy_en``. If the final checkpoint file does not exist, ``train_model``
    is run first with the configuration below.

    Returns:
        The model built by ``make_model`` with the checkpoint weights loaded.
    """
    config = {
        "batch_size": 32,
        "distributed": False,
        "num_epochs": 8,
        "accum_iter": 10,
        "base_lr": 1.0,
        "max_padding": 72,
        "warmup": 3000,
        "file_prefix": "multi30k_model_",
    }
    # Derive the path the same way train_worker writes its final checkpoint,
    # so changing "file_prefix" cannot desynchronise save and load paths.
    model_path = "%sfinal.pt" % config["file_prefix"]
    if not os.path.exists(model_path):
        train_model(vocab_src, vocab_tgt, spacy_de, spacy_en, config)

    model = make_model(len(vocab_src), len(vocab_tgt), N=6)
    # The checkpoint holds tensors saved from a CUDA model; map them to CPU so
    # loading also works on a machine without a GPU.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    return model


# Obtain the model through load_trained_model() and keep it in a
# module-level name.
model = load_trained_model()

Train worker process using GPU: 0 for training
[GPU0] Epoch 0 Training ====
Epoch Step:      1 | Accumulation Step:   1 | Loss:   7.66 | Tokens / Sec:   673.7 | Learning Rate: 5.4e-07
Epoch Step:     41 | Accumulation Step:   5 | Loss:   7.44 | Tokens / Sec:  6590.7 | Learning Rate: 1.1e-05
Epoch Step:     81 | Accumulation Step:   9 | Loss:   7.00 | Tokens / Sec:  6369.6 | Learning Rate: 2.2e-05
Epoch Step:    121 | Accumulation Step:  13 | Loss:   6.70 | Tokens / Sec:  6510.9 | Learning Rate: 3.3e-05
Epoch Step:    161 | Accumulation Step:  17 | Loss:   6.43 | Tokens / Sec:  6089.0 | Learning Rate: 4.4e-05
Epoch Step:    201 | Accumulation Step:  21 | Loss:   6.39 | Tokens / Sec:  6147.8 | Learning Rate: 5.4e-05
Epoch Step:    241 | Accumulation Step:  25 | Loss:   6.21 | Tokens / Sec:  6111.0 | Learning Rate: 6.5e-05
Epoch Step:    281 | Accumulation Step:  29 | Loss:   5.97 | Tokens / Sec:  6329.0 | Learning Rate: 7.6e-05
Epoch Step:    321 | Accumulation Step:  33 | Loss:   5.76 |

这一部分的学习建议多去对比不同模型训练代码上的差异，整体步骤其实相差不大，最大的差别其实还是在数据的预处理上。