# 全局

## 导入全局库

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt

## 定义全局参数

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# !pip install subword-nmt

In [4]:
%matplotlib inline

# 数据准备模块
以下三个代码块，是为了获取用于机器翻译的数据集，我们已经按以下代码跑通，相应数据已在此代码所在文件夹中。所以可以不必执行此代码。供了解。
相应数据包括：
1. ted-en_zh.codes
2. dict.zh-cn
3. dict.en
4. ted-data/*.bped一系列数据

如果你想自己尝试，可以在文件中删除以上数据文件，自己动手尝试下~

In [None]:
# cat ted-data/train.* ted-data/dev.* | subword-nmt learn-bpe -s 32000 > ted-en_zh.codes

In [None]:
# %%bash
# for f in ted-data/*;
# do
#     subword-nmt apply-bpe -c ted-en_zh.codes < $f > $f.bped;
# done

In [None]:
# %%bash
# cat ted-data/train.zh-cn.bped ted-data/dev.zh-cn.bped | subword-nmt get-vocab -o dict.zh-cn
# cat ted-data/train.en.bped ted-data/dev.en.bped | subword-nmt get-vocab -o dict.en

以下三个代码块，适合于Win系统是为了获取用于机器翻译的数据集，我们已经按以下代码跑通，相应数据已在此代码所在文件夹中。所以可以不必执行此代码。供了解。

In [None]:
# %%cmd
# type ted-data\train.* ted-data\dev.* | subword-nmt learn-bpe -s 32000 >> ted-en_zh.codes

In [None]:
# %%cmd
# for %f in (ted-data/*)
# do
#     subword-nmt apply-bpe -c ted-en_zh.codes < ted-data\%f > ted-data\%f.bped

In [None]:
# %%cmd
# type ted-data\train.zh-cn.bped ted-data\dev.zh-cn.bped | subword-nmt get-vocab -o dict.zh-cn
# type ted-data\train.en.bped ted-data\dev.en.bped | subword-nmt get-vocab -o dict.en

# 模型定义

## 基本构建定义

### 定义嵌入层

In [3]:
#定义词嵌入层，
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        #下方之所以乘个缩放值，主要是embedding的值一般太小，需要适度放大，再和位置编码相加，会合适些
        #因为和标量的运算会自动对齐设备，所以下面不用考虑设备迁移
        return self.lut(x) * math.sqrt(self.d_model)

### 定义位置编码

$$  PE(pos,2i)   = sin( \frac{pos} {1000^{\frac{2i}{dim}}})  $$
$$  PE(pos,2i+1) = cos( \frac{pos} {1000^{\frac{2i}{dim}}})  $$

In [4]:
#定义位置编码类
class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0) #将形状修改为1,L,D,为了便于后续的广播，因为对于不同的样本，位置编码其实一样
        self.register_buffer('pe', pe) #将位置编码结果，放入缓存中，且不会纳入模型参数内，主要是确保随模型迁移设备
        
    def forward(self, x):
        # x = x + torch.tensor(self.pe[:, :x.size(1)], 
        #                  requires_grad=False)
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

### 定义注意力层

注意力计算统一计算公式如下：
$$  Attention(Q,K,V) = softmax( \frac{ QK^{T} } {\sqrt{d_{model}}} + mask\_matirx )  V $$

In [5]:
#定义注意力计算层，主要返回注意力矩阵，以及注意力加权后结果
def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / math.sqrt(d_k)
    if mask is not None:
        #如果有掩码矩阵，则对掩码矩阵是0的位置，填充为一个很大的负数，相当于对该位置不纳入注意力计算
        scores = scores.masked_fill(mask == 0, -1e4)
    p_attn = F.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn   #对应形状分别为N,L,D  以及 N,L,L
    

### 定义多头注意力

In [6]:
    
def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
    
#实现多头注意力
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        # self.linears = [nn.Linear(d_model, d_model) for i in range(4) ] #如果使用这种方式，不会被包含到model.child里面，也就不会随设备迁移，尤其注意
        self.linears = clones( nn.Linear(d_model, d_model), 4 )
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        
        # 1) Do all the linear projections in batch from d_model = h * d_k 
        # 注意，论文里写的是对每个头单独做线性映射，但此处是一次性做投射，然后切分为多个头，本质没啥区别
        query, key, value = \
            [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
             for l, x in zip(self.linears, (query, key, value))]

        # 2) Apply attention on all the projected vectors in batch. 
        x, self.attn = attention(query, key, value, mask=mask, 
                                 dropout=self.dropout)
        
        # 3) "Concat" using a view and apply a final linear. 
        x = x.transpose(1, 2).contiguous() \
             .view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)  #此处最后对注意力又投射了一次，主要还是增加模型的表现力，可以考虑不加


### 定义层归一化和残差模块

In [7]:
#实现LN层，layernorm主要是对一个批次内的每个样本单独做归一化
class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        #使用Parameter定义，会自动纳入model的Parameters里，便于后续训练等
        #针对每个特征，即D，独立有自己的a和b，且值可以学习
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)  #N,L,1
        std = x.std(-1, keepdim=True)  #N,L,1
        #下面计算的时候，mean会自动广播为N,L,D，self.a_2会自动广播为1,1,D
        #是从右向左广播
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

#实现残差和层归一化标准组件
class SublayerConnection(nn.Module):
    """
    pre-layer_norm
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        #注意，此处的实现和论文内不一样，属于pre_LN，即在输入到注意力网络前，应用层归一化
        return x + self.dropout(sublayer(self.norm(x)))
        

### 定义前馈网络

In [8]:

#实现前馈神经网络，FFN，即先升维再降维
#d_ff一般设置为2048或者3072
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))
        

### 定义输出头

In [9]:
#定义生成头，即将模型输出的形状N,L,D，映射为N,L,D,V,其中V是词典大小，便于后面计算损失
class Generator(nn.Module):
    "Define standard linear + softmax generation step."
    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        out = self.proj(x)
        return F.log_softmax( out, dim=-1)

## 定义模型

### 定义编码层和编码器

In [10]:
#定义编码器层
class EncoderLayer(nn.Module):
    "Encoder is made up of self-attn and feed forward (defined below)"
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        "Follow Figure 1 (left) for connections."
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)

#基于编码器层，定义编码器
class Encoder(nn.Module):
    "Core encoder is a stack of N layers"
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, mask):
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            x = layer(x, mask)
        #对encoder编码器最终的输出，再加一层层归一化，确保输出在特征维度分布稳定
        return self.norm(x)

### 定义解码层和解码器

In [11]:
#定义下三角掩码矩阵
def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0

class DecoderLayer(nn.Module):
    "Decoder is made of self-attn, src-attn, and feed forward (defined below)"
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)
 
    def forward(self, x, memory, src_mask, tgt_mask):
        "Follow Figure 1 (right) for connections."
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)

class Decoder(nn.Module):
    "Generic N layer decoder with masking."
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)

        #和encoder类似，对dencoder解码器最终的输出，再加一层层归一化，确保输出在特征维度分布稳定
        return self.norm(x)

### 定义编码解码器

In [12]:
#组装成encoder-decoder
class EncoderDecoder(nn.Module):
    """
    A standard Encoder-Decoder architecture. Base for this and many 
    other models.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator
        
    def forward(self, src, tgt, src_mask, tgt_mask):
        "Take in and process masked src and target sequences."
        return self.decode(self.encode(src, src_mask), src_mask,
                            tgt, tgt_mask)
    
    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)
    
    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

## 定义辅助工具

### 定义模型组装函数

In [13]:
#定义一个函数，专门用来获取最终的Transformer模型

def make_model(src_vocab, tgt_vocab, N=1, 
               d_model=128, d_ff=512, h=1, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))
    
    # This was important from their code. 
    # Initialize parameters with Glorot / fan_avg.
    #对模型内，只要维度大于1的参数，均做xavier初始化
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

### 定义损失函数

KL散度计算公式：
* $Q为目标概率分布，P为模型输出， KL(Q,P)  = \sum_{i} Q_{i} ( log(Q_{i}) - log(P_{i}) )  $

In [14]:
#定义一个支持平滑的损失函数，基于KL散度
class LabelSmoothing(nn.Module):
    "Implement label smoothing."
    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        #下面使用的是kl散度损失，不是交叉熵损失，应该主要是考虑训练模型输出和目标的概率分布就行，但其实应该使用交叉熵
        self.criterion = nn.KLDivLoss(reduction='sum')  
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None
        
    def forward(self, x, target):
        
        assert x.size(1) == self.size  #x的形状为N,L,C
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)  #沿着指定的dim轴，将self.confidence值，赋值给index指定的位置
        # true_dist[:, self.padding_idx] = 0
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.dim() > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
            
        self.true_dist = true_dist  #self.true_dist，最终的数据形状为  N,C ，其中C为分类数量，代表每个样本属于指定分类的概率值
        
        return self.criterion(x, Variable(true_dist, requires_grad=False))

### 定义优化器

In [15]:
#很有参考价值
class NoamOpt:
    #自定义一个优化器，且支持自动调整学习率
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        #对优化器内的待优化参数进行遍历，每个待优化参数组，都有独立的学习率等参数，可以根据需要灵活调整
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        #作用类似于学习率调度器，专门用来计算学习率，供优化器使用
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))
        
def get_std_opt(model):
    return NoamOpt(model.src_embed[0].d_model, 2, 4000,
            torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))


### 定义损失值计算函数

In [16]:
from torch.amp import autocast, GradScaler
scaler = GradScaler()
class SimpleLossCompute:
    #用来计算
    "A simple loss compute and train function."
    def __init__(self,generator, criterion, opt=None):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt
        
    def __call__(self, x, y, norm):
        x = self.generator(x)
        loss = self.criterion(x.contiguous().view(-1, x.size(-1)), 
                              y.contiguous().view(-1)) / norm
        
        scaler.scale(loss).backward()
        scaler.step(self.opt.optimizer)
        scaler.update()
        
        # loss.backward()
        
        if self.opt is not None:
            torch.nn.utils.clip_grad_norm_(
                                            self.opt.optimizer.param_groups[0]['params'], 
                                            max_norm=1.0)  # 这里的 max_norm 是裁剪的阈值
            self.opt.step()
            self.opt.optimizer.zero_grad()
        return loss.item() * norm
        

### 定义训练工具

In [17]:
from torch.amp import autocast, GradScaler
from torch.utils.checkpoint import checkpoint

def print_cuda_mem(tag=""):
    print(f"[{tag}] Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB, "
          f"Reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

#统一训练工具
def run_epoch(data_iter, model, loss_compute):
    "Standard Training and Logging Function"
    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    max_token_len = 0
    model = model.to(device)

    #下面添加监控，包括对GPU使用的监控
    with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA],
    schedule=torch.profiler.schedule(
        wait=1,
        warmup=1,
        active=2),
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./result', worker_name='worker0'),
    record_shapes=True,
    profile_memory=True,  # This will take 1 to 2 minutes. Setting it to False could greatly speedup.
    with_stack=True
) as p:
        for i, batch in enumerate(data_iter):
            #收集训练数据内最大序列长度
            batch_max_len =max(batch.src.shape[1],batch.trg.shape[1])
            max_token_len = batch_max_len if batch_max_len > max_token_len else max_token_len
            print(f'[step：{i}] 当前训练数据长度为{batch_max_len}')
            #控制最大长度，避免显存爆炸
            # maxlen = 150
            # if batch_max_len > maxlen:
            #     print(f'当前训练数据长度为{batch_max_len}大于{maxlen}，跳过该条数据')
            #     continue
            
            batch.src = batch.src.to(device)
            batch.trg = batch.trg.to(device)
            batch.src_mask = batch.src_mask.to(device)
            batch.trg_mask = batch.trg_mask.to(device)
            batch.trg_y = batch.trg_y.to(device)
            batch.ntokens = batch.ntokens.to(device)
            # with torch.autocast(device_type="cuda"):
            
            with autocast(device_type = device,dtype=torch.float16):
                # out = checkpoint(model, batch.src, batch.trg, 
                #                     batch.src_mask, batch.trg_mask)
                out = model.forward(batch.src, batch.trg, 
                                    batch.src_mask, batch.trg_mask)
                loss = loss_compute(out, batch.trg_y, batch.ntokens)
    
            total_loss += loss
            total_tokens += batch.ntokens
            tokens += batch.ntokens

            p.step()

        torch.cuda.empty_cache()
        if i % 50 == 1:
            elapsed = time.time() - start
            print(  "[Step：{0}]  Loss: {1} Tokens per Sec: {2}".format(i, loss.cpu() / batch.ntokens.cpu(), tokens / elapsed)  )
            start = time.time()
            tokens = 0
            print(f'[Step：{i}]  当前记录序列最大长度{max_token_len}')
            print_cuda_mem(f"Step：{i}")

            
    return total_loss.cpu() / total_tokens.cpu(),max_token_len


### 定义封装batch数据函数

In [18]:
class Batch:
    #返回一个批次数据，数据形状还是N,L，待输入给模型
    "Object for holding a batch of data with mask during training."
    def __init__(self, src, trg=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)  #mask数据形状为N,1,L,之所以加1，是因为考虑到注意力加mask的时候，注意力矩阵是L，L的，所以搞成N，1，L，便于广播
        if trg is not None:
            self.trg = trg[:, :-1]  #模型的解码器输入，
            self.trg_y = trg[:, 1:]  
            self.trg_mask = \
                self.make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).data.sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        #返回一个下三角掩码矩阵，标准型的，主要给解码器使用
        "Create a mask to hide padding and future words."
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & Variable(
            subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data))
        return tgt_mask


# 使用随机生成数据进行模型训练

## 定义随机数据生成函数

In [19]:
def data_gen(V, batch_size, nbatches):
    "Generate random data for a src-tgt copy task."
    for i in range(nbatches):
        data = torch.from_numpy(np.random.randint(1, V, size=(batch_size, 10)))
        data[:, 0] = 1
        src = Variable(data, requires_grad=False)
        tgt = Variable(data, requires_grad=False)
        yield Batch(src, tgt, 0)


## 开始训练

In [20]:
V = 11  #定义字典大小
criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
model = make_model(V, V, N=2)
model_opt = NoamOpt(model.src_embed[0].d_model, 1, 400,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

for epoch in range(10):
    model.train()
    run_epoch(data_gen(V, 30,  1), model, 
              SimpleLossCompute(model.generator, criterion, model_opt))
    model.eval()
    print(run_epoch(data_gen(V, 30, 1), model, 
                    SimpleLossCompute(model.generator, criterion, model_opt)))
    

[step：0] 当前训练数据长度为10
[step：0] 当前训练数据长度为10
(tensor(2.9025), 10)
[step：0] 当前训练数据长度为10
[step：0] 当前训练数据长度为10
(tensor(2.7637), 10)
[step：0] 当前训练数据长度为10
[step：0] 当前训练数据长度为10
(tensor(2.6535), 10)
[step：0] 当前训练数据长度为10
[step：0] 当前训练数据长度为10
(tensor(2.6232), 10)
[step：0] 当前训练数据长度为10
[step：0] 当前训练数据长度为10
(tensor(2.4635), 10)
[step：0] 当前训练数据长度为10
[step：0] 当前训练数据长度为10
(tensor(2.2433), 10)
[step：0] 当前训练数据长度为10
[step：0] 当前训练数据长度为10
(tensor(2.2499), 10)
[step：0] 当前训练数据长度为10
[step：0] 当前训练数据长度为10
(tensor(2.1452), 10)
[step：0] 当前训练数据长度为10
[step：0] 当前训练数据长度为10
(tensor(2.1164), 10)
[step：0] 当前训练数据长度为10
[step：0] 当前训练数据长度为10
(tensor(1.9853), 10)


# 使用真实数据进行模型训练

## 查看网络结构

In [21]:
from torchinfo import summary

V_zh = 12203 + 2
V_en = 30684 + 2
V = V_en

criterion = LabelSmoothing(size=V_zh, padding_idx=0, smoothing=0.0)
model = make_model(V_en, V_zh, N=1)

inputdata = [batch.src, batch.trg, batch.src_mask, batch.trg_mask]
summary(model,input_data =inputdata  )

NameError: name 'batch' is not defined

## 定义字典和padding函数

In [21]:
from itertools import zip_longest

def get_dict(filename, pad=0, bos=1, eos=2, unk=3):
    token_map = {"<PAD>": 0, "<BOS>": 1, "<EOS>": 2, "<UNK>": 3}
    with open(filename) as f:
        for i, l in enumerate(f, start=4):
            keys = l.strip().split()
            token_map[keys[0]] = i
    return token_map

def batch_padding(batch, padding_idx=0):
    max_len = len(max(batch, key=lambda x: len(x)))
    for sent in batch:
        padding_len = max_len - len(sent)
        if padding_len:
            sent.extend([padding_idx] * padding_len)

    return batch

## 定义真实训练数据生成器

In [22]:

def real_data_gen(V, batch_size):
    
    dict_zh = get_dict("dict.zh-cn")
    dict_en = get_dict("dict.en")

    train_en = open("./ted-data/train.en.bped")
    train_zh = open("./ted-data/train.zh-cn.bped")

    batch_en = []
    batch_zh = []
    
    for sent_en, sent_zh in zip(train_en, train_zh):
        sent_en = "<BOS> {} <EOS>".format(sent_en.strip())
        sent_zh = "<BOS> {} <EOS>".format(sent_zh.strip())
        batch_en.append([dict_en[token] for token in sent_en.split()])
        batch_zh.append([dict_zh[token] for token in sent_zh.split()])
        
        if len(batch_en) % batch_size == 0:
            src = torch.tensor(batch_padding(batch_en, 0), dtype=torch.int)
            tgt = torch.tensor(batch_padding(batch_zh, 0), dtype=torch.int)
            src = src.long()
            tgt = tgt.long()
            yield Batch(src, tgt, 0)
    else:
        src = src.long()
        tgt = tgt.long()
        yield Batch(src, tgt, 0)

## 开始训练

In [23]:

# wc -l dict.en dict.zh-cn
V_zh = 12203 + 2
V_en = 30684 + 2
V = V_en

criterion = LabelSmoothing(size=V_zh, padding_idx=0, smoothing=0.0)
model = make_model(V_en, V_zh, N=1)
# model = model.to(torch.float16)

# model_opt = NoamOpt(model.src_embed[0].d_model, 1, 400,
#         torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

model_opt = NoamOpt(model.src_embed[0].d_model, 1, 400,
        torch.optim.SGD(model.parameters(), lr=0))

for epoch in range(20):
    model.train()
    run_epoch(real_data_gen(V,  1 ), model, 
              SimpleLossCompute(model.generator, criterion, model_opt))
    # model.eval()
    # print(run_epoch(real_data_gen(V, 1), model, 
    #                 SimpleLossCompute(model.generator, criterion, None)))

[step：0] 当前训练数据长度为99
[step：1] 当前训练数据长度为99
[step：2] 当前训练数据长度为99
[step：3] 当前训练数据长度为99
[step：4] 当前训练数据长度为99
[step：5] 当前训练数据长度为99


[W530 14:59:46.052558139 CPUAllocator.cpp:245] Memory block of unknown size was allocated before the profiling started, profiler results will not include the deallocation event


[step：6] 当前训练数据长度为99
[step：7] 当前训练数据长度为99
[step：8] 当前训练数据长度为99
[step：9] 当前训练数据长度为99
[step：10] 当前训练数据长度为99
[step：11] 当前训练数据长度为99
[step：12] 当前训练数据长度为99
[step：13] 当前训练数据长度为99
[step：14] 当前训练数据长度为99
[step：15] 当前训练数据长度为99
[step：16] 当前训练数据长度为99
[step：17] 当前训练数据长度为99
[step：18] 当前训练数据长度为99
[step：19] 当前训练数据长度为99
[step：20] 当前训练数据长度为99
[step：21] 当前训练数据长度为99
[step：22] 当前训练数据长度为99
[step：23] 当前训练数据长度为99
[step：24] 当前训练数据长度为99
[step：25] 当前训练数据长度为99
[step：26] 当前训练数据长度为99
[step：27] 当前训练数据长度为99
[step：28] 当前训练数据长度为99
[step：29] 当前训练数据长度为99
[step：30] 当前训练数据长度为99
[step：31] 当前训练数据长度为99
[step：32] 当前训练数据长度为99
[step：33] 当前训练数据长度为99
[step：34] 当前训练数据长度为99
[step：35] 当前训练数据长度为99
[step：36] 当前训练数据长度为99
[step：37] 当前训练数据长度为99
[step：38] 当前训练数据长度为99
[step：39] 当前训练数据长度为99
[step：40] 当前训练数据长度为99
[step：41] 当前训练数据长度为99
[step：42] 当前训练数据长度为99
[step：43] 当前训练数据长度为99
[step：44] 当前训练数据长度为99
[step：45] 当前训练数据长度为99
[step：46] 当前训练数据长度为99
[step：47] 当前训练数据长度为99
[step：48] 当前训练数据长度为99
[step：49] 当前训练数据长度为99
[step：50] 当前训练数据长度为99
[step：51] 当前训练

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.92 GiB. GPU 0 has a total capacity of 23.63 GiB of which 2.88 GiB is free. Process 4073 has 536.00 MiB memory in use. Process 4074 has 1.36 GiB memory in use. Process 4067 has 346.00 MiB memory in use. Process 659678 has 462.00 MiB memory in use. Process 2272425 has 3.78 GiB memory in use. Including non-PyTorch memory, this process has 14.28 GiB memory in use. Of the allocated memory 13.27 GiB is allocated by PyTorch, and 593.92 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [45]:
datasets = real_data_gen(V, 1 )
batch = next(datasets)

In [25]:
max_l_len = 0
for batch in datasets:
    temp = max(batch.src.shape[1],batch.trg.shape[1])
    max_l_len = temp if temp > max_l_len else max_l_len


KeyboardInterrupt: 

In [None]:
max_l_len