In [1]:
import collections
import math
import torch
from torch import nn
from d2l import torch as d2l

In [2]:
#实现Encoder
class Seq2SeqEncoder(d2l.Encoder):
    def __init__(self,vocab_size,embed_size,num_hiddens,num_layers,dropout=0,**kwargs):
        super(Seq2SeqEncoder,self).__init__(**kwargs)
        self.embedding=nn.Embedding(vocab_size,embed_size)
        self.rnn=nn.GRU(embed_size,num_hiddens,num_layers,dropout=dropout)
    def forward(self,X,*args):
        X=self.embedding(X)
        X=X.permute(1,0,2)
        output,state=self.rnn(X)
        return output,state

In [3]:
encoder=Seq2SeqEncoder(vocab_size=10,embed_size=8,num_hiddens=16,num_layers=2)
X=torch.zeros((4,7),dtype=torch.long)
output,state=encoder(X)
output.shape,state.shape

(torch.Size([7, 4, 16]), torch.Size([2, 4, 16]))

In [4]:
#实现Decoder
class Seq2SeqDecoder(d2l.Decoder):
    def __init__(self,vocab_size,embed_size,num_hiddens,num_layers,dropout=0,**kwargs):
        super(Seq2SeqDecoder,self).__init__(**kwargs)
        self.embedding=nn.Embedding(vocab_size,embed_size)
        self.rnn=nn.GRU(embed_size+num_hiddens,num_hiddens,num_layers,dropout=dropout)
        self.dense=nn.Linear(num_hiddens,vocab_size)
    def init_state(self,enc_outputs,*arg):
        return enc_outputs[1]
    def forward(self,X,state):
        X=self.embedding(X).permute(1,0,2)
        context=state[-1].repeat(X.shape[0],1,1)
        X_and_context=torch.cat((X,context),2)
        output,state=self.rnn(X_and_context,state)
        output=self.dense(output).permute(1,0,2)
        return output,state

In [5]:
decoder=Seq2SeqDecoder(vocab_size=10,embed_size=8,num_hiddens=16,num_layers=2)
state=decoder.init_state(encoder(X))
output,X=decoder(X,state)
output.shape,state.shape

(torch.Size([4, 7, 10]), torch.Size([2, 4, 16]))

In [6]:
#通过零值化屏蔽掉不相关的项
def sequence_mask(X,valid_len,value=0):
    maxlen=X.size(1)
    mask=torch.arange((maxlen),dtype=torch.float32,device=X.device)[None,:] < valid_len[:,None]
    X[~mask]=value
    return X
X=torch.tensor([[1,2,3],[4,5,6]])
sequence_mask(X,torch.tensor([1,2]))

tensor([[1, 0, 0],
        [4, 5, 0]])

In [7]:
class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    def forward(self,pred,label,valid_len):
        weights=torch.ones_like(label)
        weights=sequence_mask(weights,valid_len)
        print(weights)
        self.reduction='none'
        unweighted_loss=super().forward(pred.permute(0,2,1),label)
        print(unweighted_loss)
        weighted_loss=(unweighted_loss*weights).mean(dim=1)
        return weighted_loss

In [8]:
loss=MaskedSoftmaxCELoss()
loss(torch.ones(3,4,10),torch.ones((3,4),dtype=torch.long),torch.tensor([4,2,0]))

tensor([[1, 1, 1, 1],
        [1, 1, 0, 0],
        [0, 0, 0, 0]])
tensor([[2.3026, 2.3026, 2.3026, 2.3026],
        [2.3026, 2.3026, 2.3026, 2.3026],
        [2.3026, 2.3026, 2.3026, 2.3026]])


tensor([2.3026, 1.1513, 0.0000])

In [9]:
#训练过程
# 训练
def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device):
    """训练序列到序列模型"""
    def xavier_init_weights(m):
        # 如果是线性层
        if type(m) == nn.Linear:
            # 使用Xavier均匀初始化权重
            nn.init.xavier_uniform_(m.weight)
        # 如果是GRU层
        if type(m) == nn.GRU:
            # 对于GRU层的每个参数
            for param in m._flat_weights_names:
                # 如果是权重参数
                if "weight" in param:
                    # 使用Xavier均匀初始化该权重参数
                    nn.init.xavier_uniform_(m._parameters[param])
    
    # 应用xavier_init_weights函数，对网络模型的参数进行初始化
    net.apply(xavier_init_weights)
    # 将网络模型移动到指定设备上
    net.to(device)
    # 创建Adam优化器，将网络模型的参数传入优化器
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    # 创建MaskedSoftmaxCELoss损失函数对象
    loss = MaskedSoftmaxCELoss()
    # 将网络模型设置为训练模式
    net.train()
    # 创建动画绘制对象，用于绘制损失随训练epoch的变化情况
    animator = d2l.Animator(xlabel='epoch', ylabel='loss',xlim=[10, num_epochs])  
    
    for epoch in range(num_epochs):
        # 创建计时器对象，用于计算每个epoch的训练时间
        timer = d2l.Timer()
        # 创建累加器对象，用于累加损失和标记的数量
        metric = d2l.Accumulator(2)
        for batch in data_iter:
            # 将输入数据移动到指定设备上
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            # 创建起始符号的张量bos，并移动到指定设备上
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                              device=device).reshape(-1,1)
            # 构造解码器的输入，将bos和去除最后一列的标签张量Y拼接起来
            dec_input = torch.cat([bos, Y[:,:-1]],1)
            # 前向传播，得到预测结果Y_hat
            Y_hat, _ = net(X, dec_input, X_valid_len)
            # 计算损失
            l = loss(Y_hat, Y, Y_valid_len)
            # 反向传播，计算梯度
            l.sum().backward()
            # 对梯度进行裁剪，防止梯度爆炸
            d2l.grad_clipping(net,1)
            # 计算标记的数量
            num_tokens = Y_valid_len.sum()
            # 更新模型参数
            optimizer.step()
            # 使用torch.no_grad()上下文管理器，关闭梯度计算，避免计算图的构建
            with torch.no_grad():
                # 累加损失和标记的数量
                metric.add(l.sum(), num_tokens)
        # 每10个epoch打印一次损失
        if (epoch + 1) % 10 == 0:
            # 绘制损失随训练epoch的变化情况
            animator.add(epoch+1, (metric[0]/metric[1],))   
    # 打印最终的损失值和训练速度
    print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} '
         f'tokens/sec on {str(device)}')      

In [11]:
embed_size,num_hiddens,num_layers,dropout=32,32,2,0.1
batch_size,num_steps=64,10
lr,num_epochs,device=0.005,300,d2l.try_gpu()
train_iter,src_vocab,tgt_vocab=d2l.load_data_nmt(batch_size,num_steps)
encoder=Seq2SeqEncoder(len(src_vocab),embed_size,num_hiddens,num_layers,dropout=dropout)
decoder=Seq2SeqDecoder(len(tgt_vocab),embed_size,num_hiddens,num_layers,dropout=dropout)
net = d2l.EncoderDecoder(encoder, decoder)
train_seq2seq(net, train_iter, lr, num_epochs, tgt_vocab, device)

UnicodeDecodeError: 'gbk' codec can't decode byte 0xaf in position 33: illegal multibyte sequence