# Transformer的pythoch实现
![](./picture/image0.png)

## 数据预处理
使用transformer执行文本翻译任务，数据集选用英文和法语数据集

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from batch import *
from process import *
import numpy as np
import time

# 数据
src_file = 'data/english.txt'
trg_file = 'data/french.txt'
src_lang = 'en_core_web_sm'
trg_lang = 'fr_core_news_sm'
max_strlen = 80
batchsize = 1500
src_data, trg_data = read_data(src_file, trg_file)  # 一个包含所有源语言（英语）句子的字符串列表。154883
EN_TEXT, FR_TEXT = create_fields(src_lang, trg_lang)
train_iter, src_pad, trg_pad = create_dataset(src_data, trg_data, EN_TEXT, FR_TEXT, max_strlen, batchsize) # , 1 , 1
src_vocab = len(EN_TEXT.vocab) #13724 源语句的token数量
trg_vocab = len(FR_TEXT.vocab) #23469 目标语句的token数量


loading spacy tokenizers...
creating dataset and iterator... 
1091


In [27]:
# 获取训练集中的前2个batch，便于测试
sample_batches = []
for i, batch in enumerate(train_iter):
    sample_batches.append(batch)
    if i >= 0:  # 只取前1个batch
        break

# 打印第一个batch的源语言和目标语言内容（索引形式）
print("第一个batch的源语言（索引）:")
print(sample_batches[0].src)
print("第一个batch的目标语言（索引）:")
print(sample_batches[0].trg)
# shape为torch.Size([7, 214])，214句话，每句话有7个token
# 如果想要将索引还原为单词，可以这样做：
src_vocab_obj = EN_TEXT.vocab
trg_vocab_obj = FR_TEXT.vocab

def indices_to_words(indices, vocab):
    return [vocab.itos[idx] for idx in indices]

# 以第一个batch的第一句话为例，转换为单词
src_indices = sample_batches[0].src[:, 0]  # 第一句
trg_indices = sample_batches[0].trg[:, 0]  # 第一句

print("第一个batch第一句源语言（单词）:")
print(indices_to_words(src_indices, src_vocab_obj))
print("第一个batch第一句目标语言（单词）:")
print(indices_to_words(trg_indices, trg_vocab_obj))

第一个batch的源语言（索引）:
tensor([[  10,   93,   12,  ...,    3,   13,   25],
        [   9,   10,   16,  ...,   37,  350, 1095],
        [ 204,   24,   68,  ...,    5,    5,   10],
        [  47,  252,    5,  ...,   33,   49,    4],
        [1503,  848,    4,  ..., 3155,   79,   40],
        [   2,    7,    2,  ...,    2,    2,    7]])
第一个batch的目标语言（索引）:
tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [  14,   87,   40,  ...,   19,   11, 1250],
        [  21,  122, 1575,  ...,  108,  271, 1209],
        ...,
        [   8,   27,   89,  ..., 7435,   81,    9],
        [   4,    7,    4,  ...,    4,    4,    7],
        [   3,    3,    3,  ...,    3,    3,    3]])
第一个batch第一句源语言（单词）:
['do', "n't", 'call', 'him', 'names', '.']
第一个batch第一句目标语言（单词）:
['<sos>', 'ne', "l'", 'insulte', 'pas', '.', '<eos>']


## 模型参数

In [28]:
d_model = 512
heads = 8
N = 6
dropout = 0.1

## Embedding

In [29]:
class Embedding(nn.Module):
    def __init__(self, vocab_size, d_model) -> None:
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)
    def forward(self, x):
        return self.embed(x)

## Positional Encoding
![positional encoding](./picture/image1.png)

In [30]:
import math
from torch.autograd import Variable
class positionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_len=80, dropout=0.1) -> None:
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        #根据输入语句的token数和嵌入向量的维度构造位置编码矩阵
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** (i / d_model)))
                if i + 1 < d_model:
                    pe[pos, i + 1] = math.cos(pos / (10000 ** (i / d_model)))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe) 

    def forward(self, x):
        # 使得单词嵌入表示相对大一些
        x = x * math.sqrt(self.d_model)
        # 增加位置常量到单词嵌入表示中
        seq_len = x.size(1)
        x = x + Variable(self.pe[:, :seq_len, :], requires_grad=False)
        return self.dropout(x)

## Self-Attention Layer
对于输入的句子X，通过 WordEmbedding 得到该句子中每个字的字向量，同时通过Positional Encoding 得到所有字的位置向量，将其相加(维度相同，可以直接相加)，得到该字真正的向量表示。第t个字的向量记作$x_t$

接着我们定义三个矩阵 $W_Q$,$W_K$,$W_V$,使用这三个矩阵分别对所有的字向量进行三次线性变换，于是所有的字向量又衍生出三个新的向量$q_t$,$k_t$,$v_t$。我们将所有的q向量拼成一个大矩阵，记作查询矩阵Q，将所有的k向量拼成一个大矩阵，记作键矩阵K，将所有的v向量拼成一个大矩阵，记作值矩阵V(见下图)

![](./picture/image2.png)

接下来将Q和$K^T$相乘，得到注意力分数矩阵，其中的每一行代表一个query(当前token)对所有的key的相关性分数，然后除以$\sqrt{d_k}$(这是论文中提到的一个 trick),经过 softmax后,每一行是一个概率分布，表示该query“关注”每个key的程度，再乘以 V 得到输出，此时的每一行代表当前query（比如当前单词）在全局信息加权融合后的新表示，这个新表示综合了序列中所有位置的信息。

![](./picture/image3.png)

### Multi-Head Attention
上面所定义的一组Q,K,V得到的融合向量只能从一个“视角”去捕捉信息，我们可以定义多组Q,K,V，让每个头关注不同的特则会给你、关系或位置，从而获得更丰富的信息表达。计算 Q,K,V 的过程还是一样，只不过线性变换的矩阵从一组($W^Q,W^K,W^V$)变成了多组$(W_0^Q,W_0^K,W_0^V),(W_1^Q,W_1^Q,W_1^Q)$,..如下图所示

![](./picture/image4.png)

对于输入矩阵 X，每一组 Q、K 和 V 都可以得到一个输出矩阵Z。如下图所示

![](./picture/image5.png)

### Src_input Paddding Mask
对于encodee的源语句输入时，其中每个mini-batch是由多个不等长的句子组成的，我们需要按照这个mini-batch中最大的句长对剩余的句子进行补齐，一般用0进行填充，这个过程叫做 padding但这时在进行 softmax 就会产生问题。回顾softmax函数$\sigma(z_i) = \frac{e^{z_i}}{\sum_{j=1}^{K} e^{z_j}}$,$e^0$是1，是有值的，这样的话 softmax 中被 padding 的部分就参与了运算，相当于让无效的部分参与了运算，这可能会产生很大的隐患。因此需要做一个 mask 操作，让这些无效的区域不参与运算，一般是给无效区域加一个很大的负数偏置，即
$$
Z_{\text{illegal}} = Z_{\text{illegal}} + \text{bias}_{\text{illegal}}
$$
$$
\text{bias}_{\text{illegal}} \rightarrow -\infty
$$

In [31]:
import math
class MultiHeadAttention(nn.Module):
    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()

        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def attention(self, q, k, v, d_k, mask=None, dropout=None):
        #(166*8*9*64) * (166*8*64*9) --> (166*8*9*9)  
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)

        # 此时mask的维度为(166*1*8)
        # 掩盖那些为了补全长度而增加的单元，使其通过Softmax计算后为0
        if mask is not None:
            mask = mask.unsqueeze(1)
            scores = scores.masked_fill(mask == 0, -1e9)
        scores = F.softmax(scores, dim=-1)
        if dropout is not None:
            scores = dropout(scores)
        
        output = torch.matmul(scores, v)
        return output
    
    def forward(self, q, k, v, mask=None):
        bs = q.size(0)

        # 为了通过一个线性层实现多个头并行计算，将输入维度减小，即
        # 输入向量的维度为(166*9*512)，经过线性层的维度仍为(166*9*512)，
        # 接着改变特征维度为(166*9*8*64) (batch_size * max_length * heads * d_k)
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)

        # 矩阵转置
        k = k.transpose(1, 2)
        q = q.transpose(1, 2)
        v = v.transpose(1, 2)
        
        #multi_output的维度为（166*8*9*64）
        multi_output = self.attention(q, k, v, self.d_k, mask, dropout=self.dropout)
        
        # 连接多个头并输入最后的线性层 （166*9*512）
        concat = multi_output.transpose(1, 2).contiguous().view(bs, -1, self.d_model)

        output = self.out(concat)

        return output

    

## 残差连接和Layer Normalization
### 残差连接
我们在上一步得到了经过 self-attention 加权之后输出，也就是Self-Attention(Q，K，V)，然后把他们加起来做残差连接
$$
X_{embedding}= self-Attention(Q,K,V)
$$
### Layer Normalization
Layer Normalization 的作用是把神经网络中隐藏层归一为标准正态分布，也就是 i.i.d 独立同分布，以起到加快训练速度，加速收敛的作用.
$$
\mu_j = \frac{1}{m} \sum_{i=1}^{m} x_{ij}
$$
上式以矩阵的列为单位求均值
$$
\sigma_j^2 = \frac{1}{m} \sum_{i=1}^{m} (x_{ij} - \mu_j)^2
$$
上式以矩阵的列为单位求方差
$$
\mathrm{LayerNorm}(x) = \frac{x_{ij} - \mu_j}{\sqrt{\sigma_j^2 + \epsilon}}
$$
然后用每一列的每一个元素减去这列的均值，再除以这列的标准差，从而得到归一化后的数值，加$\epsilon$是为了防止分母为 0

![](./picture/image6.png)

In [32]:
class Norm(nn.Module):
    def __init__(self, d_model, eps = 1e-6):
        super().__init__()

        self.d_model = d_model

        # 层归一化包含两个可学习参数
        self.alpha = nn.Parameter(torch.ones(self.d_model))
        self.bias = nn.Parameter(torch.zeros(self.d_model))

        self.eps = eps

    def forward(self, x):
        #对于每句话的每个token得到的特征向量求均值方差
        mean = x.mean(dim=-1, keepdim = True)
        std = x.std(dim=-1, keepdim = True)
        norm = self.alpha * (x - mean) / (std + self.eps) + self.bias
        return norm


## FeedForward Layer
前馈层就是简单做了两次线性变换，中间有个dropout

In [33]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()

        # d_ff 默认设为 2048
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x

## Transformer Encoder 整体结构
用公式把一个Encoder block的计算过程表达出来：
1) 字向量与位置编码
$$
X = \mathrm{Embedding-Lookup}(X) + \mathrm{Positional-Encoding}
$$
2) 自注意力机制
$$
Q = \mathrm{Linear}_q(X) = XW_Q 
$$
$$
K = \mathrm{Linear}_k(X) = XW_K 
$$
$$
V = \mathrm{Linear}_v(X) = XW_V 
$$
$$
X_{\text{attention}} = \text{Self-Attention}(Q, K, V)
$$
3) self-attention 残差连接与Layer Normalization
$$
X_{\text{attention}} = X + X_{\text{attention}} 
$$
$$
X_{\text{attention}} = \text{LayerNorm}(X_{\text{attention}})
$$
4) 前馈层feedforward，其实就是两层线性映射并用激活函数激活，比如说 ReLU
$$
X_{\mathrm{hidden}} = \mathrm{Linear}(\mathrm{ReLU}(\mathrm{Linear}(X_{\mathrm{attention}})))
$$
5) FeedForward 残差连接与 Layer Normalization
$$
X_{\text{hidden}} = X_{\text{attention}} + X_{\text{hidden}}
$$
$$
X_{\text{hidden}} = \text{LayerNorm}(X_{\text{hidden}})
$$
其中
$$
X_{\mathrm{hidden}} \in \mathbb{R}^{\mathrm{batch\_size} \times \mathrm{seq\_len.} \times \mathrm{embed\_dim}}
$$

In [34]:
#可重复的编码器层
class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.norm_1 = Norm(d_model)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.norm_2 = Norm(d_model)

    def forward(self, x, mask):
        attn_output = self.attn(x, x, x, mask)
        attn_output = self.dropout_1(attn_output)
        x = x + attn_output
        x = self.norm_1(x)
        ff_output = self.ff(x)
        ff_output = self.dropout_2(ff_output)
        x = x + ff_output
        x = self.norm_2(x)
        return x

In [35]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.d_model = d_model
        self.heads = heads
        self.N = N
        self.vocab_size = vocab_size
        self.dropout = dropout
        self.embedding = Embedding(vocab_size, d_model)
        self.pos_encoding = positionalEncoding(d_model, dropout=dropout)
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        #self.norm = Norm(d_model)
    def forward(self, src, mask):
        x = self.embedding(src)
        x = self.pos_encoding(x)
        for i in range(self.N):
            x = self.layers[i](x, mask)  
        return x

## Transformer Decoder 整体结构
我们先从HighLevel 的角度观察-下 Decoder 结构，从下到上依次是:
1) Masked Multi-Head Self-Attention
2) Multi-Head Encoder-Decoder Attention
3) FeedForward Network
和 Encoder 一样，上面三个部分的每一个部分，都有一个残差连接，后接一个LayerNormalization Decoder 的中间部件并不复杂，大部分在前面 Encoder 里我们已经介绍过了，但是 Decoder 由于其特殊的功能，因此在训练时会涉及到一些细节
![](./picture/image0.png)

## Masked Self-Attention
具体来说，传统 Seq2Seq中 Decoder使用的是 RNN 模型，因此在训练过程中输入t时刻的词，模型无论如何也看不到未来时刻的词，因为循环神经网络是时间驱动的，只有当t时刻运算结束了，才能看到t+1时刻的词。而 Transformer Decoder 抛弃了 RNN，改为 Self-Attention，由此就产生了一个问题，在训练过程中，整个 ground truth 都暴露在 Decoder中，这显然是不对的，我们需要对 Decoder 的输入进行一些处理，该处理被称为 Mask.
举个例子，Decoder 的 ground truth 为"<start> l am fine"，我们将这个句子输入到Decoder 中，经过 WordEmbedding 和 Positional Encoding 之后，将得到的矩阵做三次线性变换$W_Q, W_K, W_V$。然后进行 self-attention 操作，首先通过$ \frac{Q \times K^T}{\sqrt{d_k}}$得到layarScaled Scores，接下来非常关键，我们要对 Scaled Scores 进行 Mask，举个例子，当我们输入"I"时，模型目前仅知道包括"I"在内之前所有字的信息，即"<start>"和"I"的信息,不应该让其知道"I"之后词的信息。道理很简单，我们做预测的时候是按照顺序一个字一个字的预测，怎么能这个字都没预测完，就已经知道后面字的信息了呢?Mask 非常简单，首先生成一个下三角全 0，上三角全为负无穷的矩阵，然后将其与 Scaled Scores 相加即可
![](./picture/image7.png)
之后再做 softmax，就能将-inf 变为 0，得到的这个矩阵即为每个字之间的权重
![](./picture/image8.png)

In [42]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        attn_output_1 = self.attn_1(x, x, x, trg_mask)
        attn_output_1 = self.dropout_1(attn_output_1)
        x = x + attn_output_1
        x = self.norm_1(x)
        attn_output_2 = self.attn_2(x, e_outputs, e_outputs, src_mask)
        attn_output_2 = self.dropout_2(attn_output_2)
        x = x + attn_output_2
        x = self.norm_2(x)

        ff_output = self.ff(x)
        ff_output = self.dropout_3(ff_output)
        x = x + ff_output
        x = self.norm_3(x)

        return x

In [43]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedding(vocab_size, d_model)
        self.pe = positionalEncoding(d_model, dropout=dropout)
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.embed(trg)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, e_outputs, src_mask, trg_mask)
        return self.norm(x)

In [44]:
class Transformer(nn.Module):
    def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads, dropout)
        self.decoder = Decoder(trg_vocab, d_model, N, heads, dropout)
        self.out = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        e_outputs = self.encoder(src, src_mask) #166*8*512
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        output = self.out(d_output) #166*8*13568
        return output

In [46]:
##训练代码
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
def nopeak_mask(size, device):
    np_mask = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    np_mask = (torch.from_numpy(np_mask == 0)).to(device) # Move mask to device
    # Using Variable is deprecated, but keeping for compatibility
    # np_mask = Variable(np_mask)
    return np_mask

def create_masks(src, trg, src_pad, trg_pad):
    
    src_mask = (src != src_pad).unsqueeze(-2) # 136*1*8

    if trg is not None:
        trg_mask = (trg != trg_pad).unsqueeze(-2) # 136*1*11
        size = trg.size(1)
        # Pass the device of trg to nopeak_mask
        np_mask = nopeak_mask(size, trg.device)
        trg_mask = trg_mask & np_mask  #136*11*11
        #此时的trg_mask是一个上三角为0、下三角及对角线为1的mask，且pad部分也为0
    else:
        trg_mask = None
    return src_mask, trg_mask

model = Transformer(src_vocab, trg_vocab, d_model, N, heads, dropout)
model.to(device) # Move model to device
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

optim = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# 模型训练
def train_model(epochs, print_every=100):
    model.train()

    start = time.time()
    temp = start

    total_loss = 0

    for epoch in range(epochs):
        for i, batch in enumerate(train_iter):
            src = batch.src.transpose(0, 1).to(device) # Move src to device
            trg = batch.trg.transpose(0, 1).to(device) # Move trg to device (107*14)
            # 将我们输入的英语句子中的所有单词翻译成法语
            # 除了最后一个单词，因为它为结束符，不需要进行下一个单词的预测

            trg_input = trg[:, :-1] # trg_input is already on device as it's a slice of trg

            # 试图预测单词 这一变量用于计算损失，其中每一句目标语句都删去起始符<sos>
            targets = trg[:, 1:].contiguous().view(-1).to(device) # Move targets to device

            # 使用掩码代码创建函数来制作掩码
            src_mask, trg_mask = create_masks(src, trg_input, src_pad, trg_pad)
            src_mask = src_mask.to(device) # Move src_mask to device
            trg_mask = trg_mask.to(device) # Move trg_mask to device

            preds = model(src, trg_input, src_mask, trg_mask)

            optim.zero_grad()

            loss = F.cross_entropy(preds.view(-1, preds.size(-1)), 
                                   targets, ignore_index=trg_pad)
            loss.backward()
            optim.step()

            total_loss += loss.item()
            if (i + 1) % print_every == 0:
                loss_avg = total_loss / print_every
                print("time = %dm, epoch %d, iter = %d, loss = %.3f, %ds per %d iters" % 
                      ((time.time() - start) // 60, epoch + 1, i + 1, loss_avg, 
                       time.time() - temp, print_every))
                total_loss = 0
                temp = time.time()

Using device: cuda


In [47]:
#模型测试
def translate(src, max_len=80, custom_string=False):
    model.eval()  # 设置模型为评估模式（推理时不启用dropout等）
    if custom_string == True:
        src = tokenize_en(src, EN_TEXT)  # 如果输入是自定义字符串，先分词并转为索引
        src = torch.LongTensor(src)      # 转为LongTensor类型
    print(src)  # 打印输入的索引序列 (1*5)
    src = src.to(device)  # 将输入移动到指定设备（CPU或GPU）
    src_mask = (src != src_pad).unsqueeze(-2).to(device)  # 构造源语言的mask，并移动到device
    e_outputs = model.encoder(src.unsqueeze(0), src_mask)  # 通过encoder得到编码结果，增加batch维度

    # 创建目标输出序列的张量，初始全为0，放到device上
    outputs = torch.zeros(max_len).type_as(src.data).to(device)
    # 设置第一个token为<sos>（起始符），并移动到device
    outputs[0] = torch.LongTensor([FR_TEXT.vocab.stoi['<sos>']]).to(device)

    # 逐步生成目标序列
    for i in range(1, max_len):
        # 构造目标mask（下三角矩阵，防止看到未来信息）
        trg_mask = np.triu(np.ones((1, i, i)).astype('uint8'))
        trg_mask = Variable(torch.from_numpy(trg_mask) == 0).to(device)  # 转为torch张量并移动到device

        # 解码器生成下一个token的概率分布
        out = model.out(model.decoder(outputs[:i].unsqueeze(0), 
                                      e_outputs, src_mask, trg_mask))
        out = F.softmax(out, dim=-1)  # 对最后一维做softmax，得到概率分布
        val, ix = out[:, -1].data.topk(1)  # 取概率最大的token索引

        outputs[i] = ix[0][0]  # 将预测的token写入outputs
        if ix[0][0] == FR_TEXT.vocab.stoi['<eos>']:  # 如果预测到<eos>，停止生成
            break
    # 将索引序列转为单词，并用空格拼接成句子返回
    return ' '.join(
        [FR_TEXT.vocab.itos[ix] for ix in outputs[:i]]
    )

In [48]:
train_model(1)
words = 'Let me see.'
print(translate(words, custom_string=True))

time = 0m, epoch 1, iter = 100, loss = 7.153, 11s per 100 iters
time = 0m, epoch 1, iter = 200, loss = 5.715, 11s per 100 iters
time = 0m, epoch 1, iter = 300, loss = 5.492, 11s per 100 iters
time = 0m, epoch 1, iter = 400, loss = 5.406, 11s per 100 iters
time = 0m, epoch 1, iter = 500, loss = 5.370, 11s per 100 iters
time = 1m, epoch 1, iter = 600, loss = 5.302, 11s per 100 iters
time = 1m, epoch 1, iter = 700, loss = 5.207, 11s per 100 iters
time = 1m, epoch 1, iter = 800, loss = 5.234, 11s per 100 iters
time = 1m, epoch 1, iter = 900, loss = 5.169, 11s per 100 iters
time = 1m, epoch 1, iter = 1000, loss = 5.218, 11s per 100 iters
tensor([88, 19, 86,  2])
<sos> . . . . .
