[machine translation corpus](https://www.manythings.org/anki/)

In [2]:
!wget https://www.manythings.org/anki/cmn-eng.zip -P ./data/

--2023-02-04 18:40:22--  https://www.manythings.org/anki/cmn-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1270861 (1.2M) [application/zip]
Saving to: ‘./data/cmn-eng.zip’


2023-02-04 18:40:27 (370 KB/s) - ‘./data/cmn-eng.zip’ saved [1270861/1270861]



In [6]:
!unzip ./data/cmn-eng.zip -d ./data/

Archive:  ./data/cmn-eng.zip
  inflating: ./data/cmn.txt          
  inflating: ./data/_about.txt       


In [1]:
import time
from collections import Counter
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

import zhconv
import jieba

## load corpus

Load the sentence pairs in Chinese and English.

In [2]:
def load_data(path='./data/cmn.txt'):
    """Read the tab-separated English/Chinese corpus.

    Each line is expected to hold ``english<TAB>chinese<TAB>attribution``.
    Chinese text is converted from traditional to simplified characters.

    Args:
        path: Location of the corpus file (defaults to the unzipped download).

    Returns:
        ``(chi_list, eng_list)``: two parallel lists of sentences.
    """
    chi_list = []
    eng_list = []
    # Explicit UTF-8: the platform default encoding cannot be relied on to
    # decode Chinese text (e.g. cp1252 on Windows).
    with open(path, encoding='utf-8') as f:
        for line in f:
            fields = line.rstrip('\n').split('\t')
            if len(fields) < 2:
                # Blank or malformed line: nothing to pair up.
                continue
            eng_sent, chi_sent = fields[0], fields[1]

            # traditional to simplified, e.g. '走開！' -> '走开！'
            chi_sent = zhconv.convert(chi_sent, 'zh-cn')

            chi_list.append(chi_sent)
            eng_list.append(eng_sent)
    return chi_list, eng_list

In [3]:
chi_list, eng_list = load_data()

In [4]:
list(zip(chi_list[:3], eng_list[:3]))

[('嗨。', 'Hi.'), ('你好。', 'Hi.'), ('你用跑的。', 'Run.')]

In [5]:
list(zip(chi_list[-3:], eng_list[-3:]))

[('你很容易把母语说得通顺流畅，却很容易把非母语说得不自然。',
  "It's very easy to sound natural in your own native language, and very easy to sound unnatural in your non-native language."),
 ('虽然我被公司解雇了，但是我还有点存款，所以目前不用担心生计问题。',
  "I got fired from the company, but since I have a little money saved up, for the time being, I won't have trouble with living expenses."),
 ('如果一个人在成人前没有机会习得目标语言，他对该语言的认识达到母语者程度的机会是相当小的。',
  "If a person has not had a chance to acquire his target language by the time he's an adult, he's unlikely to be able to reach native speaker level in that language.")]

## data preprocessing

In [7]:
def is_space(word):
    """Return True when `word` is falsy (None, '') or whitespace only.

    Whitespace covers characters such as '\\t', '\\n', '\\r', '\\u202f' and '\\xa0'.
    """
    return not word or word.isspace()

In [8]:
words = ['', ' ', 'i' , 'recommend', 'item']
[w for w in words if not is_space(w)]

['i', 'recommend', 'item']

In [9]:
def fill_space(text):
    """Insert a space before ',', '.', '!' and '?' when none is there yet.

    Intended for English text only, so that a plain split on ' ' yields the
    punctuation marks as separate tokens. The first character is never
    prefixed.
    """
    punctuation = set(',.!?')
    pieces = []
    previous = None  # character preceding the current one in `text`
    for char in text:
        if previous is not None and char in punctuation and previous != ' ':
            pieces.append(' ' + char)
        else:
            pieces.append(char)
        previous = char
    return ''.join(pieces)

In [10]:
fill_space("It's very easy to sound natural in your own native language, and very easy to sound unnatural in your non-native language.")

"It's very easy to sound natural in your own native language , and very easy to sound unnatural in your non-native language ."

In [11]:
def tokenize(chi_list, eng_list):
    """Split both sides of the corpus into word tokens.

    Chinese sentences are segmented with jieba; English sentences are
    lower-cased, get punctuation detached via `fill_space`, and are split on
    single spaces. Empty and whitespace-only tokens are dropped on both sides.

    Returns:
        ``(source, target)``: lists of token lists (Chinese, English).
    """
    source = [
        [w for w in jieba.cut(sent) if not is_space(w)]
        for sent in chi_list
    ]
    target = [
        [w for w in fill_space(sent.lower()).split(' ') if not is_space(w)]
        for sent in eng_list
    ]
    return source, target

In [12]:
source, target = tokenize(chi_list, eng_list)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.410 seconds.
Prefix dict has been built successfully.


In [13]:
source[:5]

[['嗨', '。'], ['你好', '。'], ['你', '用', '跑', '的', '。'], ['住手', '！'], ['等等', '！']]

In [14]:
target[:5]

[['hi', '.'], ['hi', '.'], ['run', '.'], ['stop', '!'], ['wait', '!']]

In [22]:
# One row per sentence pair: [number of Chinese tokens, number of English tokens].
arr = torch.tensor(
    [[len(sent_cn), len(sent_eng)] for sent_cn, sent_eng in zip(source, target)],
    dtype=torch.float32,
)

In [23]:
arr.max(dim=0)

torch.return_types.max(
values=tensor([30., 34.]),
indices=tensor([29362, 29370]))

In [24]:
arr.mean(dim=0)

tensor([6.6265, 7.2073])

In [25]:
# Mean sentence length is about 7 tokens on both sides, so num_steps = 10 is enough for most pairs.

In [26]:
class Vocab(object):
    """Two-way mapping between tokens and integer indices.

    Index 0 is always '<unk>'. The reserved tokens follow in the order given,
    then the corpus tokens in order of decreasing frequency.
    """

    def __init__(self, tokens, min_freq=0, reserved_tokens=['<pad>', '<bos>', '<eos>']):
        """Build the vocabulary.

        Args:
            tokens: Iterable of token sequences (one sequence per sentence).
            min_freq: Corpus tokens seen fewer than this many times are left
                out and therefore map to '<unk>'.
            reserved_tokens: Special tokens placed right after '<unk>'.
        """
        # Flatten the sentences into a single stream of tokens.
        tokens = [token for line in tokens for token in line]
        self._token_freqs = sorted(Counter(tokens).items(), key=lambda x: x[1], reverse=True)
        # list(...) copies the argument, so the shared default list is never
        # aliased and tuples are accepted as well.
        self.idx_to_token = ['<unk>'] + list(reserved_tokens)
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}

        for token, freq in self._token_freqs:
            if freq < min_freq:
                # Frequencies are sorted in descending order: nothing later qualifies.
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        """Number of entries, including '<unk>' and the reserved tokens."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (possibly nested) list/tuple of tokens, to indices.

        Unknown tokens map to `self.unk`.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a list/tuple of indices, back to tokens."""
        if not isinstance(indices, (list, tuple)):
            # Fixed: the list must be indexed; the original called it like a
            # function, which raised TypeError for any scalar index.
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):
        """Index of the unknown token."""
        return 0

    @property
    def token_freqs(self):
        """List of (token, count) pairs sorted by decreasing count."""
        return self._token_freqs

### vocabulary

In [27]:
src_vocab = Vocab(source, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>'])
tgt_vocab = Vocab(target, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>'])

In [28]:
src_vocab['<pad>'], src_vocab['<unk>']

(1, 0)

In [29]:
src_vocab[['<unk>', '<pad>', '<bos>', '<eos>', '。']]

[0, 1, 2, 3, 4]

In [30]:
def truncate_pad(line, num_steps, padding_token):
    """Return `line` cut or right-padded to exactly `num_steps` items."""
    missing = num_steps - len(line)
    if missing < 0:
        # Too long: keep only the first `num_steps` items.
        return line[:num_steps]
    # Short enough: append as many padding tokens as are missing.
    return line + [padding_token] * missing

In [31]:
def to_tensor(lines, vocab, num_steps):
    """Turn tokenized sentences into a fixed-width index tensor.

    Every sentence is mapped to indices, gets '<eos>' appended, and is then
    truncated or padded with '<pad>' to `num_steps` entries.

    Returns:
        ``(arr, valid_len)``: the index tensor with one row per sentence, and
        the number of non-padding entries in each row.
    """
    eos_idx = vocab['<eos>']
    pad_idx = vocab['<pad>']
    rows = [
        truncate_pad(vocab[line] + [eos_idx], num_steps, pad_idx)
        for line in lines
    ]
    arr = torch.tensor(rows)

    not_padding = (arr != pad_idx).type(torch.int32)
    valid_len = not_padding.sum(dim=1)

    return arr, valid_len

### parameters

In [32]:
batch_size, num_steps = 128, 10

In [33]:
src_arr, src_valid_len = to_tensor(source, src_vocab, num_steps=num_steps)
tgt_arr, tgt_valid_len = to_tensor(target, tgt_vocab, num_steps=num_steps)

In [34]:
src_arr[:5], src_valid_len[:5]

(tensor([[2500,    4,    3,    1,    1,    1,    1,    1,    1,    1],
         [1210,    4,    3,    1,    1,    1,    1,    1,    1,    1],
         [   8,  112,  324,    6,    4,    3,    1,    1,    1,    1],
         [4522,   72,    3,    1,    1,    1,    1,    1,    1,    1],
         [2501,   72,    3,    1,    1,    1,    1,    1,    1,    1]]),
 tensor([3, 3, 6, 3, 3]))

In [35]:
def to_dataloader(arr_list, batch_size, is_train=True):
    """Wrap equally long tensors in a DataLoader; shuffle only when training."""
    return DataLoader(TensorDataset(*arr_list), batch_size, shuffle=is_train)

In [36]:
src_valid_len[:5], src_valid_len[-5:]

(tensor([3, 3, 6, 3, 3]), tensor([10, 10, 10, 10, 10]))

In [37]:
dataloader = to_dataloader((src_arr, src_valid_len, tgt_arr, tgt_valid_len), batch_size=batch_size)

In [38]:
for e in iter(dataloader):
    print(e[0].shape, e[1].shape, e[2].shape, e[3].shape)
    break

torch.Size([128, 10]) torch.Size([128]) torch.Size([128, 10]) torch.Size([128])


## seq2seq with attention

### functions

In [46]:
def sequence_mask(X, valid_len, value=0):
    """Overwrite the entries of each row that lie beyond its valid length.

    `X` is modified in place and also returned. Row ``i`` keeps its first
    ``valid_len[i]`` positions along axis 1; the rest are set to `value`.
    """
    positions = torch.arange(X.size(1), dtype=torch.float32, device=X.device)
    # (1, maxlen) against (rows, 1) broadcasts to a (rows, maxlen) boolean grid.
    keep = positions.unsqueeze(0) < valid_len.unsqueeze(1)
    X[~keep] = value
    return X

In [47]:
def masked_softmax(X, valid_lens):
    """Softmax over the last axis that ignores positions past the valid length.

    Args:
        X: 3D tensor of scores.
        valid_lens: None (plain softmax), a 1D tensor with one length per
            batch entry, or a 2D tensor with one length per row of `X`.
    """
    if valid_lens is None:
        return nn.functional.softmax(X, dim=-1)

    orig_shape = X.shape
    if valid_lens.dim() == 1:
        # One length per batch entry: repeat it for every row of that entry.
        flat_lens = torch.repeat_interleave(valid_lens, orig_shape[1])
    else:
        flat_lens = valid_lens.reshape(-1)

    # Masked positions get a very large negative score, so their
    # exponentiation is effectively 0.
    flat_scores = sequence_mask(X.reshape(-1, orig_shape[-1]), flat_lens, value=-1e6)
    return nn.functional.softmax(flat_scores.reshape(orig_shape), dim=-1)

### interface

In [48]:
class Encoder(nn.Module):
    """Abstract encoder half of an encoder-decoder model."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, X, *args):
        """Encode `X`; concrete encoders must override this."""
        raise NotImplementedError

class Decoder(nn.Module):
    """Abstract decoder half of an encoder-decoder model."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def init_state(self, enc_outputs, *args):
        """Build the initial decoder state from the encoder outputs."""
        raise NotImplementedError

    def forward(self, X, state):
        """Decode `X` given `state`; concrete decoders must override this."""
        raise NotImplementedError

class EncoderDecoder(nn.Module):
    """Container that chains an encoder and a decoder."""

    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, enc_X, dec_X, *args):
        """Encode `enc_X`, derive the decoder state, then decode `dec_X`.

        The extra positional `args` are forwarded both to the encoder and to
        the decoder's `init_state`.
        """
        initial_state = self.decoder.init_state(self.encoder(enc_X, *args), *args)
        return self.decoder(dec_X, initial_state)
        
class AttentionDecoder(Decoder):
    """Base interface for decoders that use an attention mechanism."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def attention_weights(self):
        """Attention weights of the decoder; subclasses must provide them."""
        raise NotImplementedError

In [49]:
class AdditiveAttention(nn.Module):
    """Additive attention: score(q, k) = w_v^T tanh(W_q q + W_k k)."""

    def __init__(self, key_size, query_size, num_hiddens, dropout, **kwargs):
        super().__init__(**kwargs)
        self.W_k = nn.Linear(key_size, num_hiddens, bias=False)
        self.W_q = nn.Linear(query_size, num_hiddens, bias=False)
        self.w_v = nn.Linear(num_hiddens, 1, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, queries, keys, values, valid_lens):
        """Return the attention-weighted combination of `values`.

        The weights of the latest call are kept in `self.attention_weights`
        with shape (batch_size, no. of queries, no. of key-value pairs).
        """
        projected_q = self.W_q(queries)
        projected_k = self.W_k(keys)
        # (batch, queries, 1, hidden) + (batch, 1, pairs, hidden) broadcasts
        # to one feature vector per (query, key) combination.
        features = torch.tanh(projected_q.unsqueeze(2) + projected_k.unsqueeze(1))
        # `w_v` emits a single number per combination; drop that trailing axis.
        scores = self.w_v(features).squeeze(-1)
        self.attention_weights = masked_softmax(scores, valid_lens)
        # `values`: (batch_size, no. of key-value pairs, value dimension)
        weights = self.dropout(self.attention_weights)
        return torch.bmm(weights, values)

In [50]:
class Seq2SeqEncoder(Encoder):
    """GRU encoder for sequence-to-sequence learning.

    Example:
        encoder = Seq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2)
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super().__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, num_hiddens, num_layers,
                          dropout=dropout)

    def forward(self, X, *args):
        """Encode a batch of token indices.

        Returns:
            ``(output, state)`` with shapes
            (num_steps, batch_size, num_hiddens) and
            (num_layers, batch_size, num_hiddens).
        """
        # (batch_size, num_steps) -> (batch_size, num_steps, embed_size)
        embedded = self.embedding(X)
        # The GRU expects the time-step axis first.
        steps_first = embedded.permute(1, 0, 2)
        # No initial state is passed, so the GRU starts from zeros.
        output, state = self.rnn(steps_first)
        return output, state

In [51]:
class Seq2SeqAttentionDecoder(AttentionDecoder):
    """GRU decoder that attends over the encoder outputs at every time step."""

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, dropout=0, **kwargs):
        super().__init__()

        self.attention = AdditiveAttention(num_hiddens, num_hiddens, num_hiddens, dropout)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Each GRU input is the token embedding concatenated with the context vector.
        self.rnn = nn.GRU(embed_size + num_hiddens, num_hiddens, num_layers, dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, enc_valid_lens, *args):
        """Repackage the encoder result as the decoder state.

        `enc_outputs` is the encoder's ``(output, state)`` pair with shapes
        (num_steps, batch_size, num_hiddens) and
        (num_layers, batch_size, num_hiddens).

        Returns:
            ``(outputs, hidden_state, enc_valid_lens)`` where `outputs` has
            been permuted to (batch_size, num_steps, num_hiddens) and
            `hidden_state` keeps shape (num_layers, batch_size, num_hiddens).
        """
        outputs, hidden_state = enc_outputs
        batch_first_outputs = outputs.permute(1, 0, 2)
        return (batch_first_outputs, hidden_state, enc_valid_lens)

    def forward(self, X, state):
        """Decode the token indices `X` of shape (batch_size, num_steps).

        Returns:
            A pair of the logits with shape (batch_size, num_steps, vocab_size)
            and the state ``[enc_outputs, hidden_state, enc_valid_lens]``.
            `enc_outputs` is passed through unchanged; `hidden_state` keeps
            its shape (num_layers, batch_size, num_hiddens) but holds the
            values after the last step.
        """
        # enc_outputs: (batch_size, num_steps, num_hiddens)
        # hidden_state: (num_layers, batch_size, num_hiddens)
        enc_outputs, hidden_state, enc_valid_lens = state

        # (batch_size, num_steps, embed_size) -> (num_steps, batch_size, embed_size)
        embedded = self.embedding(X).permute(1, 0, 2)

        step_outputs = []
        self._attention_weights = []

        # One iteration per time step (i.e. per target token).
        for step_embed in embedded:
            # step_embed: (batch_size, embed_size)

            # The query is the hidden state of the top GRU layer only:
            # (batch_size, num_hiddens) -> (batch_size, 1, num_hiddens).
            query = hidden_state[-1].unsqueeze(1)

            # The encoder outputs act as both keys and values. With a single
            # query the context (weighted mean of the values) has shape
            # (batch_size, 1, num_hiddens).
            context = self.attention(query, enc_outputs, enc_outputs, enc_valid_lens)

            # Concatenate on the feature axis:
            # (batch_size, 1, embed_size + num_hiddens).
            rnn_input = torch.cat((context, step_embed.unsqueeze(1)), dim=-1)

            # The GRU takes (seq_len=1, batch_size, embed_size + num_hiddens)
            # plus the running hidden state, which needs no concatenation.
            out, hidden_state = self.rnn(rnn_input.permute(1, 0, 2), hidden_state)
            # out: (1, batch_size, num_hiddens)

            step_outputs.append(out)
            self._attention_weights.append(self.attention.attention_weights)

        # Stack the steps along axis 0 -> (num_steps, batch_size, num_hiddens),
        # then project to (num_steps, batch_size, vocab_size).
        logits = self.dense(torch.cat(step_outputs, dim=0))

        return logits.permute(1, 0, 2), [enc_outputs, hidden_state, enc_valid_lens]

    @property
    def attention_weights(self):
        """Attention weights recorded during the latest forward call, one entry per step."""
        return self._attention_weights

In [52]:
len(tgt_vocab)

4581

In [53]:
len(src_vocab)

6537

### training utilities

In [54]:
def grad_clipping(net, theta):
    """Rescale all gradients in place so that their global L2 norm is at most `theta`.

    Args:
        net: An `nn.Module`, or any object exposing its parameters as `net.params`.
        theta: Upper bound for the L2 norm over all gradients together.

    Parameters without a gradient (for example ones that took no part in the
    last backward pass) are skipped instead of raising an error.
    """
    if isinstance(net, nn.Module):
        params = [p for p in net.parameters() if p.requires_grad]
    else:
        params = net.params
    grads = [p.grad for p in params if p.grad is not None]
    if not grads:
        # Nothing to clip; also avoids torch.sqrt on the plain int 0 that
        # sum() returns for an empty sequence.
        return
    norm = torch.sqrt(sum(torch.sum(g ** 2) for g in grads))
    if norm > theta:
        scale = theta / norm
        for g in grads:
            g.mul_(scale)

In [55]:
class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    """Softmax cross-entropy loss that zeroes the contribution of padded positions."""

    def forward(self, pred, label, valid_len):
        """Return one loss value per sequence.

        Args:
            pred: (batch_size, num_steps, vocab_size)
            label: (batch_size, num_steps)
            valid_len: (batch_size,)
        """
        # Keep the per-token losses so that the mask can be applied to them.
        self.reduction = 'none'
        # 1 for positions inside the valid length, 0 beyond it.
        mask = sequence_mask(torch.ones_like(label), valid_len)
        # The parent loss expects the class axis in position 1.
        per_token = super().forward(pred.permute(0, 2, 1), label)
        return (per_token * mask).mean(dim=1)

In [56]:
class Accumulator(object):
    """Keeps `n` running float sums side by side."""

    def __init__(self, n):
        """Start with `n` sums, all at 0.0."""
        self.data = [0.0] * n

    def add(self, *args):
        """Add the i-th argument (converted to float) to the i-th sum."""
        updated = []
        for total, increment in zip(self.data, args):
            updated.append(total + float(increment))
        self.data = updated

    def reset(self):
        """Set every sum back to 0.0."""
        self.data = [0.0 for _ in self.data]

    def __getitem__(self, idx):
        return self.data[idx]

In [57]:
class Timer:
    """Stopwatch that records a list of measured durations."""

    def __init__(self):
        """Create an empty record and start timing right away."""
        self.times = []
        self.start()

    def start(self):
        """Remember the current time as the start of a measurement."""
        self.tik = time.time()

    def stop(self):
        """Record the time elapsed since the last start and return it."""
        elapsed = time.time() - self.tik
        self.times.append(elapsed)
        return self.times[-1]

    def avg(self):
        """Mean of the recorded durations."""
        return sum(self.times) / len(self.times)

    def sum(self):
        """Total of the recorded durations."""
        return sum(self.times)

    def cumsum(self):
        """Running totals of the recorded durations, as a list."""
        return np.array(self.times).cumsum().tolist()

In [58]:
def try_gpu(i=0):
    """Return the i-th CUDA device when it exists, otherwise the CPU device."""
    has_device = i < torch.cuda.device_count()
    return torch.device(f'cuda:{i}') if has_device else torch.device('cpu')

## Train

In [66]:
# Model hyperparameters; all four are passed to both the encoder and the
# decoder constructor in the next cell.
embed_size, num_hiddens, num_layers, dropout = 100, 128, 2, 0.1
# Same values as in the data-preparation "parameters" cell above.
batch_size, num_steps = 128, 10
lr = 0.005  # learning rate handed to train_seq2seq
num_epochs = 100
device = try_gpu()  # cuda:0 when a CUDA device exists, otherwise cpu

In [67]:
encoder = Seq2SeqEncoder(len(src_vocab), embed_size, num_hiddens, num_layers, dropout)
decoder = Seq2SeqAttentionDecoder(len(tgt_vocab), embed_size, num_hiddens, num_layers, dropout)

In [68]:
net = EncoderDecoder(encoder, decoder)

In [69]:
net

EncoderDecoder(
  (encoder): Seq2SeqEncoder(
    (embedding): Embedding(6537, 100)
    (rnn): GRU(100, 128, num_layers=2, dropout=0.1)
  )
  (decoder): Seq2SeqAttentionDecoder(
    (attention): AdditiveAttention(
      (W_k): Linear(in_features=128, out_features=128, bias=False)
      (W_q): Linear(in_features=128, out_features=128, bias=False)
      (w_v): Linear(in_features=128, out_features=1, bias=False)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embedding): Embedding(4581, 100)
    (rnn): GRU(228, 128, num_layers=2, dropout=0.1)
    (dense): Linear(in_features=128, out_features=4581, bias=True)
  )
)

In [70]:
def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device):
    """Train an encoder-decoder model with teacher forcing.

    Linear and GRU weights are re-initialized with Xavier-uniform, the model
    is moved to `device`, and Adam is run for `num_epochs` passes over
    `data_iter`, which must yield (X, X_valid_len, Y, Y_valid_len) batches.
    One progress line is printed per epoch.
    """

    def _init_xavier(module):
        module_type = type(module)
        if module_type == nn.Linear:
            nn.init.xavier_uniform_(module.weight)
        if module_type == nn.GRU:
            # Only the weight matrices are re-initialized, not the biases.
            weight_names = [name for name in module._flat_weights_names if "weight" in name]
            for name in weight_names:
                nn.init.xavier_uniform_(module._parameters[name])

    net.apply(_init_xavier)
    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = MaskedSoftmaxCELoss()
    net.train()

    for epoch in range(num_epochs):
        timer = Timer()
        metric = Accumulator(2)  # [sum of training loss, no. of tokens]
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = (tensor.to(device) for tensor in batch)

            # Teacher forcing: the decoder sees '<bos>' followed by the
            # reference tokens shifted right by one position.
            bos_column = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                                      device=device).reshape(-1, 1)
            dec_input = torch.cat([bos_column, Y[:, :-1]], 1)

            Y_hat, _ = net(X, dec_input, X_valid_len)
            batch_loss = criterion(Y_hat, Y, Y_valid_len)
            batch_loss.sum().backward()  # reduce to a scalar for `backward`
            grad_clipping(net, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(batch_loss.sum(), num_tokens)
        print(f'epoch {epoch} loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} '
              f'tokens/sec on {str(device)}')

In [71]:
train_seq2seq(net, dataloader, lr, num_epochs, tgt_vocab, device)

epoch 0 loss 0.473, 54622.5 tokens/sec on cuda:0
epoch 1 loss 0.352, 54548.2 tokens/sec on cuda:0
epoch 2 loss 0.290, 54331.1 tokens/sec on cuda:0
epoch 3 loss 0.247, 54878.1 tokens/sec on cuda:0
epoch 4 loss 0.217, 55273.2 tokens/sec on cuda:0
epoch 5 loss 0.194, 55305.3 tokens/sec on cuda:0
epoch 6 loss 0.178, 54110.5 tokens/sec on cuda:0
epoch 7 loss 0.164, 55230.0 tokens/sec on cuda:0
epoch 8 loss 0.154, 55048.5 tokens/sec on cuda:0
epoch 9 loss 0.145, 55441.6 tokens/sec on cuda:0
epoch 10 loss 0.137, 55625.6 tokens/sec on cuda:0
epoch 11 loss 0.131, 55383.3 tokens/sec on cuda:0
epoch 12 loss 0.125, 55051.3 tokens/sec on cuda:0
epoch 13 loss 0.121, 53920.6 tokens/sec on cuda:0
epoch 14 loss 0.117, 54853.7 tokens/sec on cuda:0
epoch 15 loss 0.113, 55092.7 tokens/sec on cuda:0
epoch 16 loss 0.110, 54641.0 tokens/sec on cuda:0
epoch 17 loss 0.107, 54876.7 tokens/sec on cuda:0
epoch 18 loss 0.105, 55404.4 tokens/sec on cuda:0
epoch 19 loss 0.103, 54162.2 tokens/sec on cuda:0
epoch 20 l

## predict

In [98]:
def predict_seq2seq(net, src_sentence, src_vocab, tgt_vocab, num_steps,
                    device, save_attention_weights=False):
    """Greedily translate one source sentence.

    Args:
        net: Trained encoder-decoder model.
        src_sentence: Raw (unsegmented) Chinese sentence.
        src_vocab, tgt_vocab: Source and target vocabularies.
        num_steps: Source length after truncation/padding and the maximum
            number of generated tokens.
        device: Device holding the model.
        save_attention_weights: When True, the decoder's attention weights
            are collected after every generated token.

    Returns:
        ``(translation, attention_weight_seq)``: the generated tokens joined
        by spaces, and the collected attention weights (an empty list unless
        `save_attention_weights` is set).
    """
    # Set `net` to eval mode for inference
    net.eval()

    # Segment exactly as `tokenize` does for the training data: whitespace
    # tokens are dropped there, so keeping them here would feed '<unk>'.
    words = [w for w in jieba.cut(src_sentence) if not is_space(w)]

    src_tokens = src_vocab[words] + [src_vocab['<eos>']]
    # The valid length cannot exceed what is left after truncation.
    enc_valid_len = torch.tensor([min(len(src_tokens), num_steps)], device=device)
    src_tokens = truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])

    # Add the batch axis
    enc_X = torch.unsqueeze(
        torch.tensor(src_tokens, dtype=torch.long, device=device), dim=0)
    # Add the batch axis
    dec_X = torch.unsqueeze(torch.tensor(
        [tgt_vocab['<bos>']], dtype=torch.long, device=device), dim=0)

    output_seq, attention_weight_seq = [], []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        enc_outputs = net.encoder(enc_X, enc_valid_len)
        dec_state = net.decoder.init_state(enc_outputs, enc_valid_len)
        for _ in range(num_steps):
            Y, dec_state = net.decoder(dec_X, dec_state)
            # The most likely token becomes the decoder input at the next
            # time step
            dec_X = Y.argmax(dim=2)
            pred = dec_X.squeeze(dim=0).type(torch.int32).item()
            if save_attention_weights:
                attention_weight_seq.append(net.decoder.attention_weights)
            # Once the end-of-sequence token is predicted, the generation of
            # the output sequence is complete
            if pred == tgt_vocab['<eos>']:
                break
            output_seq.append(pred)
    return ' '.join(tgt_vocab.to_tokens(output_seq)), attention_weight_seq

In [80]:
num = 100
for chn, eng in zip(chi_list[:num], eng_list[:num]):
    translation, dec_attention_weight_seq = predict_seq2seq(
        net, chn, src_vocab, tgt_vocab, num_steps, device, True)
    print(f'{chn} => {translation}')

嗨。 => hi .
你好。 => hello .
你用跑的。 => run to run .
住手！ => stop !
等等！ => wait !
等一下！ => hang !
开始！ => begin !
你好。 => hello .
我试试。 => i try .
我赢了。 => i won !
不会吧。 => oh no !
干杯! => <unk> !
知道了没有？ => do you know the way ?
懂了吗？ => do you follow ?
你懂了吗？ => do you have a good time ?
他跑了。 => he ran .
跳进来。 => the <unk> is on strike .
我知道。 => i know .
我退出。 => i quit .
我不干了。 => i quit .
我没事。 => i'm ok .
我已经起来了。 => i've lost .
听着。 => listen .
不可能！ => no problem !
没门！ => get out !
你确定？ => are you sure ?
谢谢！ => thanks !
试试吧。 => try it .
我们来试试。 => we try .
为什么是我？ => why is that me ?
去问汤姆。 => ask tom for .
好棒！ => awesome !
冷静点。 => cool down .
公平点。 => be fair .
友善点。 => be friendly .
友好点。 => be friendly .
和气点。 => take a look at the same .
友善点。 => be friendly .
联系我。 => call me .
联系我们。 => call us .
进来。 => come inside .
找到汤姆。 => find tom .
滚出去！ => get out !
出去！ => go ! ! will hurry up .
走开！ => go away !
滚！ => get lost !
走开！ => go away !
回家。 => go home .
回家吧。 => go home .
再见！ => see you again .
告辞！ => get out

In [81]:
num = -20
for chn, eng in zip(chi_list[num:], eng_list[num:]):
    translation, dec_attention_weight_seq = predict_seq2seq(
        net, chn, src_vocab, tgt_vocab, num_steps, device, True)
    print(f'{chn} => {translation}')

汤姆试过还回泳衣来换成更大一号的，但是员工告诉他那是不被允许的。 => tom tried to return the <unk> for the past days
在十九世纪三十年代的大萧条时期，许多富人在股市崩盘中失去了一切。 => during the depression in the <unk> , many wealthy people
我觉得当汤姆发现他买来的画是赝品的时候，他会很生气。 => i think that tom is going to be pretty mad
为了不被洪水冲走，有的人紧紧地抱着树干长达数个钟头。 => not some people like some tree , but even <unk>
这个工人本来应该在中午十二点到达, 但他被交通堵塞困住了几个小时。 => the boy was divided before noon .
我父母通常用法语对话，即使我母亲的母语是英语。 => my parents usually speak to each other in french ,
就像马克·诺弗勒早期演唱的歌曲《金钱无用》一样，绝大多数的人依然高呼赞成“金钱无用论”。 => the <unk> was made of <unk> <unk> .
假如你在老师讲课的时候再集中一点去听讲的话，你应该就能弄明白了。 => you'd better continue your wish more carefully .
当汤姆开着他破旧的雷泽车来接女儿放学时，他的女儿假装不认识他。 => tom's daughter pretended not to get him to his daughter
许多自然环境保护主义者担心持续屠杀鲸鱼正推动这些动物走向灭绝。 => a lot of <unk> fear .
去年在菲律宾，地震和海啸造成了超过6000人的死亡。 => last people and in the beach last year , earthquakes
“又是汤姆的电话？” “嗯。最近他每天晚上都会打过来。当时就不该给他我的号码的。” => "is he calling tom ?" "i got married ." "i'll
我母亲的法语比我父亲的英语要好，所以他们通常用法语交流。 => 

In [85]:
sentence = "我要去上学"
translation, dec_attention_weight_seq = predict_seq2seq(net, sentence, src_vocab, tgt_vocab, num_steps, device, True)
print(f'{sentence}=>{translation}')

我要去上学=>i go to school .


In [86]:
sentence = "多可爱啊！"
translation, dec_attention_weight_seq = predict_seq2seq(net, sentence, src_vocab, tgt_vocab, num_steps, device, True)
print(f'{sentence}=>{translation}')

多可爱啊！=>how cute !


In [87]:
sentence = "抓住他"
translation, dec_attention_weight_seq = predict_seq2seq(net, sentence, src_vocab, tgt_vocab, num_steps, device, True)
print(f'{sentence}=>{translation}')

抓住他=>grab him .


In [88]:
sentence = "我是一名律师"
translation, dec_attention_weight_seq = predict_seq2seq(net, sentence, src_vocab, tgt_vocab, num_steps, device, True)
print(f'{sentence}=>{translation}')

我是一名律师=>i'm a lawyer .


In [93]:
sentence = "我是一名程序员"
translation, dec_attention_weight_seq = predict_seq2seq(net, sentence, src_vocab, tgt_vocab, num_steps, device, True)
print(f'{sentence}=>{translation}')

我是一名程序员=>i'm a computer programmer .


In [94]:
sentence = "我是一名飞行员"
translation, dec_attention_weight_seq = predict_seq2seq(net, sentence, src_vocab, tgt_vocab, num_steps, device, True)
print(f'{sentence}=>{translation}')

我是一名飞行员=>i'm a professional of brave .


In [97]:
sentence = "我在踢足球"
translation, dec_attention_weight_seq = predict_seq2seq(net, sentence, src_vocab, tgt_vocab, num_steps, device, True)
print(f'{sentence}=>{translation}')

我在踢足球=>i stay in bed .


In [100]:
sentence = "我在飞"
translation, dec_attention_weight_seq = predict_seq2seq(net, sentence, src_vocab, tgt_vocab, num_steps, device, True)
print(f'{sentence}=>{translation}')

我在飞=>i'm on a foreigner .
