In [47]:
import time
import math
from collections import Counter
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

import zhconv
import jieba

In [2]:
import sys
sys.path.append('./codes/')

In [3]:
%load_ext autoreload
%autoreload 1
%aimport common

# Data

## load corpus

load sentences pair in chinese and english

In [4]:
def load_data():
    chi_list = []
    eng_list = []
    with open('./data/cmn.txt') as f:
        for line in f:
            eng_sent, chi_sent, _ = line.split('\t')
            
            # traditional to simplified
            # zhconv.convert('走開！', 'zh-cn')
            chi_sent = zhconv.convert(chi_sent, 'zh-cn')
            
            chi_list.append(chi_sent)
            eng_list.append(eng_sent)
    return chi_list, eng_list

In [5]:
chi_list, eng_list = load_data()

In [6]:
list(zip(chi_list[:3], eng_list[:3]))

[('嗨。', 'Hi.'), ('你好。', 'Hi.'), ('你用跑的。', 'Run.')]

In [7]:
list(zip(chi_list[-3:], eng_list[-3:]))

[('你很容易把母语说得通顺流畅，却很容易把非母语说得不自然。',
  "It's very easy to sound natural in your own native language, and very easy to sound unnatural in your non-native language."),
 ('虽然我被公司解雇了，但是我还有点存款，所以目前不用担心生计问题。',
  "I got fired from the company, but since I have a little money saved up, for the time being, I won't have trouble with living expenses."),
 ('如果一个人在成人前没有机会习得目标语言，他对该语言的认识达到母语者程度的机会是相当小的。',
  "If a person has not had a chance to acquire his target language by the time he's an adult, he's unlikely to be able to reach native speaker level in that language.")]

## data preprocess

In [8]:
def is_space(word):
    if not word:
        # None ''
        return True
    if word.isspace():
        #\t \n \r \u202f \xa0
        return True
    return False

In [9]:
words = ['', ' ', 'i' , 'recommend', 'item']
[w for w in words if not is_space(w)]

['i', 'recommend', 'item']

In [10]:
def fill_space(text):
    # for english only
    # fill space before ,.!? which is convenient to tokenize
    def no_space(char, prev_char):
        return char in set(',.!?') and prev_char != ' '
    
    out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char for i, char in enumerate(text)]
    return ''.join(out)

In [11]:
fill_space("It's very easy to sound natural in your own native language, and very easy to sound unnatural in your non-native language.")

"It's very easy to sound natural in your own native language , and very easy to sound unnatural in your non-native language ."

In [12]:
def tokenize(chi_list, eng_list):
    
    source = []
    for sent in chi_list:
        words = jieba.cut(sent)
        words = [w for w in words if not is_space(w)]
        source.append(words)
        
    target = []
    for sent in eng_list:
        sent = sent.lower()
        sent = fill_space(sent)
        words = sent.split(' ')
        words = [w for w in words if not is_space(w)]
        target.append(words)
        
    return source, target

In [13]:
source, target = tokenize(chi_list, eng_list)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.410 seconds.
Prefix dict has been built successfully.


In [14]:
source[:5]

[['嗨', '。'], ['你好', '。'], ['你', '用', '跑', '的', '。'], ['住手', '！'], ['等等', '！']]

In [15]:
target[:5]

[['hi', '.'], ['hi', '.'], ['run', '.'], ['stop', '!'], ['wait', '!']]

In [16]:
arr = []
for sent_cn, sent_eng in zip(source, target):
    arr.append([len(sent_cn), len(sent_eng)])
arr = torch.tensor(arr, dtype=torch.float32)

In [17]:
arr.max(dim=0)

torch.return_types.max(
values=tensor([30., 34.]),
indices=tensor([29362, 29370]))

In [18]:
arr.mean(dim=0)

tensor([6.6265, 7.2073])

In [19]:
# num_steps = 10, is ok.

In [20]:
class Vocab(object):
    """vocabulary"""
    
    def __init__(self, tokens, min_freq=0, reserved_tokens=['<pad>', '<bos>', '<eos>']):
        tokens = [token for line in tokens for token in line]
        self._token_freqs = sorted(Counter(tokens).items(), key=lambda x: x[1], reverse=True)
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}
        
        for token, freq in self._token_freqs:
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) -1
                
    def __len__(self):
        return len(self.idx_to_token)
    
    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]
    
    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token(indices)
        return [self.idx_to_token[indice] for indice in indices]
    
    @property
    def unk(self):
        # index for the unknown token
        return 0
    
    @property
    def token_freqs(self):
        return self._token_freqs

In [21]:
src_vocab = Vocab(source, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>'])
tgt_vocab = Vocab(target, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>'])

In [22]:
src_vocab['<pad>'], src_vocab['<unk>']

(1, 0)

In [23]:
src_vocab[['<unk>', '<pad>', '<bos>', '<eos>', '。']]

[0, 1, 2, 3, 4]

In [24]:
def truncate_pad(line, num_steps, padding_token):
    """truncate or pad sequence"""
    if len(line) > num_steps: 
        # trancate
        return line[:num_steps]
    return line + [padding_token] * (num_steps - len(line)) # pad

In [25]:
def to_tensor(lines, vocab, num_steps):
    """transfrom text sequence of machine traslation into minibatches."""
    lines = [vocab[l] for l in lines]
    lines = [l + [vocab['<eos>']] for l in lines]
    arr = [ truncate_pad(l, num_steps, vocab['<pad>'])  for l in lines]
    arr = torch.tensor(arr)
    
    valid_len = (arr != vocab['<pad>']).type(torch.int32).sum(dim=1)
    
    return arr, valid_len

In [26]:
batch_size, num_steps = 128, 10

In [27]:
src_arr, src_valid_len = to_tensor(source, src_vocab, num_steps=num_steps)
tgt_arr, tgt_valid_len = to_tensor(target, tgt_vocab, num_steps=num_steps)

In [28]:
src_arr[:5], src_valid_len[:5]

(tensor([[2500,    4,    3,    1,    1,    1,    1,    1,    1,    1],
         [1210,    4,    3,    1,    1,    1,    1,    1,    1,    1],
         [   8,  112,  324,    6,    4,    3,    1,    1,    1,    1],
         [4522,   72,    3,    1,    1,    1,    1,    1,    1,    1],
         [2501,   72,    3,    1,    1,    1,    1,    1,    1,    1]]),
 tensor([3, 3, 6, 3, 3]))

In [29]:
def to_dataloader(arr_list, batch_size, is_train=True):
    dataset = TensorDataset(*arr_list)
    return DataLoader(dataset, batch_size, shuffle=is_train)

In [30]:
src_valid_len[:5], src_valid_len[-5:]

(tensor([3, 3, 6, 3, 3]), tensor([10, 10, 10, 10, 10]))

In [31]:
dataloader = to_dataloader((src_arr, src_valid_len, tgt_arr, tgt_valid_len), batch_size=batch_size)

In [32]:
for e in iter(dataloader):
    print(e[0].shape, e[1].shape, e[2].shape, e[3].shape)
    break

torch.Size([128, 10]) torch.Size([128]) torch.Size([128, 10]) torch.Size([128])


# Transformer

## Positional encoding

In [33]:
class PositionalEncoding(nn.Module):
    """位置编码"""
    def __init__(self, num_hiddens, dropout, max_len=1000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)
        # 创建一个足够长的P
        self.P = torch.zeros((1, max_len, num_hiddens))
        X = torch.arange(max_len, dtype=torch.float32).reshape(
            -1, 1) / torch.pow(10000, torch.arange(
            0, num_hiddens, 2, dtype=torch.float32) / num_hiddens)
        self.P[:, :, 0::2] = torch.sin(X)
        self.P[:, :, 1::2] = torch.cos(X)

    def forward(self, X):
        X = X + self.P[:, :X.shape[1], :].to(X.device)
        return self.dropout(X)

## Multi-head attention

## Add&Norm
residual connection and layer normalization

In [34]:
class AddNorm(nn.Module):
    """残差连接后进行层规范化"""
    def __init__(self, normalized_shape, dropout, **kwargs):
        super(AddNorm, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)
        self.ln = nn.LayerNorm(normalized_shape)

    def forward(self, X, Y):
        return self.ln(self.dropout(Y) + X)

## PositionwiseFFN
positionwise feed-forward network

In [35]:
class PositionWiseFFN(nn.Module):
    """基于位置的前馈网络"""
    def __init__(self, ffn_num_input, ffn_num_hiddens, ffn_num_outputs,
                 **kwargs):
        super(PositionWiseFFN, self).__init__(**kwargs)
        self.dense1 = nn.Linear(ffn_num_input, ffn_num_hiddens)
        self.relu = nn.ReLU()
        self.dense2 = nn.Linear(ffn_num_hiddens, ffn_num_outputs)

    def forward(self, X):
        return self.dense2(self.relu(self.dense1(X)))

## Encoder

In [48]:
class EncoderBlock(nn.Module):
    """Transformer编码器块"""
    def __init__(self, key_size, query_size, value_size, num_hiddens,
                 norm_shape, ffn_num_input, ffn_num_hiddens, num_heads,
                 dropout, use_bias=False, **kwargs):
        super(EncoderBlock, self).__init__(**kwargs)
        self.attention = common.MultiHeadAttention(
            key_size, query_size, value_size, num_hiddens, num_heads, dropout,
            use_bias)
        self.addnorm1 = AddNorm(norm_shape, dropout)
        self.ffn = PositionWiseFFN(
            ffn_num_input, ffn_num_hiddens, num_hiddens)
        self.addnorm2 = AddNorm(norm_shape, dropout)

    def forward(self, X, valid_lens):
        Y = self.addnorm1(X, self.attention(X, X, X, valid_lens))
        return self.addnorm2(Y, self.ffn(Y))

In [49]:
class TransformerEncoder(common.Encoder):
    """Transformer编码器"""
    def __init__(self, vocab_size, key_size, query_size, value_size,
                 num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens,
                 num_heads, num_layers, dropout, use_bias=False, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.embedding = nn.Embedding(vocab_size, num_hiddens)
        self.pos_encoding = PositionalEncoding(num_hiddens, dropout)
        self.blks = nn.Sequential()
        for i in range(num_layers):
            self.blks.add_module("block"+str(i),
                EncoderBlock(key_size, query_size, value_size, num_hiddens,
                             norm_shape, ffn_num_input, ffn_num_hiddens,
                             num_heads, dropout, use_bias))

    def forward(self, X, valid_lens, *args):
        # 因为位置编码值在-1和1之间，
        # 因此嵌入值乘以嵌入维度的平方根进行缩放，
        # 然后再与位置编码相加。
        X = self.pos_encoding(self.embedding(X) * math.sqrt(self.num_hiddens))
        self.attention_weights = [None] * len(self.blks)
        for i, blk in enumerate(self.blks):
            X = blk(X, valid_lens)
            self.attention_weights[
                i] = blk.attention.attention.attention_weights
        return X

## Decoder

In [38]:
class DecoderBlock(nn.Module):
    """解码器中第i个块"""
    def __init__(self, key_size, query_size, value_size, num_hiddens,
                 norm_shape, ffn_num_input, ffn_num_hiddens, num_heads,
                 dropout, i, **kwargs):
        super(DecoderBlock, self).__init__(**kwargs)
        self.i = i
        self.attention1 = common.MultiHeadAttention(
            key_size, query_size, value_size, num_hiddens, num_heads, dropout)
        self.addnorm1 = AddNorm(norm_shape, dropout)
        self.attention2 = common.MultiHeadAttention(
            key_size, query_size, value_size, num_hiddens, num_heads, dropout)
        self.addnorm2 = AddNorm(norm_shape, dropout)
        self.ffn = PositionWiseFFN(ffn_num_input, ffn_num_hiddens,
                                   num_hiddens)
        self.addnorm3 = AddNorm(norm_shape, dropout)

    def forward(self, X, state):
        enc_outputs, enc_valid_lens = state[0], state[1]
        # 训练阶段，输出序列的所有词元都在同一时间处理，
        # 因此state[2][self.i]初始化为None。
        # 预测阶段，输出序列是通过词元一个接着一个解码的，
        # 因此state[2][self.i]包含着直到当前时间步第i个块解码的输出表示
        if state[2][self.i] is None:
            key_values = X
        else:
            key_values = torch.cat((state[2][self.i], X), axis=1)
        state[2][self.i] = key_values
        if self.training:
            batch_size, num_steps, _ = X.shape
            # dec_valid_lens的开头:(batch_size,num_steps),
            # 其中每一行是[1,2,...,num_steps]
            dec_valid_lens = torch.arange(
                1, num_steps + 1, device=X.device).repeat(batch_size, 1)
        else:
            dec_valid_lens = None

        # 自注意力
        X2 = self.attention1(X, key_values, key_values, dec_valid_lens)
        Y = self.addnorm1(X, X2)
        # 编码器－解码器注意力。
        # enc_outputs的开头:(batch_size,num_steps,num_hiddens)
        Y2 = self.attention2(Y, enc_outputs, enc_outputs, enc_valid_lens)
        Z = self.addnorm2(Y, Y2)
        return self.addnorm3(Z, self.ffn(Z)), state

In [39]:
class TransformerDecoder(common.AttentionDecoder):
    def __init__(self, vocab_size, key_size, query_size, value_size,
                 num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens,
                 num_heads, num_layers, dropout, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, num_hiddens)
        self.pos_encoding = PositionalEncoding(num_hiddens, dropout)
        self.blks = nn.Sequential()
        for i in range(num_layers):
            self.blks.add_module("block"+str(i),
                DecoderBlock(key_size, query_size, value_size, num_hiddens,
                             norm_shape, ffn_num_input, ffn_num_hiddens,
                             num_heads, dropout, i))
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, enc_valid_lens, *args):
        return [enc_outputs, enc_valid_lens, [None] * self.num_layers]

    def forward(self, X, state):
        X = self.pos_encoding(self.embedding(X) * math.sqrt(self.num_hiddens))
        self._attention_weights = [[None] * len(self.blks) for _ in range (2)]
        for i, blk in enumerate(self.blks):
            X, state = blk(X, state)
            # 解码器自注意力权重
            self._attention_weights[0][
                i] = blk.attention1.attention.attention_weights
            # “编码器－解码器”自注意力权重
            self._attention_weights[1][
                i] = blk.attention2.attention.attention_weights
        return self.dense(X), state

    @property
    def attention_weights(self):
        return self._attention_weights

# Train

In [50]:
batch_size = 128
num_steps = 10
lr = 0.005
num_epochs = 100

num_hiddens = 32
num_layers = 2
dropout = 0.1
ffn_num_input, ffn_num_hiddens, num_heads = 32, 64, 4
key_size, query_size, value_size = 32, 32, 32
norm_shape = [32]

device = common.try_gpu()

In [51]:
encoder = TransformerEncoder(
    len(src_vocab), key_size, query_size, value_size, num_hiddens,
    norm_shape, ffn_num_input, ffn_num_hiddens, num_heads,
    num_layers, dropout)
decoder = TransformerDecoder(
    len(tgt_vocab), key_size, query_size, value_size, num_hiddens,
    norm_shape, ffn_num_input, ffn_num_hiddens, num_heads,
    num_layers, dropout)
net = common.EncoderDecoder(encoder, decoder)

In [52]:
def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device):
    """Train a model for sequence to sequence."""
    
    def xavier_init_weights(m):
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
        if type(m) == nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    nn.init.xavier_uniform_(m._parameters[param])
    net.apply(xavier_init_weights)
    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss = common.MaskedSoftmaxCELoss()
    net.train()
    for epoch in range(num_epochs):
        timer = common.Timer()
        metric = common.Accumulator(2)  # Sum of training loss, no. of tokens
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                               device=device).reshape(-1, 1)
            dec_input = torch.cat([bos, Y[:, :-1]], 1)  # Teacher forcing
            Y_hat, _ = net(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()  # Make the loss scalar for `backward`
            common.grad_clipping(net, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
        print(f'epoch {epoch} loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} '
              f'tokens/sec on {str(device)}')

In [53]:
train_seq2seq(net, dataloader, lr, num_epochs, tgt_vocab, device)

epoch 0 loss 0.493, 50184.6 tokens/sec on cuda:0
epoch 1 loss 0.396, 54745.0 tokens/sec on cuda:0
epoch 2 loss 0.364, 55667.9 tokens/sec on cuda:0
epoch 3 loss 0.344, 55739.7 tokens/sec on cuda:0
epoch 4 loss 0.328, 55888.8 tokens/sec on cuda:0
epoch 5 loss 0.315, 53381.9 tokens/sec on cuda:0
epoch 6 loss 0.303, 54444.9 tokens/sec on cuda:0
epoch 7 loss 0.292, 54526.9 tokens/sec on cuda:0
epoch 8 loss 0.282, 56100.1 tokens/sec on cuda:0
epoch 9 loss 0.273, 55739.1 tokens/sec on cuda:0
epoch 10 loss 0.265, 56169.6 tokens/sec on cuda:0
epoch 11 loss 0.258, 55569.7 tokens/sec on cuda:0
epoch 12 loss 0.252, 54848.6 tokens/sec on cuda:0
epoch 13 loss 0.246, 55099.9 tokens/sec on cuda:0
epoch 14 loss 0.241, 55345.0 tokens/sec on cuda:0
epoch 15 loss 0.236, 55521.7 tokens/sec on cuda:0
epoch 16 loss 0.231, 55169.6 tokens/sec on cuda:0
epoch 17 loss 0.227, 54947.8 tokens/sec on cuda:0
epoch 18 loss 0.223, 55385.4 tokens/sec on cuda:0
epoch 19 loss 0.220, 54577.6 tokens/sec on cuda:0
epoch 20 l

# Predict

In [54]:
def predict_seq2seq(net, src_sentence, src_vocab, tgt_vocab, num_steps,
                    device, save_attention_weights=False):
    """Predict for sequence to sequence."""

    # Set `net` to eval mode for inference
    net.eval()
    # print(src_sentence)
    words = list(jieba.cut(src_sentence))
    # print(words)
    
    src_tokens = src_vocab[ words ] + [src_vocab['<eos>']]
    enc_valid_len = torch.tensor([len(src_tokens)], device=device)
    src_tokens = truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
    
    # Add the batch axis
    enc_X = torch.unsqueeze(
        torch.tensor(src_tokens, dtype=torch.long, device=device), dim=0)
    enc_outputs = net.encoder(enc_X, enc_valid_len)
    dec_state = net.decoder.init_state(enc_outputs, enc_valid_len)
    # Add the batch axis
    dec_X = torch.unsqueeze(torch.tensor(
        [tgt_vocab['<bos>']], dtype=torch.long, device=device), dim=0)
    output_seq, attention_weight_seq = [], []
    for _ in range(num_steps):
        Y, dec_state = net.decoder(dec_X, dec_state)
        # We use the token with the highest prediction likelihood as the input
        # of the decoder at the next time step
        dec_X = Y.argmax(dim=2)
        pred = dec_X.squeeze(dim=0).type(torch.int32).item()
        # Save attention weights (to be covered later)
        if save_attention_weights:
            attention_weight_seq.append(net.decoder.attention_weights)
        # Once the end-of-sequence token is predicted, the generation of the
        # output sequence is complete
        if pred == tgt_vocab['<eos>']:
            break
        output_seq.append(pred)
    return ' '.join(tgt_vocab.to_tokens(output_seq)), attention_weight_seq

In [55]:
num = 100
for chn, eng in zip(chi_list[:num], eng_list[:num]):
    translation, dec_attention_weight_seq = predict_seq2seq(
        net, chn, src_vocab, tgt_vocab, num_steps, device, True)
    print(f'{chn} => {translation}')

嗨。 => hi .
你好。 => hi .
你用跑的。 => you ran into the bank .
住手！ => stop !
等等！ => wait !
等一下！ => wait !
开始！ => it began .
你好。 => hi .
我试试。 => i try it .
我赢了。 => i won .
不会吧。 => let's not be not .
干杯! => put a <unk> !
知道了没有？ => no idea occurred for ?
懂了吗？ => did you understand ?
你懂了吗？ => have you understand ?
他跑了。 => he ran away .
跳进来。 => the <unk> absorbs water .
我知道。 => i know .
我退出。 => i quit .
我不干了。 => i don't have done .
我没事。 => i'm okay .
我已经起来了。 => i've already .
听着。 => listen to .
不可能！ => it may not be done !
没门！ => put a <unk> .
你确定？ => are you sure ?
谢谢！ => thanks !
试试吧。 => try it .
我们来试试。 => we try to try .
为什么是我？ => why are i ?
去问汤姆。 => ask tom .
好棒！ => awesome !
冷静点。 => calm down .
公平点。 => be fair .
友善点。 => be friendly .
友好点。 => be friendly .
和气点。 => <unk> and <unk> are <unk> .
友善点。 => be friendly .
联系我。 => let's contact with me .
联系我们。 => let's contact us .
进来。 => come in .
找到汤姆。 => tom found tom .
滚出去！ => put a <unk> .
出去！ => let's go out !
走开！ => leave away !
滚！ => get lost !

In [56]:
num = -20
for chn, eng in zip(chi_list[num:], eng_list[num:]):
    translation, dec_attention_weight_seq = predict_seq2seq(
        net, chn, src_vocab, tgt_vocab, num_steps, device, True)
    print(f'{chn} => {translation}')

汤姆试过还回泳衣来换成更大一号的，但是员工告诉他那是不被允许的。 => tom tried to show me a bigger size of the
在十九世纪三十年代的大萧条时期，许多富人在股市崩盘中失去了一切。 => many older sister amassed a lot of the wealthy people
我觉得当汤姆发现他买来的画是赝品的时候，他会很生气。 => i think tom found what he was going to be
为了不被洪水冲走，有的人紧紧地抱着树干长达数个钟头。 => a <unk> <unk> <unk> <unk> <unk> <unk> <unk> to the
这个工人本来应该在中午十二点到达, 但他被交通堵塞困住了几个小时。 => the workers been twelve months in the crime .
我父母通常用法语对话，即使我母亲的母语是英语。 => my parents usually talks in french , but i generally
就像马克·诺弗勒早期演唱的歌曲《金钱无用》一样，绝大多数的人依然高呼赞成“金钱无用论”。 => the <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>
假如你在老师讲课的时候再集中一点去听讲的话，你应该就能弄明白了。 => a little talks to the teacher caused the teacher on
当汤姆开着他破旧的雷泽车来接女儿放学时，他的女儿假装不认识他。 => tom took a <unk> call him when he left her
许多自然环境保护主义者担心持续屠杀鲸鱼正推动这些动物走向灭绝。 => many people beginning of whales is a lot of whales
去年在菲律宾，地震和海啸造成了超过6000人的死亡。 => the waves crashed with the mountain in the waves crashed
“又是汤姆的电话？” “嗯。最近他每天晚上都会打过来。当时就不该给他我的号码的。” => "is the phone ?" "tom was 

In [57]:
sentence = "我要去上学"
translation, dec_attention_weight_seq = predict_seq2seq(net, sentence, src_vocab, tgt_vocab, num_steps, device, True)
print(f'{sentence}=>{translation}')

我要去上学=>i want to go to school .


In [58]:
sentence = "多可爱啊！"
translation, dec_attention_weight_seq = predict_seq2seq(net, sentence, src_vocab, tgt_vocab, num_steps, device, True)
print(f'{sentence}=>{translation}')

多可爱啊！=>how cute !


In [59]:
sentence = "抓住他"
translation, dec_attention_weight_seq = predict_seq2seq(net, sentence, src_vocab, tgt_vocab, num_steps, device, True)
print(f'{sentence}=>{translation}')

抓住他=>grab him .


In [60]:
sentence = "我是一名律师"
translation, dec_attention_weight_seq = predict_seq2seq(net, sentence, src_vocab, tgt_vocab, num_steps, device, True)
print(f'{sentence}=>{translation}')

我是一名律师=>i'm a lawyer .


In [61]:
sentence = "我是一名程序员"
translation, dec_attention_weight_seq = predict_seq2seq(net, sentence, src_vocab, tgt_vocab, num_steps, device, True)
print(f'{sentence}=>{translation}')

我是一名程序员=>i'm a computer programmer .


In [62]:
sentence = "我是一名飞行员"
translation, dec_attention_weight_seq = predict_seq2seq(net, sentence, src_vocab, tgt_vocab, num_steps, device, True)
print(f'{sentence}=>{translation}')

我是一名飞行员=>i'm a pilot .


In [63]:
sentence = "我在踢足球"
translation, dec_attention_weight_seq = predict_seq2seq(net, sentence, src_vocab, tgt_vocab, num_steps, device, True)
print(f'{sentence}=>{translation}')

我在踢足球=>i play soccer .


In [64]:
sentence = "我在飞"
translation, dec_attention_weight_seq = predict_seq2seq(net, sentence, src_vocab, tgt_vocab, num_steps, device, True)
print(f'{sentence}=>{translation}')

我在飞=>i'm flying kites .
