<a href="https://colab.research.google.com/github/yananma/5_programs_per_day/blob/master/0510.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 10.12 机器翻译

In [0]:
# Install the mxnet and d2lzh packages with pip (no versions are pinned here).
!pip install mxnet d2lzh 

In [0]:
import collections 
import io 
import math 
from mxnet import autograd, gluon, init, nd 
from mxnet.contrib import text 
from mxnet.gluon import data as gdata, loss as gloss, nn, rnn 

PAD, BOS, EOS = '<pad>', '<bos>', '<eos>'

In [0]:
def process_one_seq(seq_tokens, all_tokens, all_seqs, max_seq_len):
    """Register one tokenized sentence and pad it to ``max_seq_len`` entries.

    The raw tokens are first added to ``all_tokens``. Then ``seq_tokens`` is
    extended IN PLACE with a single EOS followed by as many PAD tokens as
    needed to reach ``max_seq_len``, and that same list object is appended to
    ``all_seqs``. When the sentence already has ``max_seq_len - 1`` tokens or
    more, no PAD is added (a non-positive repeat count yields an empty list).
    """
    all_tokens.extend(seq_tokens)
    # Compute the pad count before EOS is appended so it matches the raw length.
    pad_count = max_seq_len - len(seq_tokens) - 1
    seq_tokens.append(EOS)
    seq_tokens.extend([PAD] * pad_count)
    all_seqs.append(seq_tokens)

def build_data(all_tokens, all_seqs):
    """Build a vocabulary from ``all_tokens`` and index every sequence with it.

    Returns a ``(vocab, data)`` pair where ``vocab`` is a
    ``text.vocab.Vocabulary`` created from the token counts with PAD, BOS and
    EOS as reserved tokens, and ``data`` is an NDArray built from the index
    lists of ``all_seqs``.
    """
    token_counts = collections.Counter(all_tokens)
    vocab = text.vocab.Vocabulary(token_counts, reserved_tokens=[PAD, BOS, EOS])
    indexed_seqs = list(map(vocab.to_indices, all_seqs))
    return vocab, nd.array(indexed_seqs)

In [5]:
!git clone https://www.github.com/d2l-ai/d2l-zh.git

Cloning into 'd2l-zh'...
remote: Enumerating objects: 17, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 15702 (delta 9), reused 8 (delta 4), pack-reused 15685[K
Receiving objects: 100% (15702/15702), 159.56 MiB | 32.84 MiB/s, done.
Resolving deltas: 100% (11132/11132), done.


In [0]:
!mkdir ../data 

In [0]:
# Copy the small French-English corpus from the cloned repository into ../data/,
# which is the location read_data() opens.
!cp ./d2l-zh/data/fr-en-small.txt ../data/

In [8]:
# List ../data/ to confirm the corpus file was copied.
!ls ../data/

fr-en-small.txt


In [0]:
def read_data(max_seq_len, data_path='../data/fr-en-small.txt'):
    """Read a tab-separated parallel corpus and build vocabularies and a dataset.

    Each non-blank line of the file must contain a source sentence and a
    target sentence separated by a tab; tokens are separated by single spaces.

    Args:
        max_seq_len: Length of every stored sequence, EOS included. Pairs in
            which either side has more than ``max_seq_len - 1`` tokens are
            skipped so that EOS always fits.
        data_path: Location of the corpus file. Defaults to the path used by
            this notebook.

    Returns:
        ``(in_vocab, out_vocab, dataset)`` where ``dataset`` is a
        ``gdata.ArrayDataset`` pairing the indexed source and target sequences.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
    """
    in_tokens, out_tokens, in_seqs, out_seqs = [], [], [], []
    # Decode explicitly as UTF-8: without an encoding argument the platform's
    # locale default is used, which is not UTF-8 everywhere.
    with io.open(data_path, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip()
            if not line:
                # Tolerate blank lines (e.g. a trailing empty line at EOF).
                continue
            in_seq, out_seq = line.split('\t')
            in_seq_tokens, out_seq_tokens = in_seq.split(' '), out_seq.split(' ')
            if max(len(in_seq_tokens), len(out_seq_tokens)) > max_seq_len - 1:
                # Adding EOS would push the sequence past max_seq_len.
                continue
            process_one_seq(in_seq_tokens, in_tokens, in_seqs, max_seq_len)
            process_one_seq(out_seq_tokens, out_tokens, out_seqs, max_seq_len)
    in_vocab, in_data = build_data(in_tokens, in_seqs)
    out_vocab, out_data = build_data(out_tokens, out_seqs)
    return in_vocab, out_vocab, gdata.ArrayDataset(in_data, out_data)


In [10]:
# Every stored sequence has 7 positions (EOS included); read_data() drops pairs
# that would not fit. The names bound here are read by later cells.
max_seq_len = 7 
in_vocab, out_vocab, dataset = read_data(max_seq_len)
# Show the first (source, target) pair of index sequences.
dataset[0]

(
 [ 6.  5. 46.  4.  3.  1.  1.]
 <NDArray 7 @cpu(0)>, 
 [ 9.  5. 28.  4.  3.  1.  1.]
 <NDArray 7 @cpu(0)>)

In [0]:
class Encoder(nn.Block):
    """Sequence encoder: a token embedding followed by a multi-layer GRU."""

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, drop_prob=0, **kwargs):
        """Create the embedding table and the GRU.

        ``drop_prob`` is forwarded to the GRU as its ``dropout`` argument;
        any extra keyword arguments go to ``nn.Block``.
        """
        super(Encoder, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = rnn.GRU(num_hiddens, num_layers, dropout=drop_prob)

    def forward(self, inputs, state):
        """Embed ``inputs``, swap the first two axes and run the GRU.

        Returns whatever the GRU returns for ``(embedded, state)``; the demo
        cell in this notebook unpacks it as ``(output, state)``.
        """
        embedded = self.embedding(inputs)
        # The GRU is fed with axes 0 and 1 exchanged relative to the input batch.
        swapped = embedded.swapaxes(0, 1)
        return self.rnn(swapped, state)

    def begin_state(self, *args, **kwargs):
        """Delegate initial-state creation to the underlying GRU."""
        return self.rnn.begin_state(*args, **kwargs)

In [12]:
# Smoke-test the encoder on an all-zero input of shape (4, 7) with a
# 2-layer GRU of 16 hidden units.
encoder = Encoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2)
encoder.initialize()
output, state = encoder(nd.zeros((4, 7)), encoder.begin_state(batch_size=4))
# Display the shape of the output and of the first state array.
output.shape, state[0].shape 

((7, 4, 16), (2, 4, 16))

In [13]:
# Check what Dense(flatten=False) does to a 3-D input of shape (3, 5, 7):
# the recorded output shows only the last axis changing (7 -> 2).
dense = nn.Dense(2, flatten=False)
dense.initialize()
dense(nd.zeros((3, 5, 7))).shape 

(3, 5, 2)

In [0]:
def attention_model(attention_size):
    """Build the two-layer scoring network used by the attention mechanism.

    The network is a ``nn.Sequential`` of a bias-free tanh Dense layer with
    ``attention_size`` units followed by a bias-free Dense layer with a single
    unit. Both layers use ``flatten=False``.
    """
    net = nn.Sequential()
    hidden_layer = nn.Dense(attention_size, activation='tanh', use_bias=False, flatten=False)
    score_layer = nn.Dense(1, use_bias=False, flatten=False)
    net.add(hidden_layer, score_layer)
    return net

In [0]:
def attention_forward(model, enc_states, dec_state):
    """Compute the attention-weighted sum of the encoder states.

    ``dec_state`` is repeated along a new leading axis to match
    ``enc_states.shape[0]``, concatenated with ``enc_states`` on axis 2 and
    scored by ``model``. The scores are softmax-normalized over axis 0 and
    used to weight ``enc_states``, which is then summed over axis 0.

    In the notebook's demo, ``enc_states`` is (seq_len, batch, hidden),
    ``dec_state`` is (batch, hidden) and the result is (batch, hidden).
    """
    num_steps = enc_states.shape[0]
    # Repeat the decoder state once per encoder position.
    tiled_dec_state = nd.broadcast_axis(dec_state.expand_dims(0), axis=0, size=num_steps)
    scores = model(nd.concat(enc_states, tiled_dec_state, dim=2))
    weights = nd.softmax(scores, axis=0)
    context = (weights * enc_states).sum(axis=0)
    return context

In [19]:
# Smoke-test attention_forward with all-zero states: 10 encoder positions,
# batch of 4, 8 hidden units, and an attention layer of size 10.
seq_len, batch_size, num_hiddens = 10, 4, 8
model = attention_model(10)
model.initialize()
enc_states = nd.zeros((seq_len, batch_size, num_hiddens))
dec_state = nd.zeros((batch_size, num_hiddens))
# Display the shape of the resulting context (recorded output: (4, 8)).
attention_forward(model, enc_states, dec_state).shape 

(4, 8)

In [0]:
class Decoder(nn.Block):
    """Attention decoder that processes one step per ``forward`` call."""

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, attention_size, drop_prob=0, **kwargs):
        """Create the embedding, the attention scorer, the GRU and the output layer.

        ``drop_prob`` is forwarded to the GRU as its ``dropout`` argument;
        any extra keyword arguments go to ``nn.Block``.
        """
        super(Decoder, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.attention = attention_model(attention_size)
        self.rnn = rnn.GRU(num_hiddens, num_layers, dropout=drop_prob)
        self.out = nn.Dense(vocab_size, flatten=False)

    def forward(self, cur_input, state, enc_states):
        """Run a single decoding step and return ``(output, state)``.

        The attention query is ``state[0][-1]``, i.e. the last entry along the
        first axis of the first state array.
        """
        query = state[0][-1]
        context = attention_forward(self.attention, enc_states, query)
        embedded = self.embedding(cur_input)
        # Join embedding and context on axis 1, then add a leading axis for the GRU.
        rnn_input = nd.concat(embedded, context, dim=1).expand_dims(0)
        output, state = self.rnn(rnn_input, state)
        # Drop the leading axis again after projecting to vocabulary size.
        output = self.out(output).squeeze(axis=0)
        return output, state

    def begin_state(self, enc_state):
        """Use the encoder's final state directly as the decoder's initial state."""
        return enc_state

In [0]:
def batch_loss(encoder, decoder, X, Y, loss):
    """Compute the masked, per-token average loss of one mini-batch.

    The decoder is fed the label of the previous step at every step, starting
    from BOS. A per-sample mask starts at 1 and becomes 0 for all steps after
    the one whose label is EOS, so those later positions contribute nothing to
    the loss or to the token count. Reads the module-level ``out_vocab``.
    """
    batch_size = X.shape[0]
    bos_idx = out_vocab.token_to_idx[BOS]
    eos_idx = out_vocab.token_to_idx[EOS]

    enc_outputs, enc_state = encoder(X, encoder.begin_state(batch_size=batch_size))
    dec_state = decoder.begin_state(enc_state)
    dec_input = nd.array([bos_idx] * batch_size)

    mask = nd.ones(shape=(batch_size,))
    num_not_pad_tokens = 0
    total = nd.array([0])
    for y in Y.T:
        dec_output, dec_state = decoder(dec_input, dec_state, enc_outputs)
        step_loss = loss(dec_output, y)
        total = total + (mask * step_loss).sum()
        num_not_pad_tokens += mask.sum().asscalar()
        # The EOS step itself is still counted above; only later steps are masked.
        mask = mask * (y != eos_idx)
        dec_input = y
    return total / num_not_pad_tokens

In [0]:
def train(encoder, decoder, dataset, lr, batch_size, num_epochs):
    """Train the encoder and decoder jointly with Adam.

    Both networks are (re)initialized with Xavier initialization, each gets
    its own ``gluon.Trainer``, and the mean batch loss is printed every 10th
    epoch.
    """
    for net in (encoder, decoder):
        net.initialize(init.Xavier(), force_reinit=True)
    enc_trainer = gluon.Trainer(encoder.collect_params(), 'adam', {'learning_rate': lr})
    dec_trainer = gluon.Trainer(decoder.collect_params(), 'adam', {'learning_rate': lr})
    loss = gloss.SoftmaxCrossEntropyLoss()
    data_iter = gdata.DataLoader(dataset, batch_size, shuffle=True)

    for epoch in range(1, num_epochs + 1):
        l_sum = 0.0
        for X, Y in data_iter:
            with autograd.record():
                l = batch_loss(encoder, decoder, X, Y, loss)
            l.backward()
            # batch_loss already returns an average, hence a step size of 1.
            for trainer in (enc_trainer, dec_trainer):
                trainer.step(1)
            l_sum += l.asscalar()
        if epoch % 10 == 0:
            print("epoch %d, loss %.3f" % (epoch, l_sum / len(data_iter)))

In [24]:
# Model sizes: embedding width, GRU hidden width and number of GRU layers.
embed_size, num_hiddens, num_layers = 64, 64, 2
# Attention layer width, GRU dropout, Adam learning rate, mini-batch size, epochs.
attention_size, drop_prob, lr, batch_size, num_epochs = 10, 0.5, 0.01, 2, 50 
# These `encoder` / `decoder` names are read as globals by score().
encoder = Encoder(len(in_vocab), embed_size, num_hiddens, num_layers, drop_prob)
decoder = Decoder(len(out_vocab), embed_size, num_hiddens, num_layers, attention_size, drop_prob)
train(encoder, decoder, dataset, lr, batch_size, num_epochs)

epoch 10, loss 0.426
epoch 20, loss 0.244
epoch 30, loss 0.143
epoch 40, loss 0.103
epoch 50, loss 0.053


In [0]:
def translate(encoder, decoder, input_seq, max_seq_len):
    """Greedily decode a translation for a space-separated input sentence.

    The input is split on single spaces, terminated with EOS and padded with
    PAD up to ``max_seq_len``. Decoding starts from BOS, picks the arg-max
    token at every step and stops at EOS or after ``max_seq_len`` steps.
    Reads the module-level ``in_vocab`` and ``out_vocab``.

    Returns:
        The list of predicted tokens, without EOS.
    """
    in_tokens = input_seq.split(' ')
    padding = [PAD] * (max_seq_len - len(in_tokens) - 1)
    in_tokens = in_tokens + [EOS] + padding

    enc_input = nd.array([in_vocab.to_indices(in_tokens)])
    enc_output, enc_state = encoder(enc_input, encoder.begin_state(batch_size=1))

    dec_input = nd.array([out_vocab.token_to_idx[BOS]])
    dec_state = decoder.begin_state(enc_state)
    output_tokens = []
    for _ in range(max_seq_len):
        dec_output, dec_state = decoder(dec_input, dec_state, enc_output)
        pred = dec_output.argmax(axis=1)
        pred_token = out_vocab.idx_to_token[int(pred.asscalar())]
        if pred_token == EOS:
            break
        output_tokens.append(pred_token)
        # Feed the prediction back in as the next decoder input.
        dec_input = pred
    return output_tokens

In [32]:
# Translate a short French sentence with the trained model.
# ('ils', not 'ila': the same sentence is scored with score() further down.)
input_seq = 'ils regardent .'
translate(encoder, decoder, input_seq, max_seq_len)

['they', 'are', 'watching', '.']

In [0]:
def bleu(pred_tokens, label_tokens, k):
    """Compute a BLEU-style score of ``pred_tokens`` against ``label_tokens``.

    The score is a brevity penalty ``exp(min(0, 1 - len_label / len_pred))``
    multiplied, for every n from 1 to ``k``, by the clipped n-gram precision
    raised to the power ``0.5 ** n``.

    Args:
        pred_tokens: Predicted token sequence.
        label_tokens: Reference token sequence.
        k: Largest n-gram order to include.

    Returns:
        The score as a float. It is ``0.0`` when the prediction is empty or
        too short to contain any n-gram of some order ``n <= k``.
    """
    len_pred, len_label = len(pred_tokens), len(label_tokens)
    if len_pred == 0:
        # An empty prediction matches nothing; also avoids dividing by zero.
        return 0.0
    score = math.exp(min(0, 1 - len_label / len_pred))
    for n in range(1, k + 1):
        num_pred_ngrams = len_pred - n + 1
        if num_pred_ngrams <= 0:
            # The prediction has no n-grams of this order: precision is zero.
            return 0.0
        # Key n-grams by tuple so token boundaries are preserved; joining the
        # tokens into one string would make ('ab', 'c') equal to ('a', 'bc').
        label_subs = collections.Counter(
            tuple(label_tokens[i: i + n]) for i in range(len_label - n + 1))
        num_matches = 0
        for i in range(num_pred_ngrams):
            ngram = tuple(pred_tokens[i: i + n])
            if label_subs[ngram] > 0:
                num_matches += 1
                # Clip: each reference n-gram can be matched only once.
                label_subs[ngram] -= 1
        score *= math.pow(num_matches / num_pred_ngrams, math.pow(0.5, n))
    return score

In [0]:
def score(input_seq, label_seq, k):
    """Translate ``input_seq`` and print its BLEU score against ``label_seq``.

    Uses the module-level ``encoder``, ``decoder`` and ``max_seq_len``;
    ``label_seq`` is split on single spaces to obtain the reference tokens.
    """
    pred_tokens = translate(encoder, decoder, input_seq, max_seq_len)
    label_tokens = label_seq.split(' ')
    bleu_value = bleu(pred_tokens, label_tokens, k)
    prediction = ' '.join(pred_tokens)
    print('bleu %.3f, predict: %s' % (bleu_value, prediction))

In [41]:
# Score the model's translation against the reference using up to 2-grams.
score('ils regardent .', 'they are watching .', k=2)

bleu 1.000, predict: they are watching .


In [42]:
# Second example; in the recorded run the prediction differs from the reference
# in one word, which lowers the score below 1.
score('ils sont canadiens .', 'they are canadian .', k=2)

bleu 0.658, predict: they are russian .
