In [None]:
#!/usr/bin/env python

import argparse

from nltk.translate import bleu_score
import numpy
#import progressbar
import six
import sys

import chainer
import math
from chainer import cuda
import chainer.functions as F
import chainer.links as L
from chainer import training
from chainer.training import extensions

# Reserved word ids; load_vocabulary assigns these same ids to '<UNK>' and '<EOS>'.
UNK = 0  # substituted by load_data for words missing from the vocabulary
EOS = 1  # delimiter Seq2seq puts before decoder inputs and after decoder targets


In [2]:
def sequence_embed(embed, xs):
    """Embed a batch of variable-length sequences with a single ``embed`` call.

    The sequences are concatenated along axis 0, passed through ``embed``
    once, and the result is split back at the original sequence boundaries.

    Args:
        embed: Callable applied to the concatenated sequences (the callers in
            this file pass ``L.EmbedID`` links).
        xs: List of sequences; each must support ``len()`` and ``F.concat``.

    Returns:
        The pieces produced by ``F.split_axis``, one per input sequence and in
        the same order as ``xs``.
    """
    x_len = [len(x) for x in xs]
    # Split points are the cumulative lengths of every sequence but the last.
    x_section = numpy.cumsum(x_len[:-1])
    ex = embed(F.concat(xs, axis=0))
    exs = F.split_axis(ex, x_section, 0)
    return exs


In [3]:
def convert(batch, device):
    """Turn a list of (source, target) pairs into the updater's keyword batch.

    Args:
        batch: List of two-element pairs ``(source, target)``.
        device: ``None`` leaves the arrays untouched; a negative value sends
            each array through ``chainer.dataset.to_device`` one by one; any
            other value concatenates the arrays, transfers them in a single
            call and splits them again on the device.

    Returns:
        dict with the keys ``'xs'`` (all sources) and ``'ys'`` (all targets).
    """
    def _transfer(arrays):
        if device is None:
            return arrays
        if device < 0:
            return [chainer.dataset.to_device(device, a) for a in arrays]

        # One transfer for the whole batch instead of one per array.
        xp = cuda.cupy.get_array_module(*arrays)
        merged = xp.concatenate(arrays, axis=0)
        lengths = [len(a) for a in arrays[:-1]]
        boundaries = numpy.cumsum(lengths, dtype=numpy.int32)
        merged_dev = chainer.dataset.to_device(device, merged)
        return cuda.cupy.split(merged_dev, boundaries)

    sources = [src for src, _ in batch]
    targets = [tgt for _, tgt in batch]
    return {'xs': _transfer(sources), 'ys': _transfer(targets)}


In [4]:
class CalculateBleu(chainer.training.Extension):
    """Trainer extension that reports a corpus-level BLEU score.

    Every source sentence in ``test_data`` is translated with
    ``model.translate`` and the smoothed corpus BLEU against the paired
    target sentences is reported under ``key``.

    Args:
        model: Object providing ``translate(sources, max_length)``.
        test_data: Sliceable sequence of ``(source, target)`` array pairs.
        key: Name under which the score is passed to ``chainer.report``.
        batch: Number of sentence pairs translated per ``translate`` call.
        device: Device id handed to ``chainer.dataset.to_device``.
        max_length: Length limit forwarded to ``model.translate``.
    """

    trigger = 1, 'epoch'
    priority = chainer.training.PRIORITY_WRITER

    def __init__(self, model, test_data, key, batch=100, device=-1, max_length=100):
        self.model = model
        self.test_data = test_data
        self.key = key
        self.batch = batch
        self.device = device
        self.max_length = max_length

    def __call__(self, trainer):
        references, hypotheses = [], []
        with chainer.no_backprop_mode():
            for start in range(0, len(self.test_data), self.batch):
                chunk = self.test_data[start:start + self.batch]
                sources, targets = zip(*chunk)
                # corpus_bleu expects a list of references per hypothesis;
                # each sentence here has exactly one.
                for target in targets:
                    references.append([target.tolist()])

                device_sources = [
                    chainer.dataset.to_device(self.device, src)
                    for src in sources]
                translations = self.model.translate(
                    device_sources, self.max_length)
                hypotheses.extend(hyp.tolist() for hyp in translations)

        smoother = bleu_score.SmoothingFunction().method1
        bleu = bleu_score.corpus_bleu(
            references, hypotheses, smoothing_function=smoother)
        chainer.report({self.key: bleu})


In [5]:
def count_lines(path):
    """Return the number of lines in the text file at ``path``."""
    total = 0
    with open(path) as handle:
        for _ in handle:
            total += 1
    return total


def load_vocabulary(path):
    """Read a one-word-per-line vocabulary file into a word -> id dict.

    The word on the first line gets id 2, the next one id 3 and so on; ids 0
    and 1 are then assigned to ``'<UNK>'`` and ``'<EOS>'``.

    Args:
        path: Path of the vocabulary file.

    Returns:
        dict mapping each stripped line to its integer id.
    """
    vocabulary = {}
    with open(path) as vocab_file:
        # Start counting at 2 because 0 and 1 are reserved for UNK and EOS.
        for word_id, line in enumerate(vocab_file, 2):
            vocabulary[line.strip()] = word_id
    vocabulary.update([('<UNK>', 0), ('<EOS>', 1)])
    return vocabulary


def load_data(vocabulary, path):
    """Read a whitespace-tokenised text file as a list of word-id arrays.

    Each line is stripped and split on whitespace; every word is looked up in
    ``vocabulary`` and words that are missing are replaced by ``UNK``.

    Args:
        vocabulary: dict mapping words to integer ids.
        path: Path of the text file, one sentence per line.

    Returns:
        list with one ``numpy.int32`` array per line of the file (an empty
        array for a blank line).
    """
    print('loading...: %s' % path)
    data = []
    # The file is read once; no separate line-counting pass is needed since
    # nothing here consumes the line count.
    with open(path) as f:
        for line in f:
            words = line.strip().split()
            array = numpy.array([vocabulary.get(w, UNK) for w in words], numpy.int32)
            data.append(array)
    return data

def calculate_unknown_ratio(data):
    """Return the fraction of tokens in ``data`` that equal ``UNK``.

    Args:
        data: Iterable of arrays of word ids; it is traversed twice, so it
            must be re-iterable (e.g. a list).

    Returns:
        Number of ``UNK`` entries divided by the total number of entries, or
        ``0.0`` when ``data`` contains no tokens at all.
    """
    unknown = sum((s == UNK).sum() for s in data)
    total = sum(s.size for s in data)
    if total == 0:
        # Nothing to count: report 0 instead of dividing by zero.
        return 0.0
    return unknown / total


In [6]:
# Training corpus and vocabulary files (one sentence / one word per line).
SOURCE = "./dataset/train.en"
TARGET = "./dataset/train.jp"
SOURCE_VOCAB = "./dataset/vocab.en"
TARGET_VOCAB = "./dataset/vocab.jp"
# Validation corpus; leave either one empty to skip the validation extensions.
validation_source = "./dataset/test.en"
validation_target = "./dataset/test.jp"

# Training hyper-parameters.
batchsize = 1
epoch = 100
gpu = -1  # negative: stay on the CPU (the model is moved only when gpu >= 0)
resume = ''
unit = 500
layer = 3

# Sentence-length windows (in tokens) for the training pairs.
min_source_sentence = 0
max_source_sentence = 50
min_target_sentence = 0
# This was a second assignment to min_target_sentence, which left the lower
# bound at 50 and the upper bound undefined.
max_target_sentence = 50

# Reporting cadence (in iterations) and output directory.
log_interval = 2
validation_interval = 4000
out = "result"


In [7]:
# Vocabularies map word -> id; the corpora become lists of int32 id arrays.
source_ids = load_vocabulary(SOURCE_VOCAB)
target_ids = load_vocabulary(TARGET_VOCAB)
train_source = load_data(source_ids, SOURCE)
train_target = load_data(target_ids, TARGET)
assert len(train_source) == len(train_target)

# Keep only the pairs whose two sentences both fit the length window.
# NOTE(review): the target sentence is checked against the *source* bounds;
# presumably dedicated target bounds were intended -- confirm before changing.
train_data = [
    pair for pair in six.moves.zip(train_source, train_target)
    if all(min_source_sentence <= len(sentence) <= max_source_sentence
           for sentence in pair)
]
train_source_unknown = calculate_unknown_ratio(
    [pair[0] for pair in train_data])
train_target_unknown = calculate_unknown_ratio(
    [pair[1] for pair in train_data])

for template, value in (
        ('Source vocabulary size: %d', len(source_ids)),
        ('Target vocabulary size: %d', len(target_ids)),
        ('Train data size: %d', len(train_data)),
        ('Train source unknown ratio: %.2f%%', train_source_unknown * 100),
        ('Train target unknown ratio: %.2f%%', train_target_unknown * 100),
):
    print(template % value)


loading...: ./dataset/train.en
loading...: ./dataset/train.jp
Source vocabulary size: 40002
Target vocabulary size: 40002
Train data size: 427910
Train source unknown ratio: 2.17%
Train target unknown ratio: 1.35%


In [8]:
# Reverse lookup tables (word id -> word), used when printing sample translations.
target_words = dict((word_id, word) for word, word_id in target_ids.items())
source_words = dict((word_id, word) for word, word_id in source_ids.items())

In [9]:
class Seq2seq(chainer.Chain):
    """LSTM encoder-decoder that can condition on the previous source sentence.

    The encoder's final hidden state is remembered between calls. On the next
    call it is concatenated with the new encoder state and projected back to
    ``n_units`` by ``connecter`` before being handed to the decoder, unless
    the new input starts with ``CONTEXT_RESET_ID``.

    Args:
        n_layers: Number of stacked LSTM layers in the encoder and decoder.
        n_source_vocab: Size of the source vocabulary.
        n_target_vocab: Size of the target vocabulary.
        n_units: Width of the embeddings and of the LSTM states.
    """

    # Source token id that suppresses the connection to the previous encoder
    # state when it is the first token of the first sentence in the batch.
    # NOTE(review): presumably a start-of-context marker in the dataset --
    # confirm against the vocabulary file.
    CONTEXT_RESET_ID = 6

    def __init__(self, n_layers, n_source_vocab, n_target_vocab, n_units):
        super(Seq2seq, self).__init__()
        with self.init_scope():
            self.embed_x = L.EmbedID(n_source_vocab, n_units)
            self.embed_y = L.EmbedID(n_target_vocab, n_units)
            self.encoder = L.NStepLSTM(n_layers, n_units, n_units, 0.1)
            self.decoder = L.NStepLSTM(n_layers, n_units, n_units, 0.1)
            # Projects [current state ; previous state] (2 * n_units features)
            # back to n_units. The size is given explicitly so the parameters
            # exist even when the very first forward pass has no previous
            # state to connect to.
            self.connecter = L.Linear(2 * n_units, n_units)
            self.W = L.Linear(n_units, n_target_vocab)

        self.n_layers = n_layers
        self.n_units = n_units
        # Encoder hidden state of the previous call, kept as a raw array so no
        # computational graph is retained: one slot for training (__call__)
        # and one for translate().
        self.prev_hx = None
        self.prev_h = None

    def _mix_previous(self, h, prev, xs):
        """Combine the encoder state ``h`` with the stored state ``prev``.

        Args:
            h: Current encoder hidden state, shape (n_layers, batch, n_units).
            prev: Hidden-state array saved by the previous call, or ``None``.
            xs: The reversed source sentences of the current batch.

        Returns:
            ``h`` unchanged when there is nothing to connect to, otherwise the
            projected combination with the same shape as ``h``.
        """
        if prev is None or prev.shape != h.shape:
            # First call, or the batch size changed: no state to pair up with.
            return h
        # xs is reversed, so xs[0][-1] is the first token of the first sentence.
        if not (xs[0][-1] != self.CONTEXT_RESET_ID):
            return h

        n_layers, batch, n_units = h.shape
        # Join on the feature axis so every (layer, sample) row holds its own
        # current and previous state; for batch == 1 this gives the same rows
        # as joining on the batch axis and flattening.
        joined = F.concat([h, prev], axis=2)
        joined = F.reshape(joined, (n_layers * batch, 2 * n_units))
        mixed = self.connecter(joined)
        return F.reshape(mixed, (n_layers, batch, n_units))

    def __call__(self, xs, ys):
        """Compute the training loss for one batch.

        Args:
            xs: List of source word-id arrays.
            ys: List of target word-id arrays.

        Returns:
            The summed softmax cross entropy over all target tokens divided by
            the batch size. ``loss`` and ``perp`` are also reported through
            ``chainer.report``.
        """
        xs = [x[::-1] for x in xs]  # reverse input: ["i", "am", "taro"] -> ["taro", "am", "i"]

        eos = self.xp.array([EOS], numpy.int32)
        ys_in = [F.concat([eos, y], axis=0) for y in ys]  # [eos, y1, y2, ...]
        ys_out = [F.concat([y, eos], axis=0) for y in ys]  # [y1, y2, ..., eos]

        exs = sequence_embed(self.embed_x, xs)
        eys = sequence_embed(self.embed_y, ys_in)

        batch = len(xs)

        hx, cx, _ = self.encoder(None, None, exs)

        # Read the state left by the previous call *before* overwriting it;
        # otherwise the "previous" state would always be the current one.
        prev_hx = self.prev_hx
        # Only the stored copy is detached, so gradients still reach the
        # encoder through the current state.
        self.prev_hx = hx.data
        hx = self._mix_previous(hx, prev_hx, xs)

        _, _, os = self.decoder(hx, cx, eys)

        # It is faster to concatenate data before calculating loss
        # because only one matrix multiplication is called.
        concat_os = F.concat(os, axis=0)
        concat_ys_out = F.concat(ys_out, axis=0)
        loss = F.sum(F.softmax_cross_entropy(
            self.W(concat_os), concat_ys_out, reduce='no')) / batch

        chainer.report({'loss': loss.data}, self)
        n_words = concat_ys_out.shape[0]
        perp = self.xp.exp(loss.data * batch / n_words)
        chainer.report({'perp': perp}, self)
        return loss

    def translate(self, xs, max_length=50):
        """Greedily decode a translation for every source sentence.

        Args:
            xs: List of source word-id arrays.
            max_length: Number of decoding steps to run.

        Returns:
            List of NumPy arrays of target word ids, one per source sentence,
            each cut off before its first ``EOS``.
        """
        batch = len(xs)

        with chainer.no_backprop_mode(), chainer.using_config('train', False):
            xs = [x[::-1] for x in xs]

            exs = sequence_embed(self.embed_x, xs)
            h, c, _ = self.encoder(None, None, exs)

            # Same previous-sentence connection as in training, with its own
            # memory slot; read the old state before storing the new one.
            prev_h = self.prev_h
            self.prev_h = h.data
            h = self._mix_previous(h, prev_h, xs)

            ys = self.xp.full(batch, EOS, numpy.int32)
            result = []
            for i in range(max_length):
                eys = self.embed_y(ys)
                eys = F.split_axis(eys, batch, 0)
                h, c, ys = self.decoder(h, c, eys)
                cys = F.concat(ys, axis=0)
                wy = self.W(cys)
                ys = self.xp.argmax(wy.data, axis=1).astype(numpy.int32)
                result.append(ys)

        # Using `xp.concatenate(...)` instead of `xp.stack(result)` here to
        # support NumPy 1.9.
        result = cuda.to_cpu(
            self.xp.concatenate([self.xp.expand_dims(x, 0) for x in result]).T)

        # Cut every output at its first EOS tag.
        outs = []
        for y in result:
            inds = numpy.argwhere(y == EOS)
            if len(inds) > 0:
                y = y[:inds[0, 0]]
            outs.append(y)
        return outs


In [None]:
# Build the model; the vocabulary sizes come from the loaded word -> id tables.
model = Seq2seq(layer, len(source_ids), len(target_ids), unit)
if gpu >= 0:
    # Make the requested GPU current and move the model onto it.
    chainer.cuda.get_device(gpu).use()
    model.to_gpu(gpu)

# Adam with its default hyper-parameters (none are overridden here).
optimizer = chainer.optimizers.Adam()
optimizer.setup(model)

In [None]:
# Serve the training pairs repeatedly and in their stored order (no shuffling).
train_iter = chainer.iterators.SerialIterator(
    train_data, batchsize, repeat=True, shuffle=False)
updater = training.StandardUpdater(
    train_iter, optimizer, converter=convert, device=gpu)
trainer = training.Trainer(updater, (epoch, 'epoch'), out=out)

# NOTE(review): nothing registered in this cell reports
# 'validation/main/loss' or 'validation/main/perp', so those two columns
# stay empty in the printed table.
log_trigger = (log_interval, 'iteration')
report_keys = [
    'epoch', 'iteration', 'main/loss', 'validation/main/loss',
    'main/perp', 'validation/main/perp', 'validation/main/bleu',
    'elapsed_time']
trainer.extend(extensions.LogReport(trigger=log_trigger))
trainer.extend(extensions.PrintReport(report_keys), trigger=log_trigger)


if validation_source and validation_target:
    test_source = load_data(source_ids, validation_source)
    test_target = load_data(target_ids, validation_target)
    assert len(test_source) == len(test_target)
    # Drop pairs in which either side is empty.
    test_data = [
        (s, t) for s, t in six.moves.zip(test_source, test_target)
        if len(s) > 0 and len(t) > 0]
    test_source_unknown = calculate_unknown_ratio(
        [pair[0] for pair in test_data])
    test_target_unknown = calculate_unknown_ratio(
        [pair[1] for pair in test_data])

    print('Validation data: %d' % len(test_data))
    print('Validation source unknown ratio: %.2f%%' %
          (test_source_unknown * 100))
    print('Validation target unknown ratio: %.2f%%' %
          (test_target_unknown * 100))

    # Registered below to run every `validation_interval` iterations.
    @chainer.training.make_extension()
    def translate(trainer):
        """Print the translation of one randomly chosen validation pair."""
        sample_index = numpy.random.choice(len(test_data))
        source, target = test_data[sample_index]
        result = model.translate([model.xp.array(source)])[0]

        source_sentence = ' '.join(source_words[x] for x in source)
        target_sentence = ' '.join(target_words[y] for y in target)
        result_sentence = ' '.join(target_words[y] for y in result)
        print('# source : ' + source_sentence)
        print('#  result : ' + result_sentence)
        print('#  expect : ' + target_sentence)

    validation_trigger = (validation_interval, 'iteration')
    trainer.extend(translate, trigger=validation_trigger)
    trainer.extend(
        CalculateBleu(
            model, test_data, 'validation/main/bleu', device=gpu),
        trigger=validation_trigger)

print('start training')
trainer.run()


loading...: ./dataset/test.en
loading...: ./dataset/test.jp
Validation data: 21475
Validation source unknown ratio: 2.35%
Validation target unknown ratio: 1.34%
start training
epoch       iteration   main/loss   validation/main/loss  main/perp   validation/main/perp  validation/main/bleu  elapsed_time
[J0           2           31.4018                           35441.8                                                 4.49662       
[J0           4           55.5456                           23552.9                                                 8.65341       
[J0           6           39.7472                           6684.26                                                 12.6475       
[J0           8           295.067                           59316.8                                                 21.787        
[J0           10          189.03                            25470.9                                                 28.1825       
[J0           12          218.375   

In [17]:
# Scratch check: how concatenating along axis 1 lays out two (3, 2, 3) arrays.
tx = numpy.array([[row, row] for row in ([1, 2, 3], [4, 5, 6], [7, 8, 9])])
print(tx.shape)
px = numpy.array([[row, row] for row in ([9, 8, 7], [6, 5, 4], [3, 2, 1])])
print(px.shape)
cx = numpy.concatenate([tx, px], axis=1)
print(cx)
print(cx.shape)

(3, 2, 3)
(3, 2, 3)
[[[1 2 3]
  [1 2 3]
  [9 8 7]
  [9 8 7]]

 [[4 5 6]
  [4 5 6]
  [6 5 4]
  [6 5 4]]

 [[7 8 9]
  [7 8 9]
  [3 2 1]
  [3 2 1]]]
(3, 4, 3)


In [None]:
ls