In [None]:
import os
import re

from tensorflow.python.platform import gfile
import tensorflow as tf

class DataUtils():
    """Vocabulary loading and sentence-to-token-id conversion helpers.

    The special-token ids and the tokenizer regexes are class-level constants
    so they are available on every instance without running ``__init__``.
    """

    # Ids reserved for the special vocabulary symbols.
    PAD_ID = 0
    GO_ID = 1
    EOS_ID = 2
    UNK_ID = 3

    # Punctuation that is split off into its own token (sentences are bytes).
    _WORD_SPLIT = re.compile(b"([.,!?\"':;)(])")
    # Matches one digit; used to rewrite every digit to "0" before lookup.
    _DIGIT_RE = re.compile(br"\d")

    def __init__(self, from_train_file='data/train.en',
                 from_vocab_file='data/vocab.en',
                 to_train_file='data/train.vi',
                 to_vocab_file='data/vocab.vi',
                 from_dev_file='data/tst2012.en',
                 to_dev_file='data/tst2012.vi',
                 from_test_file='data/tst2013.en',
                 to_test_file='data/tst2013.vi'):
        """Remember the corpus paths and load a token table for each of them.

        Every path must exist; ``initialize_vocabulary`` raises ``ValueError``
        for a missing file.

        NOTE(review): the tables below are built from the train/dev/test
        *data* files (one entry per line), not from ``from_vocab_file`` /
        ``to_vocab_file``. That is what the original code did and is kept
        as-is; confirm whether the vocab files were meant to be loaded here.
        """
        self.from_train_file   = from_train_file
        self.source_vocab_file = from_vocab_file
        # Alias with the same naming scheme as ``to_vocab_file``.
        self.from_vocab_file   = from_vocab_file
        self.to_train_file     = to_train_file
        self.to_vocab_file     = to_vocab_file
        self.from_dev_file     = from_dev_file
        self.to_dev_file       = to_dev_file
        self.from_test_file    = from_test_file
        self.to_test_file      = to_test_file

        (self.en_vocab_from_train,
         self.en_rev_vocab_from_train) = self.initialize_vocabulary(from_train_file)

        # The reverse table used to overwrite the forward one here.
        (self.en_vocab_to_train,
         self.en_rev_vocab_to_train) = self.initialize_vocabulary(to_train_file)

        (self.en_vocab_from_dev,
         self.en_rev_vocab_from_dev) = self.initialize_vocabulary(from_dev_file)

        (self.en_vocab_to_dev,
         self.en_rev_vocab_to_dev) = self.initialize_vocabulary(to_dev_file)

        # The test tables used to be stored under the ``_dev`` names,
        # clobbering the dev tables loaded just above.
        (self.en_vocab_from_test,
         self.en_rev_vocab_from_test) = self.initialize_vocabulary(from_test_file)

        (self.en_vocab_to_test,
         self.en_rev_vocab_to_test) = self.initialize_vocabulary(to_test_file)

    def initialize_vocabulary(self, vocabulary_path):
        """Read a one-entry-per-line file into forward and reverse tables.

        Args:
            vocabulary_path: path of the file to read.

        Returns:
            ``(vocab, rev_vocab)`` where ``rev_vocab`` is the list of stripped
            lines as bytes and ``vocab`` maps each entry to its line index.
            When an entry occurs more than once, the last index wins.

        Raises:
            ValueError: if ``vocabulary_path`` does not exist.
        """
        if not gfile.Exists(vocabulary_path):
            raise ValueError("Vocabulary file %s not found" % vocabulary_path)

        with gfile.GFile(vocabulary_path, "rb") as f:
            rev_vocab = [tf.compat.as_bytes(line.strip()) for line in f.readlines()]
        vocab = {token: index for index, token in enumerate(rev_vocab)}
        return vocab, rev_vocab

    def basic_tokenizer(self, sentence):
        """Very basic tokenizer: split the sentence into a list of tokens.

        Splits on whitespace, then separates the punctuation matched by
        ``_WORD_SPLIT`` into tokens of its own. ``sentence`` must be bytes
        because the pattern is a bytes pattern.
        """
        words = []
        for space_separated_fragment in sentence.strip().split():
            words.extend(self._WORD_SPLIT.split(space_separated_fragment))
        # re.split leaves empty strings around a leading/trailing delimiter.
        return [w for w in words if w]

    def sentence_to_token_ids(self, sentence, vocabulary_path, tokenizer=None, normalize_digits=True):
        """Convert a sentence into a list of vocabulary ids.

        Args:
            sentence: the sentence to convert (bytes for the default tokenizer).
            vocabulary_path: either a path to a vocabulary file or an already
                loaded ``token -> id`` dict (``data_to_token_ids`` passes a
                dict so the file is not re-read for every line).
            tokenizer: optional callable mapping a sentence to a token list;
                ``basic_tokenizer`` is used when omitted.
            normalize_digits: if true, every digit is replaced by ``0`` before
                the lookup.

        Returns:
            A list of integer ids; unknown tokens map to ``UNK_ID``.
        """
        if isinstance(vocabulary_path, dict):
            vocabulary = vocabulary_path
        else:
            vocabulary, _ = self.initialize_vocabulary(vocabulary_path)

        words = tokenizer(sentence) if tokenizer else self.basic_tokenizer(sentence)

        if not normalize_digits:
            return [vocabulary.get(w, self.UNK_ID) for w in words]

        # Normalize digits by 0 before looking words up in the vocabulary.
        return [vocabulary.get(self._DIGIT_RE.sub(b"0", w), self.UNK_ID) for w in words]

    def data_to_token_ids(self, data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
        """Tokenize ``data_path`` line by line into ``target_path``.

        Each output line holds the space-separated ids of the matching input
        line. Nothing is done when ``target_path`` already exists.

        Args:
            data_path: input text file, one sentence per line.
            target_path: output file for the token ids.
            vocabulary_path: vocabulary file used for the lookup.
            tokenizer: optional tokenizer forwarded to ``sentence_to_token_ids``.
            normalize_digits: forwarded to ``sentence_to_token_ids``.
        """
        if gfile.Exists(target_path):
            return

        print("Tokenizing data in %s" % data_path)
        # Load the table once instead of once per sentence.
        vocab, _ = self.initialize_vocabulary(vocabulary_path)
        with gfile.GFile(data_path, mode="rb") as data_file:
            with gfile.GFile(target_path, mode="w") as tokens_file:
                for counter, line in enumerate(data_file, 1):
                    if counter % 100000 == 0:
                        print("  tokenizing line %d" % counter)
                    token_ids = self.sentence_to_token_ids(
                        tf.compat.as_bytes(line), vocab, tokenizer, normalize_digits)
                    # Written inside the loop so every input line yields one
                    # output line.
                    tokens_file.write(" ".join(str(tok) for tok in token_ids) + "\n")

    def prepare_data(self, data_dir, from_train_path, to_train_path, from_dev_path, to_dev_path, from_vocabulary_size,
                 to_vocabulary_size, tokenizer=None):
        """Create token-id files for the training and development corpora.

        The id files are written next to their inputs with an
        ``.ids<vocabulary_size>`` suffix. Vocabulary files are the ones given
        to the constructor.

        Args:
            data_dir: unused; kept for interface compatibility.
            from_train_path, to_train_path: source/target training files.
            from_dev_path, to_dev_path: source/target development files.
            from_vocabulary_size, to_vocabulary_size: used only to name the
                id files.
            tokenizer: optional tokenizer forwarded to ``data_to_token_ids``.

        Returns:
            ``(from_train_ids_path, to_train_ids_path, from_dev_ids_path,
            to_dev_ids_path, from_vocab_path, to_vocab_path)``.
        """
        from_vocab_path = self.source_vocab_file
        to_vocab_path = self.to_vocab_file

        # Create token ids for the training data.
        to_train_ids_path = to_train_path + (".ids%d" % to_vocabulary_size)
        from_train_ids_path = from_train_path + (".ids%d" % from_vocabulary_size)
        self.data_to_token_ids(to_train_path, to_train_ids_path, to_vocab_path, tokenizer)
        self.data_to_token_ids(from_train_path, from_train_ids_path, from_vocab_path, tokenizer)

        # Create token ids for the development data.
        to_dev_ids_path = to_dev_path + (".ids%d" % to_vocabulary_size)
        from_dev_ids_path = from_dev_path + (".ids%d" % from_vocabulary_size)
        self.data_to_token_ids(to_dev_path, to_dev_ids_path, to_vocab_path, tokenizer)
        self.data_to_token_ids(from_dev_path, from_dev_ids_path, from_vocab_path, tokenizer)

        return (from_train_ids_path, to_train_ids_path,
              from_dev_ids_path, to_dev_ids_path,
              from_vocab_path, to_vocab_path)

# Script entry point: constructing DataUtils with its default paths runs
# __init__, which loads a token table for each default corpus file.
if __name__ == "__main__":
    du = DataUtils()


In [None]:
import random

import numpy as np
import tensorflow as tf

from DataUtils import DataUtils
from Config import Config
from attn_cell import attn_cell, _linear

class Seq2SeqModel(LanguageModel):
    """Bucketed attention sequence-to-sequence model (TF1 graph mode).

    All hyper-parameters and file paths are read from ``Config``.

    NOTE(review): ``LanguageModel`` is not imported anywhere in the visible
    file; it has to be brought into scope for this class to be defined.
    """

    def load_data(self):
        """Read the token-id training files into per-bucket lists.

        Each line of the two files is a whitespace-separated list of integer
        ids. ``EOS_ID`` is appended to every target sequence. A pair goes into
        the first bucket whose ``(from_size, to_size)`` is strictly larger
        than both sequence lengths; pairs that fit no bucket are dropped.

        Returns:
            A list with one entry per bucket; each entry is a list of
            ``[from_ids, to_ids]`` pairs.
        """
        data_set = [[] for _ in self.config.buckets]

        from_train_file = self.config.from_train_file
        to_train_file = self.config.to_train_file

        # A falsy max_size means "no limit".
        max_size = self.config.max_size

        with tf.gfile.GFile(from_train_file, "r") as from_file:
            with tf.gfile.GFile(to_train_file, "r") as to_file:
                from_line, to_line = from_file.readline(), to_file.readline()
                # Must start before the loop: the loop condition reads it.
                count = 0
                while from_line and to_line and (not max_size or count < max_size):
                    count += 1
                    if count % 5000 == 0:
                        print("Line: ", count)
                    from_ids = [int(x) for x in from_line.split()]
                    to_ids = [int(x) for x in to_line.split()]
                    to_ids.append(self.config.EOS_ID)

                    for bucket_id, (from_size, to_size) in enumerate(self.config.buckets):
                        if len(from_ids) < from_size and len(to_ids) < to_size:
                            data_set[bucket_id].append([from_ids, to_ids])
                            break
                    # Advance the files, not the strings just read.
                    from_line, to_line = from_file.readline(), to_file.readline()
        return data_set

    def create_feed_dict(self):
        """Create the placeholder lists, keyed by role.

        Returns:
            A dict with keys ``'encoder_inputs'``, ``'decoder_inputs'``,
            ``'to_weights'`` and ``'targets'``, each a list of tensors.
        """
        feed_dict = {}
        encoder_inputs = self.bucketize_encoding_layer()
        feed_dict['encoder_inputs'] = encoder_inputs

        decoder_inputs, to_weights, targets = self.bucketize_decoding_layer()

        feed_dict['decoder_inputs'] = decoder_inputs
        feed_dict['to_weights'] = to_weights
        feed_dict['targets'] = targets

        return feed_dict

    def bucketize_encoding_layer(self):
        """Create one int32 placeholder per encoder step of the last bucket."""
        encoder_inputs = []

        for i in range(self.config.buckets[-1][0]):
            encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                name="encoder{0}".format(i)))
        return encoder_inputs

    def bucketize_decoding_layer(self):
        """Create decoder-input and weight placeholders plus shifted targets.

        One more decoder placeholder than the last bucket's decoder size is
        created so that ``targets`` (the decoder inputs shifted by one) has
        exactly that bucket's decoder size.

        Returns:
            ``(decoder_inputs, to_weights, targets)``.
        """
        decoder_inputs = []
        to_weights = []

        for i in range(self.config.buckets[-1][1]+1):
            decoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                name="decoder{0}".format(i)))
            to_weights.append(tf.placeholder(tf.float32, shape=[None],
                                name="weight{0}".format(i)))
        # Stop one short: the last placeholder has no successor.
        targets = [decoder_inputs[i+1] for i in range(len(decoder_inputs) - 1)]

        return (decoder_inputs, to_weights, targets)

    def add_placeholders(self):
        """Create the two standalone int32 input placeholders."""
        self.en_input_placeholder = tf.placeholder(tf.int32, shape=[None])
        self.de_input_placeholder = tf.placeholder(tf.int32, shape=[None])

    def add_projection(self):
        """Create the output-projection variables used by the sampled softmax.

        Returns:
            ``(w, b)`` when ``0 < num_samples < to_vocab_size``, else ``None``
            (in which case ``self.w_t``, ``self.w`` and ``self.b`` are not set).
        """
        num_samples = self.config.num_samples
        if num_samples > 0 and num_samples < self.config.to_vocab_size:
            self.w_t = tf.get_variable("W", [self.config.to_vocab_size, self.config.size], dtype=self.config.dtype)
            self.w = tf.transpose(self.w_t)
            self.b = tf.get_variable("b", [self.config.to_vocab_size], dtype=self.config.dtype)
            return (self.w, self.b)
        return None

    # Embedding and attention function
    def add_embedding(self):
        """Embed the encoder inputs and split the result along axis 1.

        Returns:
            A list of ``encode_num_steps`` tensors, each one squeezed slice of
            the embedded input.
            NOTE(review): the lookup is done on the list of per-step
            placeholders; confirm that axis 1 is the intended step axis.
        """
        with tf.device('/cpu:0'):
            with tf.variable_scope("embedding") as scope:
                L = tf.get_variable("L",[self.config.from_vocab_size, self.config.encode_hidden_size], initializer = self.config.initializer)
                embeds = tf.nn.embedding_lookup(L, self.feed_dict['encoder_inputs'])
                # An int asks tf.split for that many equal pieces; the
                # original passed a tensor wrapped in a list as the sizes.
                embedded = [tf.squeeze(x) for x in tf.split(embeds, self.config.encode_num_steps, axis=1)]
        return embedded

    def add_decode_embedding(self):
        """Embed the decoder inputs and split the result along axis 1.

        Returns:
            A list of ``decode_num_steps`` tensors (see ``add_embedding``).
        """
        with tf.variable_scope("decode_embedding") as decode_scope:
            # decode_hidden_size lives on the config, like every other size.
            L = tf.get_variable("L", [self.config.to_vocab_size, self.config.decode_hidden_size], initializer=self.config.initializer)
            embeds = tf.nn.embedding_lookup(L, self.feed_dict['decoder_inputs'])
            embedded = [tf.squeeze(x) for x in tf.split(embeds, self.config.decode_num_steps, axis=1)]
        return embedded


    def LSTM_cell(self, en_states=None):
        """Create the plain, encoder and attention-decoder cells.

        Args:
            en_states: forwarded as the second argument to ``attn_cell``.
                NOTE(review): presumably the encoder states to attend over;
                the original referenced an undefined ``en_states`` here.
                Confirm against the ``attn_cell`` module, including whether
                ``None`` is acceptable.
        """
        self.cell = tf.contrib.rnn.BasicLSTMCell(self.config.encode_hidden_size)
        # attn_cell is the project-local cell imported at the top of the file.
        self.decoder_cell = attn_cell(self.config.encode_hidden_size, en_states)
        self.encoder_cell = tf.contrib.rnn.BasicLSTMCell(self.config.encode_hidden_size)

    def encoder_layer(self, inputs):
        """Unroll ``self.cell`` for ``encode_num_steps`` steps.

        inputs: embedded encoder inputs
        outputs: a tuple of (outputs, states)

        NOTE(review): the first step receives ``inputs`` as a whole and every
        later step receives the previous output; per-step indexing of
        ``inputs`` may have been intended. Left unchanged.
        """
        initial_state = (tf.zeros([self.config.batch_size, self.config.encode_hidden_size]), tf.zeros([self.config.batch_size, self.config.encode_hidden_size]))
        state = initial_state
        cell = self.cell
        outputs = []
        states = []

        for i in range(self.config.encode_num_steps):
            output, state = cell(inputs, state)
            inputs = output
            outputs.append(output)
            states.append(state)

        return (outputs, states)

    def decoder_layer(self, inputs):
        """Unroll ``self.decoder_cell`` for ``decode_num_steps`` steps.

        inputs: embedded decoder inputs
        outputs: a tuple of (outputs, states)

        NOTE(review): same input-feeding pattern as ``encoder_layer``.
        """
        initial_state = (tf.zeros([self.config.batch_size, self.config.decode_hidden_size]), tf.zeros([self.config.batch_size, self.config.decode_hidden_size]))
        state = initial_state
        cell = self.decoder_cell
        outputs = []
        states = []

        for i in range(self.config.decode_num_steps):
            output, state = cell(inputs, state)
            inputs = output
            outputs.append(output)
            states.append(state)

        return (outputs, states)

    # Loss function
    def add_loss_op(self, inputs, labels):
        """Sampled-softmax loss over the projection variables.

        Requires ``add_projection`` to have created ``self.w_t`` and
        ``self.b``.

        Args:
            inputs: decoder outputs for one step.
            labels: target ids for the same step.
        """
        labels = tf.reshape(labels, [-1,1])
        w_t = tf.cast(self.w_t, self.config.dtype)
        b = tf.cast(self.b, self.config.dtype)
        inputs = tf.cast(inputs, self.config.dtype)

        softmax_loss = tf.cast(
            tf.nn.sampled_softmax_loss(
                weights=w_t,
                biases=b,
                labels=labels,
                inputs=inputs,
                num_sampled=self.config.num_samples,
                num_classes=self.config.to_vocab_size),
                self.config.dtype
            )
        return softmax_loss


    def add_embeddings(self, encoder_inputs, decoder_inputs, cell, is_decode, projection=None):
        """Build the embedding attention seq2seq graph.

        Args:
            encoder_inputs, decoder_inputs: lists of id tensors.
            cell: RNN cell passed to ``embedding_attention_seq2seq``.
            is_decode: passed as ``feed_previous``.
            projection: optional ``(w, b)`` pair passed as
                ``output_projection``.

        Returns:
            The ``(outputs, state)`` tuple returned by
            ``embedding_attention_seq2seq``.
        """
        embeddings = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
            encoder_inputs, decoder_inputs, cell,
            num_encoder_symbols=self.config.from_vocab_size,
            num_decoder_symbols=self.config.to_vocab_size,
            embedding_size=self.config.size,
            output_projection=projection,
            feed_previous=is_decode,
            dtype=self.config.dtype
        )
        return embeddings

    def add_to_model_with_buckets(self, encoder_inputs, decoder_inputs, targets, weights,
                                  cell=None, projection=None, do_decode=False):
        """Build one seq2seq sub-graph and one loss per bucket.

        Args:
            encoder_inputs, decoder_inputs, targets, weights: full-length
                tensor lists; each bucket uses a prefix of them.
            cell: RNN cell handed to ``add_embeddings`` (required).
            projection: optional ``(w, b)`` output projection; when given, the
                sampled-softmax loss from ``add_loss_op`` is used.
            do_decode: passed on as ``feed_previous``.

        Returns:
            ``(outputs, losses)``, each a list with one entry per bucket.

        Raises:
            ValueError: if ``cell`` is not supplied.
        """
        if cell is None:
            raise ValueError("add_to_model_with_buckets requires an RNN cell")

        softmax_loss_function = None
        if projection is not None:
            # sequence_loss supplies (labels, logits); add_loss_op takes
            # (inputs, labels).
            def softmax_loss_function(labels, logits):
                return self.add_loss_op(logits, labels)

        losses = []
        outputs = []
        with tf.variable_scope("en_de_model") as model_scope:
            for i, bucket in enumerate(self.config.buckets):
                # Every bucket after the first reuses the first one's variables.
                with tf.variable_scope(tf.get_variable_scope(),
                                       reuse=True if i > 0 else None):
                    buckets_outputs, _ = self.add_embeddings(
                        encoder_inputs[:bucket[0]], decoder_inputs[:bucket[1]],
                        cell, do_decode, projection)
                    outputs.append(buckets_outputs)
                    losses.append(tf.contrib.legacy_seq2seq.sequence_loss(
                                    outputs[-1],
                                    targets[:bucket[1]],
                                    weights[:bucket[1]],
                                    softmax_loss_function=softmax_loss_function
                                    ))
        return outputs, losses


    # Add model
    def add_model(self, cell, projection, do_decode=False):
        """Assemble the bucketed model, then projection or training ops.

        When ``config.forward_only`` is set and a projection is given, the
        bucket outputs are projected to vocabulary size. Otherwise SGD updates
        with global-norm gradient clipping are created for every bucket. A
        saver over all global variables is created in both cases.

        Args:
            cell: RNN cell for the seq2seq graph.
            projection: ``(w, b)`` output projection or ``None``.
            do_decode: passed on as ``feed_previous``.
        """
        encoder_inputs = self.feed_dict['encoder_inputs']
        decoder_inputs = self.feed_dict['decoder_inputs']
        to_weights     = self.feed_dict['to_weights']
        targets        = self.feed_dict['targets']

        self.outputs, self.losses = self.add_to_model_with_buckets(
                encoder_inputs, decoder_inputs, targets, to_weights,
                cell=cell, projection=projection, do_decode=do_decode
        )

        if self.config.forward_only:
            if projection is not None:
                for b in range(len(self.config.buckets)):
                    self.outputs[b] = [
                        tf.matmul(x, projection[0]) + projection[1]
                        for x in self.outputs[b]
                    ]
        else:
            params = tf.trainable_variables()
            self.gradient_norms = []
            self.updates = []

            self.lr = tf.Variable(float(self.config.lr),
                                trainable=False, dtype=self.config.dtype)
            self.lr_decay_op = self.lr.assign(self.lr*self.config.learning_rate_decay)
            self.global_step = tf.Variable(0, trainable=False)

            optimizer = tf.train.GradientDescentOptimizer(self.lr)

            for b in range(len(self.config.buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                self.config.max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(optimizer.apply_gradients(
                    zip(clipped_gradients, params),  global_step=self.global_step
                ))
        self.saver = tf.train.Saver(tf.global_variables())

    def step(self, session, encoder_inputs, decoder_inputs, targets, to_weights, b):
        """Run one step of the model on bucket ``b``.

        Args:
            session: TensorFlow session to run in.
            encoder_inputs, decoder_inputs, to_weights: per-step batch values
                fed to the placeholders of the same index.
            targets: unused; targets are the shifted decoder placeholders.
            b: index of the bucket to run.

        Returns:
            ``(None, loss, outputs)`` when ``config.forward_only`` is set,
            where ``outputs`` is the list of per-step decoder outputs;
            otherwise ``(gradient_norm, loss, None)``.
        """
        input_feed = {}
        for i in range(len(encoder_inputs)):
            name = self.feed_dict['encoder_inputs'][i].name
            input_feed[name] = encoder_inputs[i]

        for i in range(len(decoder_inputs)):
            de_name = self.feed_dict['decoder_inputs'][i].name
            to_name = self.feed_dict['to_weights'][i].name
            input_feed[de_name] = decoder_inputs[i]
            input_feed[to_name] = to_weights[i]

        # Targets are decoder inputs shifted by one, so one extra decoder
        # placeholder is fed with zeros.
        to_last = self.feed_dict['decoder_inputs'][len(decoder_inputs)].name
        input_feed[to_last] = np.zeros([self.config.batch_size], dtype=np.int32)

        if self.config.forward_only:
            # Must be a list: the per-step outputs are appended below.
            output_feed = [self.losses[b]]
            for i in range(len(decoder_inputs)):
                output_feed.append(self.outputs[b][i])
        else:
            output_feed = [self.updates[b], self.gradient_norms[b], self.losses[b]]

        outputs = session.run(output_feed, input_feed)
        if self.config.forward_only:
            return None, outputs[0], outputs[1:]
        else:
            return outputs[1], outputs[2], None

    def __init__(self, do_decode=False):
        """Load the data, create the placeholders and build the graph.

        Args:
            do_decode: passed through ``add_model`` as ``feed_previous``.
        """
        self.config = Config

        # Kept on the instance so the loaded buckets remain reachable.
        self.data_set = self.load_data()
        self.add_placeholders()

        self.feed_dict = self.create_feed_dict()

        output_projection = self.add_projection()

        def make_cell():
            return tf.contrib.rnn.BasicLSTMCell(self.config.size)

        # Create the internal multi-layer cell
        if self.config.num_layers == 1:
            cell = make_cell()
        else:
            # A fresh cell per layer, so the layers do not share one cell object.
            cell = tf.contrib.rnn.MultiRNNCell(
                [make_cell() for _ in range(self.config.num_layers)])


        self.add_model(cell, output_projection, do_decode)
