In [None]:
# Load the first MAX_SENTENCES lines of each side of the Europarl FR-EN
# parallel corpus. Line i of the English list and line i of the French list
# come from the same line number of the two files.
from itertools import islice

MAX_SENTENCES = 20000

# islice stops reading after MAX_SENTENCES lines, so the full corpus files are
# never loaded into memory. The encoding is given explicitly so that decoding
# does not depend on the platform's locale default.
with open('../corpus/europarl-v7.fr-en.en', encoding='utf-8') as en_file:
    en_corpus = list(islice(en_file, MAX_SENTENCES))

with open('../corpus/europarl-v7.fr-en.fr', encoding='utf-8') as fr_file:
    fr_corpus = list(islice(fr_file, MAX_SENTENCES))


''' Tokenization '''
import nltk

# Each raw corpus line is lower-cased and split into a list of word tokens,
# giving one token list per sentence for each language.
en_sentences = [nltk.word_tokenize(line.lower()) for line in en_corpus]
fr_sentences = [nltk.word_tokenize(line.lower()) for line in fr_corpus]


''' Build vocabulary list for mapping text to text ID '''
from sklearn.feature_extraction.text import CountVectorizer

# One vectorizer per language, both tokenizing with NLTK's word tokenizer.
cv_en = CountVectorizer(tokenizer=nltk.word_tokenize)
cv_en.fit(en_corpus)
cv_fr = CountVectorizer(tokenizer=nltk.word_tokenize)
cv_fr.fit(fr_corpus)

# Fetch each language's vocabulary once and reuse it for both the size report
# and the combined list.
en_vocab_list = cv_en.get_feature_names()
fr_vocab_list = cv_fr.get_feature_names()
print('en vocabs size:', len(en_vocab_list))
print('fr vocabs size:', len(fr_vocab_list))

# Shared ID space: the English entries come first, followed by the French
# entries, so a word's ID is its position in this concatenated list.
vocab_list = en_vocab_list + fr_vocab_list


# Map every token of every sentence to its integer ID in `vocab_list`.
#
# A dict gives constant-time lookups; the previous `vocab_list.index(word)`
# scanned the whole vocabulary for every single token.
# `setdefault` keeps the FIRST position of a word that appears more than once
# in `vocab_list` (e.g. a word present in both the English and the French
# part), which is exactly what `list.index` returned.
word_to_id = {}
for vocab_id, word in enumerate(vocab_list):
    word_to_id.setdefault(word, vocab_id)

# X_train: English sentences as ID lists; y_train: French sentences as ID lists.
# A token missing from the vocabulary raises KeyError.
X_train = [[word_to_id[word] for word in sent] for sent in en_sentences]
y_train = [[word_to_id[word] for word in sent] for sent in fr_sentences]

In [None]:
import math
import sys

import numpy as np
import tensorflow as tf
import tensorflow.contrib.seq2seq as seq2seq
from tensorflow.contrib.rnn import LSTMCell, LSTMStateTuple, GRUCell
from tensorflow.python.layers import core as layers_core

import utils

class Seq2SeqModel():
    """LSTM encoder-decoder built on the TensorFlow 1.x ``tf.contrib.seq2seq`` API.

    Constructing an instance builds the whole graph in the current default
    graph: placeholders, embeddings, a multi-layer LSTM encoder, a training
    decoder, an inference decoder (greedy or beam search) and the optimizer.

    Tensors/ops exposed for the caller:
        encoder_inputs, encoder_inputs_length,
        decoder_targets, decoder_targets_length  -- input placeholders
        embedding_placeholder, embedding_init_op -- only when ``pre_emb`` is set
        decoder_predictions_train                -- argmax of the training logits
        decoder_predictions_inference            -- ``sample_id`` (greedy) or
                                                    ``predicted_ids`` (beam search)
        loss, summary_op, train_op

    NOTE(review): the training decoder is fed the embedded ``decoder_targets``
    and the loss is computed against the same, unshifted ``decoder_targets``.
    No start token is prepended, so at step t the decoder already sees the
    token it is asked to predict. Confirm whether the batches are shifted
    upstream before relying on the reported loss.
    """

    # Token IDs handed to the inference decoders.
    # NOTE(review): these are plain vocabulary indices; confirm that entries
    # 0 and 1 of the vocabulary are really reserved for start/end markers.
    START_TOKEN_ID = 0
    END_TOKEN_ID = 1

    LEARNING_RATE = 0.0002
    GRADIENT_CLIP_VALUE = 5.0

    def __init__(self,
                 encoder_num_units,
                 decoder_num_units,
                 embedding_size,
                 num_layers,
                 vocab_size,
                 batch_size,
                 bidirectional = False,
                 attention = False,
                 beam_search = False,
                 beam_width = None,
                 pre_emb = False,
                 time_major = False):
        """Store the hyper-parameters and build the graph.

        Args:
            encoder_num_units: LSTM units per encoder layer.
            decoder_num_units: LSTM units per decoder layer; also used as the
                attention depth and attention layer size.
            embedding_size: width of the embedding vectors.
            num_layers: number of stacked LSTM layers in encoder and decoder.
            vocab_size: number of rows of the embedding matrices and size of
                the output projection.
            batch_size: fixed batch size baked into the graph (loss weights,
                decoder zero state, greedy start tokens).
            bidirectional: select the bidirectional encoder (not implemented).
            attention: wrap the decoder cell with Luong attention.
            beam_search: use a beam search decoder for inference.
            beam_width: beam width; required when ``beam_search`` is true.
            pre_emb: use frozen embeddings loaded through
                ``embedding_placeholder`` instead of trainable random ones.
            time_major: whether the fed id batches are [time, batch].

        Raises:
            ValueError: if ``beam_search`` is set without a positive
                ``beam_width``.
        """
        if beam_search and not beam_width:
            raise ValueError('beam_width must be a positive integer when beam_search is enabled')

        self.bidirectional = bidirectional
        self.attention = attention
        self.beam_search = beam_search
        # These two were read by the graph-building methods but never set.
        self.pre_emb = pre_emb
        self.time_major = time_major

        self.embedding_size = embedding_size
        self.num_layers = num_layers
        self.encoder_num_units = encoder_num_units
        self.decoder_num_units = decoder_num_units

        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.beam_width = beam_width

        self._make_graph()

    def _make_graph(self):
        """Build every part of the graph, in dependency order."""
        self._init_placeholders()
        self._init_decoder_train_connectors()
        self._init_embedding()

        if self.bidirectional:
            self._init_bidirectional_encoder()
        else:
            self._init_encoder()

        self._init_decoder()
        self._init_optimizer()

    def _init_placeholders(self):
        """Create the int32 placeholders for token ids and sequence lengths."""
        self.encoder_inputs = tf.placeholder(
            shape = (None, None),
            dtype = tf.int32,
            name = 'encoder_inputs'
        )

        self.encoder_inputs_length = tf.placeholder(
            shape = (None,),
            dtype = tf.int32,
            name = 'encoder_inputs_length',
        )

        self.decoder_targets = tf.placeholder(
            shape = (None, None),
            dtype = tf.int32,
            name = 'decoder_targets',
        )

        self.decoder_targets_length = tf.placeholder(
            shape = (None,),
            dtype = tf.int32,
            name = 'decoder_targets_length',
        )

        if self.pre_emb:
            # Receives the pre-trained embedding matrix that is copied into the
            # frozen embedding variables by `embedding_init_op`.
            self.embedding_placeholder = tf.placeholder(
                shape = (self.vocab_size, self.embedding_size),
                dtype = tf.float32,
                name = 'embedding_placeholder',
            )

    def make_train_inputs(self, x, y, e_m = None):
        """Build the feed dict for one training step.

        Args:
            x: batch of source id sequences.
            y: batch of target id sequences.
            e_m: pre-trained embedding matrix; required when the model was
                built with ``pre_emb`` and ignored otherwise.

        Returns:
            dict mapping the model's placeholders to the prepared batch.

        Raises:
            ValueError: if the model uses ``pre_emb`` and ``e_m`` is None.
        """
        # `Touka.prepare_batch` is defined outside this file; it is unpacked
        # here as an (ids, lengths) pair -- presumably padded, verify upstream.
        inputs_id_, inputs_id_length_ = Touka.prepare_batch(inputs = x,
                                                            dim = 2,
                                                            time_major = self.time_major)
        targets_id_, targets_id_length_ = Touka.prepare_batch(inputs = y,
                                                              dim = 2,
                                                              time_major = self.time_major)
        feed = {
            self.encoder_inputs: inputs_id_,
            self.encoder_inputs_length: inputs_id_length_,
            self.decoder_targets_length: targets_id_length_,
            self.decoder_targets: targets_id_,
        }
        if self.pre_emb:
            if e_m is None:
                raise ValueError('e_m (embedding matrix) is required when pre_emb is enabled')
            feed[self.embedding_placeholder] = e_m
        return feed

    def _init_decoder_train_connectors(self):
        """Alias the decoder length and create all-ones loss weights.

        `loss_weights` is kept as a public attribute; the loss in
        `_init_optimizer` uses its own sequence mask instead.
        """
        with tf.name_scope('DecoderTrainFeeds'):
            self.decoder_train_length = self.decoder_targets_length
            self.loss_weights = tf.ones([
                self.batch_size,
                tf.reduce_max(self.decoder_train_length)
            ], dtype = tf.float32)

    def _init_embedding(self):
        """Create the encoder/decoder embedding matrices and embed the inputs."""
        table_shape = [self.vocab_size, self.embedding_size]

        if self.pre_emb:
            # Frozen tables that start at zero; they only receive real values
            # once `embedding_init_op` is run with `embedding_placeholder` fed.
            self.embedding_encoder = tf.Variable(tf.constant(0.0, shape = table_shape),
                                                 trainable = False)
            self.embedding_decoder = tf.Variable(tf.constant(0.0, shape = table_shape),
                                                 trainable = False)
            # The assign ops used to be created and then dropped, leaving the
            # tables at zero. Keep a handle so the caller can run them once
            # after variable initialization.
            self.embedding_init_op = tf.group(
                self.embedding_encoder.assign(self.embedding_placeholder),
                self.embedding_decoder.assign(self.embedding_placeholder))

            self.encoder_embedding_inputs = tf.nn.embedding_lookup(
                self.embedding_encoder, self.encoder_inputs)
            self.decoder_embedding_inputs = tf.nn.embedding_lookup(
                self.embedding_decoder, self.decoder_targets)
        else:
            # Separate trainable tables for the source and the target side.
            self.embedding_encoder = tf.Variable(tf.random_uniform(table_shape))
            self.encoder_embedding_inputs = tf.nn.embedding_lookup(
                self.embedding_encoder, self.encoder_inputs)

            self.embedding_decoder = tf.Variable(tf.random_uniform(table_shape))
            self.decoder_embedding_inputs = tf.nn.embedding_lookup(
                self.embedding_decoder, self.decoder_targets)

    def _init_encoder(self):
        """Build the unidirectional multi-layer LSTM encoder.

        Sets `encoder_outputs` and `encoder_state` (one LSTM state per layer).
        """
        with tf.variable_scope("Encoder"):
            encoder_cell = tf.contrib.rnn.MultiRNNCell(
                [tf.nn.rnn_cell.BasicLSTMCell(self.encoder_num_units)
                 for _ in range(self.num_layers)])
            self.encoder_outputs, self.encoder_state = tf.nn.dynamic_rnn(
                cell = encoder_cell,
                inputs = self.encoder_embedding_inputs,
                sequence_length = self.encoder_inputs_length,
                time_major = self.time_major,
                dtype = tf.float32
            )

    def _init_bidirectional_encoder(self):
        """Placeholder for the bidirectional encoder ("to be fixed").

        Raises:
            NotImplementedError: always. The method used to return silently
                without setting `encoder_outputs` / `encoder_state`, which only
                failed later with an AttributeError inside the decoder.
        """
        # Unfinished draft kept for reference:
#         with tf.variable_scope("Bidirectional_Encoder") as scope:
#             def make_cell(rnn_size):
#                 enc_cell = tf.nn.rnn_cell.BasicLSTMCell(rnn_size)
#                 return enc_cell
#             num_layers = 4
#             encoder_cell = tf.contrib.rnn.MultiRNNCell([make_cell(self.encoder_num_units) for _ in range(num_layers)])
#             bi_enc_outputs, bi_enc_state = tf.nn.bidirectional_dynamic_rnn(
#                 cell_fw = encoder_cell,
#                 cell_bw = encoder_cell,
#                 inputs = self.encoder_embedding_inputs,
#                 sequence_length = self.encoder_inputs_length,
#                 time_major = True,
#                 dtype = tf.float32
#             )
#             self.encoder_outputs = tf.concat(bi_enc_outputs, 2)

#             encoder_state_c = tf.concat(
#                 (bi_enc_state[0][0], bi_enc_state[1][0]), -1)
#             encoder_state_h = tf.concat(
#                 (bi_enc_state[0][1], bi_enc_state[1][1]), -1)
#             self.encoder_state = LSTMStateTuple(c = encoder_state_c, h = encoder_state_h)

#             encoder_state = []
#             for layer_id in range(num_layers):
#                 encoder_state.append(bi_enc_state[0][layer_id])  # forward
#                 encoder_state.append(bi_enc_state[1][layer_id])  # backward
#             self.encoder_state = tuple(encoder_state)

#             self.encoder_state = tf.concat(bi_enc_state, 0)
        raise NotImplementedError('the bidirectional encoder is not implemented yet')

    def _wrap_decoder_cell(self, rnn_cell, memory, memory_length, cell_state, batch_size):
        """Return `(cell, initial_state)` for a decoder, adding attention if enabled.

        Without attention the bare RNN cell and `cell_state` are returned as-is.
        With attention the cell is wrapped in an `AttentionWrapper` over
        `memory`, and the wrapper's zero state is cloned with `cell_state`.
        """
        if not self.attention:
            return rnn_cell, cell_state

        attention_mechanism = tf.contrib.seq2seq.LuongAttention(
            self.decoder_num_units,
            memory,
            memory_sequence_length = memory_length)
        attention_cell = tf.contrib.seq2seq.AttentionWrapper(
            rnn_cell,
            attention_mechanism,
            attention_layer_size = self.decoder_num_units)
        zero_state = attention_cell.zero_state(batch_size = batch_size, dtype = tf.float32)
        return attention_cell, zero_state.clone(cell_state = cell_state)

    def _init_decoder(self):
        """Build the training decoder and the inference decoder.

        Both decoders share one RNN cell object and one projection layer, and
        both are built under the "Decoder" variable scope (the inference one
        with `reuse=True`) so the attention layers share weights as well.
        """
        projection_layer = layers_core.Dense(units = self.vocab_size, use_bias = False)

        # Every decoder layer is initialised from the state of the LAST encoder
        # layer, repeated num_layers times.
        # NOTE(review): this presumably requires encoder_num_units ==
        # decoder_num_units for the state shapes to fit -- confirm.
        self.encoder_state = tuple(self.encoder_state[-1] for _ in range(self.num_layers))

        rnn_cell = tf.contrib.rnn.MultiRNNCell(
            [tf.nn.rnn_cell.BasicLSTMCell(self.decoder_num_units)
             for _ in range(self.num_layers)])

        # Attention memory must be [batch_size, max_time, num_units].
        attention_memory = None
        if self.attention:
            attention_memory = self.encoder_outputs
            if self.time_major:
                attention_memory = tf.transpose(attention_memory, [1, 0, 2])

        # ---- training decoder ----
        with tf.variable_scope("Decoder") as scope:
            # Previously `decoder_cell` was only defined on the attention path.
            decoder_cell, initial_state = self._wrap_decoder_cell(
                rnn_cell,
                attention_memory,
                self.encoder_inputs_length,
                self.encoder_state,
                self.batch_size)

            training_helper = tf.contrib.seq2seq.TrainingHelper(
                self.decoder_embedding_inputs,
                self.decoder_train_length,
                time_major = self.time_major)

            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cell,
                helper = training_helper,
                initial_state = initial_state,
                output_layer = projection_layer)

            (self.decoder_outputs_train,
             self.decoder_state_train,
             _) = tf.contrib.seq2seq.dynamic_decode(
                training_decoder,
                scope = scope,
                impute_finished = False
            )
            self.decoder_logits_train = self.decoder_outputs_train.rnn_output
            self.decoder_predictions_train = tf.identity(
                tf.argmax(self.decoder_logits_train, axis = -1),
                name = 'train_predictions')

        # ---- inference decoder (greedy or beam search) ----
        if self.beam_search:
            # Beam search needs every per-example tensor repeated beam_width
            # times. The memory is tiled AFTER it was made batch-major so that
            # the batch axis (not the time axis) is the one being tiled.
            inference_batch_size = self.batch_size * self.beam_width
            dec_start_state = seq2seq.tile_batch(self.encoder_state, self.beam_width)
            enc_lengths = seq2seq.tile_batch(self.encoder_inputs_length, self.beam_width)
            if self.attention:
                attention_memory = seq2seq.tile_batch(attention_memory, self.beam_width)
        else:
            inference_batch_size = self.batch_size
            dec_start_state = self.encoder_state
            enc_lengths = self.encoder_inputs_length

        # Re-enter the "Decoder" scope with reuse so the attention mechanism and
        # wrapper created here pick up the variables trained above. Calling
        # scope.reuse_variables() after the `with` block had exited left this
        # section in the root scope with its own untrained attention weights.
        with tf.variable_scope("Decoder", reuse = True) as scope:
            inference_decoder_cell, initial_state = self._wrap_decoder_cell(
                rnn_cell,
                attention_memory,
                enc_lengths,
                dec_start_state,
                inference_batch_size)

            if self.beam_search:
                inference_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell          = inference_decoder_cell,
                    embedding     = self.embedding_decoder,
                    # One start token per (untiled) batch entry.
                    start_tokens  = tf.zeros_like(self.encoder_inputs_length) + self.START_TOKEN_ID,
                    end_token     = tf.constant(self.END_TOKEN_ID, dtype = tf.int32),
                    initial_state = initial_state,
                    beam_width    = self.beam_width,
                    output_layer  = projection_layer)
            else:
                start_tokens = tf.tile(
                    tf.constant([self.START_TOKEN_ID], dtype = tf.int32),
                    [self.batch_size])
                inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    self.embedding_decoder,
                    start_tokens = start_tokens,
                    end_token = self.END_TOKEN_ID)
                inference_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell = inference_decoder_cell,
                    helper = inference_helper,
                    initial_state = initial_state,
                    output_layer = projection_layer)

            # Decode for at most twice the longest source sentence in the batch.
            self.decoder_outputs_inference, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = inference_decoder,
                maximum_iterations = tf.reduce_max(self.encoder_inputs_length) * 2,
                impute_finished = False,
                scope = scope)

        # Created outside the scope so the tensor keeps its top-level name
        # 'inference_predictions'.
        if self.beam_search:
            inference_ids = self.decoder_outputs_inference.predicted_ids
        else:
            inference_ids = self.decoder_outputs_inference.sample_id
        self.decoder_predictions_inference = tf.identity(inference_ids,
                                                         name = 'inference_predictions')

    def _init_optimizer(self):
        """Define the masked sequence loss, its summary and the Adam train op."""
        # 1.0 for real target positions, 0.0 for padding.
        loss_mask = tf.sequence_mask(
            tf.to_int32(self.decoder_targets_length),
            tf.reduce_max(self.decoder_targets_length),
            dtype = tf.float32)

        # The training logits are batch-major (dynamic_decode is called without
        # output_time_major), so time-major targets are transposed to match.
        targets = self.decoder_targets
        if self.time_major:
            targets = tf.transpose(targets, [1, 0])

        self.loss = tf.contrib.seq2seq.sequence_loss(self.decoder_logits_train,
                                                     targets,
                                                     loss_mask)
        tf.summary.scalar('loss', self.loss)
        self.summary_op = tf.summary.merge_all()

        optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE)
        gradients = optimizer.compute_gradients(self.loss)
        # Clip each gradient element-wise; variables without a gradient (not
        # reachable from the loss) are skipped.
        # TODO: compare against tf.clip_by_global_norm (max norm 1 or 5).
        clip = self.GRADIENT_CLIP_VALUE
        capped_gradients = [(tf.clip_by_value(grad, -clip, clip), var)
                            for grad, var in gradients if grad is not None]
        self.train_op = optimizer.apply_gradients(capped_gradients)

In [None]:
# Training driver: builds the model in a fresh graph, trains it, periodically
# prints sample predictions and saves checkpoints.
# `X_train_emb`, `y_train_emb`, `embedding_matrix` and `Touka` are not defined
# in the visible cells and are expected to come from elsewhere in the notebook.
tf.reset_default_graph()
tf.set_random_seed(1)

step = 0
batch_size = 16
max_batches = len(X_train_emb) // batch_size
batches_in_epoch = 40  # NOTE: despite the name, this is used as the number of epochs
epoch_be_saved = 10    # save a checkpoint every this many epochs
beam_width = 3
use_beam_search = True
verbose = True
log_every_steps = 25


def _ids_to_sentence(token_ids):
    """Turn a sequence of vocabulary ids into a space-separated string."""
    return ' '.join(vocab_list[token_id] for token_id in token_ids)


gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)

with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as session:
    model = Seq2SeqModel(
        encoder_num_units = 512,
        decoder_num_units = 512,
        embedding_size = 512,
        num_layers = 4,
        vocab_size = len(vocab_list),
        batch_size = batch_size,
        bidirectional = False,
        attention = True,
        pre_emb = False,
        time_major = False,
        beam_search = use_beam_search,
        beam_width = beam_width
    )
    print('model constructed.')

    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    loss_track = []
    print('start training.')
    # `sess` was undefined here; the session variable is called `session`.
    summary_writer = tf.summary.FileWriter('../../log', graph=session.graph)

    for _epoch in range(1, batches_in_epoch + 1):
        for _batch in range(max_batches + 1):
            X_emb, X, y_emb, y = Touka.input_generator(X_train_emb, X_train, y_train_emb, y_train, batch_size)
            feed_dict = model.make_train_inputs(x = X, y = y, e_m = embedding_matrix)

            _, batch_loss, train_samples, inf_samples, summary_str = session.run(
                [model.train_op,
                 model.loss,
                 model.decoder_predictions_train,
                 model.decoder_predictions_inference,
                 model.summary_op],
                feed_dict)
            # Use the running step counter as the summary step: `_epoch * _batch`
            # was 0 for the first batch of every epoch and not monotonic.
            summary_writer.add_summary(summary_str, step)

            if verbose and step % log_every_steps == 0:
                print('step {}'.format(step))
                # Report the loss already computed by the training step instead
                # of running a second forward pass.
                print('  minibatch loss: {}'.format(batch_loss))

                # Show the first example of the batch.
                print('train logits:')
                print(_ids_to_sentence(train_samples[0]))

                if use_beam_search:
                    # With beam search each time step holds one id per beam
                    # (indexed as word[b] before), so print every beam.
                    for b in range(beam_width):
                        print('inference logits:')
                        print(_ids_to_sentence(word[b] for word in inf_samples[0]))
                        print(' ')
                else:
                    print('inference logits:')
                    print(_ids_to_sentence(inf_samples[0]))
                    print(' ')
            step += 1
        print(_epoch, 'epoch finished')
        if _epoch % epoch_be_saved == 0:
            # NOTE(review): the checkpoint name says "layer_2" while the model is
            # built with num_layers = 4; left unchanged so existing paths keep working.
            saver.save(session, '../../model/' + 'enc_dec_layer_2_dim_512_pre_emb_None_beam_search.ckpt', global_step = step)
            print('model saved at step =', step)

    # Flush pending summaries to disk.
    summary_writer.close()
    print('finish training')