In [1]:
import sys
import re
import pickle
import numpy as np
import random
import os

In [2]:
_PAD = b"_PAD"
_GO = b"_GO"
_EOS = b"_EOS"
_UNK = b"_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

_WORD_SPLIT = re.compile("([.,!?\"':;)(])")
_DIGIT_RE = re.compile(R"\d")

In [3]:
def basic_tokenizer(sentence):
    """ Split sentence into list of tokens """
    words = []
    for space_separated_item in sentence.strip().split():
        words.extend(_WORD_SPLIT.split(space_separated_item))
    return [w for w in words if w] # if w removes the ""

def get_vocab(tokenized, max_vocab_size):
    """
    Get vocab_list, vocab_dict and rev_vocab_dict given the
    tokenized sentences.
    """
    # Replace word count
    vocab = {}
    for sentence in tokenized:
        for word in sentence:
            if word in vocab:
                vocab[word] += 1
            else:
                vocab[word] = 1
    vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
    if len(vocab_list) > max_vocab_size:
        vocab_list = vocab_list[:max_vocab_size]

    # Get vocab dict (word -> token) and rev dict (token -> word)
    vocab_dict = dict([(x,y) for (y,x) in enumerate(vocab_list)])
    rev_vocab_dict = {v: k for k, v in vocab_dict.items()}

    return vocab_list, vocab_dict, rev_vocab_dict

def sentence_to_token_ids(sentence, vocab_dict, target_lang,
    normalize_digits=True):
    """
    Convert a single sentence of words to token ids. If it is the target
    language, we will append an EOS token to the end.
    """
    if not normalize_digits:
        # replace words not in vocab_dict with UNK_ID
        tokens = [vocab_dict.get(w, UNK_ID) for w in sentence]
    else:
        tokens = [vocab_dict.get(_DIGIT_RE.sub(b"0", w), UNK_ID)
            for w in sentence]

    # Append EOS token if target langauge sentence

    return tokens


def data_to_token_ids(tokenized, vocab_dict, max_seq_len, normalize_digits=True):
    """
    Convert tokens into ids used vocab_dict and normalize all digits
    to 0.
    """
    data_as_tokens = []
    seq_lens = []
    #max_len = max(len(sentence) for sentence in tokenized) + 1 # +1 for EOS
    max_len=max_seq_len+1
    for sentence in tokenized:
        sentence=sentence[:max_seq_len]
        token_ids = sentence_to_token_ids(sentence, vocab_dict, normalize_digits)
        # Padding
        data_as_tokens.append(token_ids + [PAD_ID]*(max_len - len(token_ids)))
        # Store original sequence length
        seq_lens.append(len(token_ids))

    return np.array(data_as_tokens), np.array(seq_lens)

def process_data(datafile, max_vocab_size,max_seq_len):
    """
    Read the sentences from our datafiles.
    """
    with open(datafile, 'rb') as f:
        sentences = pickle.load(f)

    # Split into tokens
    tokenized = []
    for i in range(len(sentences)):
        tokenized.append(basic_tokenizer(sentences[i]))

    # Get vocab information
    vocab_list, vocab_dict, rev_vocab_dict = get_vocab(tokenized,
        max_vocab_size)

    # Convert data to token ids
    data_as_tokens, seq_lens = data_to_token_ids(tokenized, vocab_dict, max_seq_len,normalize_digits=True)

    return data_as_tokens, seq_lens, vocab_dict, rev_vocab_dict

In [4]:
tar_token_ids, tar_seq_lens, tar_vocab_dict, tar_rev_vocab_dict = \
        process_data('test.p', max_vocab_size=5000,max_seq_len=20)

In [5]:
src_token_ids = np.zeros(tar_token_ids.shape,dtype=np.int)
src_seq_lens=tar_seq_lens.copy()

for x in range(tar_token_ids.shape[0]):
    for y in range(0, tar_token_ids.shape[1]):
        if tar_token_ids[x, y]==6 or tar_token_ids[x, y]==11:
            src_token_ids[x,y]=6 if random.random()<0.5 else 11
        else:
            src_token_ids[x,y]=tar_token_ids[x,y]
    tar_token_ids[x,tar_seq_lens[x]]=EOS_ID
    tar_seq_lens[x]+=1

src_vocab_dict, src_rev_vocab_dict=tar_vocab_dict, tar_rev_vocab_dict

In [6]:
def split_data(en_token_ids, sp_token_ids,
    en_seq_lens, sp_seq_len, train_ratio=0.8):
    """
    Split the into train and validation sets.
    """

    decoder_inputs = []
    targets = []
    # Add go token to decoder inputs and create targets
    for sentence in sp_token_ids:
        decoder_inputs.append(np.array([GO_ID] + list(sentence)))
        targets.append(np.array(([GO_ID] + list(sentence))[1:] + [0]))

    sp_token_ids = np.array(decoder_inputs)
    targets = np.array(targets)

    # Splitting index
    last_train_index = int(0.8*len(en_token_ids))

    train_encoder_inputs = en_token_ids[:last_train_index]
    train_decoder_inputs = sp_token_ids[:last_train_index]
    train_targets = targets[:last_train_index]
    train_en_seq_lens = en_seq_lens[:last_train_index]
    train_sp_seq_len = sp_seq_len[:last_train_index]

    valid_encoder_inputs = en_token_ids[last_train_index:]
    valid_decoder_inputs = sp_token_ids[last_train_index:]
    valid_targets = targets[last_train_index:]
    valid_en_seq_lens = en_seq_lens[last_train_index:]
    valid_sp_seq_len = sp_seq_len[last_train_index:]

    print("%i training samples and %i validations samples" % (
        len(train_encoder_inputs), len(valid_encoder_inputs)))

    return train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_en_seq_lens, train_sp_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_en_seq_lens, valid_sp_seq_len

def generate_epoch(encoder_inputs, decoder_inputs, targets, en_seq_lens, sp_seq_lens,
    num_epochs, batch_size):

    for epoch_num in range(num_epochs):
        yield generate_batch(encoder_inputs, decoder_inputs, targets,
            en_seq_lens, sp_seq_lens, batch_size)

def generate_batch(encoder_inputs, decoder_inputs, targets,
    en_seq_lens, sp_seq_lens, batch_size):

    data_size = len(encoder_inputs)

    num_batches = (data_size // batch_size)
    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, data_size)

        yield encoder_inputs[start_index:end_index], \
            decoder_inputs[start_index:end_index], \
            targets[start_index:end_index], \
            en_seq_lens[start_index:end_index], \
            sp_seq_lens[start_index:end_index]

In [7]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

def rnn_cell(FLAGS, dropout, scope):

    with tf.variable_scope(scope):
        # Get the cell type
        if FLAGS.rnn_unit == 'rnn':
            rnn_cell_type = tf.nn.rnn_cell.BasicRNNCell
        elif FLAGS.rnn_unit == 'gru':
            rnn_cell_type = tf.nn.rnn_cell.GRUCell
        elif FLAGS.rnn_unit == 'lstm':
            rnn_cell_type = tf.nn.rnn_cell.BasicLSTMCell
        else:
            raise Exception("Choose a valid RNN unit type.")

        # Single cell
        single_cell = rnn_cell_type(FLAGS.num_hidden_units)

        # Dropout
        single_cell = tf.nn.rnn_cell.DropoutWrapper(single_cell,
            output_keep_prob=1-dropout)

        # Each state as one cell
        stacked_cell = tf.nn.rnn_cell.MultiRNNCell(
            [single_cell] * FLAGS.num_layers)

    return stacked_cell

def rnn_inputs(FLAGS, input_data, vocab_size, scope):

    with tf.variable_scope(scope, reuse=True):
        W_input = tf.get_variable("W_input",
            [vocab_size, FLAGS.num_hidden_units])

    # embeddings will be shape [input_data dimensions, num_hidden units]
    embeddings = tf.nn.embedding_lookup(W_input, input_data)
    return embeddings

def rnn_softmax(FLAGS, outputs, scope):
    with tf.variable_scope(scope, reuse=True):
        W_softmax = tf.get_variable("W_softmax",
            [FLAGS.num_hidden_units, FLAGS.tar_vocab_size])
        b_softmax = tf.get_variable("b_softmax", [FLAGS.tar_vocab_size])

    logits = tf.matmul(outputs, W_softmax) + b_softmax
    return logits

class model(object):

    def __init__(self, FLAGS):

        # Placeholders
        self.encoder_inputs = tf.placeholder(tf.int32, shape=[None, None],
            name='encoder_inputs')
        self.decoder_inputs = tf.placeholder(tf.int32, shape=[None, None],
            name='decoder_inputs')
        self.inference_inputs = tf.placeholder(tf.int32, shape=[None, None],
            name='inference_inputs')
        self.targets = tf.placeholder(tf.int32, shape=[None, None],
            name='targets')
        self.src_seq_lens = tf.placeholder(tf.int32, shape=[None, ],
            name="src_seq_lens")
        self.tar_seq_lens = tf.placeholder(tf.int32, shape=[None, ],
            name="tar_seq_lens")
        self.dropout = tf.placeholder(tf.float32)

        with tf.variable_scope('encoder') as scope:

            # Encoder RNN cell
            self.encoder_stacked_cell = rnn_cell(FLAGS, self.dropout,
                scope=scope)

            # Embed encoder inputs
            W_input = tf.get_variable("W_input",
                [FLAGS.src_vocab_size, FLAGS.num_hidden_units])
            self.embedded_encoder_inputs = rnn_inputs(FLAGS,
                self.encoder_inputs, FLAGS.src_vocab_size, scope=scope)
            #initial_state = encoder_stacked_cell.zero_state(FLAGS.batch_size, tf.float32)

            # Outputs from encoder RNN
            self.all_encoder_outputs, self.encoder_state = tf.nn.dynamic_rnn(
                cell=self.encoder_stacked_cell,
                inputs=self.embedded_encoder_inputs,
                sequence_length=self.src_seq_lens, time_major=False,
                dtype=tf.float32)

        '''
        # Convert to list of tensors
        self.encoder_outputs = tf.unpack(self.all_outputs, axis=0) # annotations
        self.encoder_state = tf.unpack(self.state, axis=0)

        # First calculate a concatenation of encoder outputs to put attention on.
        self.top_states = [tf.reshape(e, [-1, 1,
            self.stacked_cell.output_size]) for e in self.encoder_outputs]
        self.attention_states = tf.concat(1, self.top_states)
        '''

        '''
        # Decoder (use last relevant state from encoder as initial state)
        self.initial_decoder_state = self.encoder_state[0]

        '''

        with tf.variable_scope('decoder') as scope:

            # Initial state is last relevant state from encoder
            self.decoder_initial_state = self.encoder_state

            # Decoder RNN cell
            self.decoder_stacked_cell = rnn_cell(FLAGS, self.dropout,
                scope=scope)

            # Embed decoder RNN inputs
            W_input = tf.get_variable("W_input",
                [FLAGS.tar_vocab_size, FLAGS.num_hidden_units])
            self.embedded_decoder_inputs = rnn_inputs(FLAGS, self.decoder_inputs,
                FLAGS.tar_vocab_size, scope=scope)

            # Outputs from encoder RNN
            self.all_decoder_outputs, self.decoder_state = tf.nn.dynamic_rnn(
                cell=self.decoder_stacked_cell,
                inputs=self.embedded_decoder_inputs,
                sequence_length=self.tar_seq_lens, time_major=False,
                initial_state=self.decoder_initial_state)

            # Softmax on decoder RNN outputs
            W_softmax = tf.get_variable("W_softmax",
                [FLAGS.num_hidden_units, FLAGS.tar_vocab_size])
            b_softmax = tf.get_variable("b_softmax", [FLAGS.tar_vocab_size])

            # Logits
            self.decoder_outputs_flat = tf.reshape(self.all_decoder_outputs,
                [-1, FLAGS.num_hidden_units])
            self.logits_flat = rnn_softmax(FLAGS, self.decoder_outputs_flat,
                scope=scope)

            # Loss with masking
            targets_flat = tf.reshape(self.targets, [-1])
            losses_flat = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits_flat, labels=targets_flat)
            mask = tf.sign(tf.to_float(targets_flat))
            masked_losses = mask * losses_flat
            masked_losses = tf.reshape(masked_losses,  tf.shape(self.targets))
            self.loss = tf.reduce_mean(
                tf.reduce_sum(masked_losses, reduction_indices=1))
            
        with tf.variable_scope('decoder',reuse=True) as scope:

            # Initial state is last relevant state from encoder
            self.inference_initial_state = self.encoder_state

            self.embedded_inference_inputs = rnn_inputs(FLAGS, self.inference_inputs,
                FLAGS.tar_vocab_size, scope=scope)

            # Outputs from encoder RNN
            self.all_inference_outputs, self.inference_state = tf.nn.dynamic_rnn(
                cell=self.decoder_stacked_cell,
                inputs=self.embedded_inference_inputs,
                sequence_length=self.tar_seq_lens, time_major=False,
                initial_state=self.inference_initial_state)

            # Logits
            self.inference_outputs_flat = tf.reshape(self.all_inference_outputs,
                [-1, FLAGS.num_hidden_units])
            self.inference_logits_flat = rnn_softmax(FLAGS, self.inference_outputs_flat,
                scope=scope)

        # Optimization
        self.lr = tf.Variable(0.0, trainable=False)
        trainable_vars = tf.trainable_variables()
        # clip the gradient to avoid vanishing or blowing up gradients
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.loss, trainable_vars), FLAGS.max_gradient_norm)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_optimizer = optimizer.apply_gradients(
            zip(grads, trainable_vars))


    def step(self, sess, FLAGS, batch_encoder_inputs, batch_decoder_inputs,
        batch_targets, batch_en_seq_lens, batch_sp_seq_lens, dropout):

        input_feed = {self.encoder_inputs: batch_encoder_inputs,
            self.decoder_inputs: batch_decoder_inputs,
            self.targets: batch_targets,
            self.src_seq_lens: batch_en_seq_lens,
            self.tar_seq_lens: batch_sp_seq_lens,
            self.dropout: dropout}
        output_feed = [self.loss, self.train_optimizer]
        outputs = sess.run(output_feed, input_feed)

        return outputs[0], outputs[1]


In [8]:
class parameters(object):

    def __init__(self):
        """
        Holds all the parameters for NMT.
        """
        self.ckpt_dir = 'checkpoints/'
        
        self.max_src_vocab_size = 5000
        self.max_tar_vocab_size = 5000

        self.num_epochs = 100
        self.batch_size = 4

        self.rnn_unit = 'gru'
        self.num_hidden_units = 500
        self.num_layers = 1
        self.dropout = 0.5
        self.learning_rate = 1e-3
        self.learning_rate_decay_factor = 0.99
        self.max_gradient_norm = 5.0

def create_model(sess, FLAGS):

    tf_model = model(FLAGS)
    print("Created a new model")
    sess.run(tf.initialize_all_variables())

    return tf_model

def train(FLAGS):


    # Split into train and validation sets
    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_src_seq_lens, train_tar_seq_lens, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_src_seq_lens, valid_tar_seq_len = \
        split_data(src_token_ids, tar_token_ids, src_seq_lens, tar_seq_lens,
            train_ratio=0.8)

    # Update parameters
    FLAGS.src_vocab_size = len(src_vocab_dict)
    FLAGS.tar_vocab_size = len(tar_vocab_dict)

    # Start session
    with tf.Session() as sess:

        # Create new model or load old one
        model = create_model(sess, FLAGS)

        # Training begins
        losses = []
        for epoch_num, epoch in enumerate(generate_epoch(train_encoder_inputs,
            train_decoder_inputs, train_targets,
            train_src_seq_lens, train_tar_seq_lens,
            FLAGS.num_epochs, FLAGS.batch_size)):

            print("EPOCH: %i" % (epoch_num))
            # Decay learning rate
            sess.run(tf.assign(model.lr, FLAGS.learning_rate * \
                (FLAGS.learning_rate_decay_factor ** epoch_num)))

            batch_loss = []

            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                batch_targets, batch_src_seq_lens,
                batch_tar_seq_lens) in enumerate(epoch):

                loss, _ = model.step(sess, FLAGS,
                    batch_encoder_inputs, batch_decoder_inputs, batch_targets,
                    batch_src_seq_lens, batch_tar_seq_lens,
                    FLAGS.dropout)

                batch_loss.append(loss)
            
            print(np.mean(batch_loss))
            losses.append(np.mean(batch_loss))

        # Save checkpoint.
        if not os.path.isdir(FLAGS.ckpt_dir):
            os.makedirs(FLAGS.ckpt_dir)
        checkpoint_path = os.path.join(FLAGS.ckpt_dir, "model.ckpt")
        print("Saving the model.")
        model.saver.save(sess, checkpoint_path,
                         global_step=model.global_step)
        
        plt.plot(losses, label='loss')
        plt.legend()
        plt.show()

In [9]:
FLAGS = parameters()
train(FLAGS)

16 training samples and 4 validations samples
Created a new model
Instructions for updating:
Use `tf.global_variables_initializer` instead.
EPOCH: 0
86.1845
EPOCH: 1
82.9684
EPOCH: 2
80.1055
EPOCH: 3
74.1841
EPOCH: 4
71.2967
EPOCH: 5
68.2202
EPOCH: 6
65.6979
EPOCH: 7
62.6165
EPOCH: 8
59.015
EPOCH: 9
56.5337
EPOCH: 10
53.9495
EPOCH: 11
50.341
EPOCH: 12
48.6295
EPOCH: 13
45.6077
EPOCH: 14
43.7991
EPOCH: 15
39.0413
EPOCH: 16
38.1672
EPOCH: 17
35.7174
EPOCH: 18
34.9555
EPOCH: 19
31.4957
EPOCH: 20
31.906
EPOCH: 21
29.4326
EPOCH: 22
27.6652
EPOCH: 23
28.7124
EPOCH: 24
25.8432
EPOCH: 25
23.9865
EPOCH: 26
23.8063
EPOCH: 27
22.7996
EPOCH: 28
20.6289
EPOCH: 29
20.1484
EPOCH: 30
18.3194
EPOCH: 31
18.0894
EPOCH: 32
18.2198
EPOCH: 33
16.9801
EPOCH: 34
15.9941
EPOCH: 35
14.9424
EPOCH: 36
15.3064
EPOCH: 37
14.6672
EPOCH: 38
12.8958
EPOCH: 39
13.1318
EPOCH: 40
12.3806
EPOCH: 41
10.8425
EPOCH: 42
9.69589
EPOCH: 43
10.0742
EPOCH: 44
9.06535
EPOCH: 45
9.3049
EPOCH: 46
7.94342
EPOCH: 47
8.11751
EPOCH: 48


NameError: name 'os' is not defined

In [None]:
def inference(FLAGS):

    # Change FLAGS parameters
    FLAGS.batch_size = 1
    FLAGS.src_vocab_size = len(src_vocab_dict)
    FLAGS.tar_vocab_size = len(tar_vocab_dict)
    FLAGS.tar_max_len = max(src_seq_lens) + 1 # GO token

    # Process sample sentence
    inference_sentence = ["I usually eat the very large salad."]
    # Split into tokens
    tokenized = []
    for i in range(len(inference_sentence)):
        tokenized.append(basic_tokenizer(inference_sentence[i]))
    # Convert data to token ids
    data_as_tokens, sample_src_seq_lens = data_to_token_ids(
        tokenized, src_vocab_dict,max_seq_len=30 ,normalize_digits=True)

    # make dummy_sp_inputs
    dummy_tar_inputs = np.array([[GO_ID]*FLAGS.tar_max_len])
    sample_tar_seq_lens = np.array([len(dummy_tar_inputs)])

    print(data_as_tokens)
    print(sample_src_seq_lens)
    print(dummy_tar_inputs)
    print(sample_tar_seq_lens)

    with tf.Session() as sess:

        # Load trained model
        input_feed = {model.encoder_inputs: data_as_tokens,
            model.inference_inputs: dummy_tar_inputs,
            model.src_seq_lens: batch_src_seq_lens,
            model.tar_seq_lens: batch_tar_seq_lens,
            model.dropout: 1}
        output_feed = [model.inference_logits_flat]
        outputs = sess.run(output_feed, input_feed)
        print(outputs)