In [8]:
import numpy as np
import pandas as pd
import re
import random
import time
import nltk
import warnings
warnings.filterwarnings('ignore')

from nltk.corpus import stopwords

In [2]:
questions_train_path = 'data/cornell/questions_train.txt'
answers_train_path = 'data/cornell/answers_train.txt'
questions_test_path = 'data/cornell/questions_test.txt'
answers_test_path = 'data/cornell/answers_test.txt'

questions_train, answers_train, questions_test, answers_test = [], [], [], []

dataset_list = [questions_train, answers_train, questions_test, answers_test]
path_list = [questions_train_path, answers_train_path, questions_test_path, answers_test_path]

for dataset, path in zip(dataset_list, path_list):
    with open(path, 'r') as f:
        for line in f:
            dataset.append(line.strip())

In [3]:
# load GloVe
from gensim.models import KeyedVectors
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

# convert GloVe vectors into the word2vec
glove_file = 'glove.6B.50d.txt'
tmp_file = 'glove_50d_word2vec.txt'
glove2word2vec(glove_file, tmp_file)

embeddings = KeyedVectors.load_word2vec_format(tmp_file)

In [4]:
start_symbol = '<S>'
end_symbol = '</S>'
padding_symbol = '<PAD>'
unknown_symbol = '<UNK>'

special_symbols = [start_symbol, end_symbol, padding_symbol, unknown_symbol]

In [5]:
def clean_text(text):               
    text = re.sub(r"<[^>]*>", "", text)
    text = re.sub(r"[<>]", "", text)
                       
    return text

In [6]:
def preprocess_dataset(dataset):
    preprocessed_dataset = []
    for sentence in dataset:
        cleaned_sentence = clean_text(sentence)
        tokenized_sentence = nltk.word_tokenize(cleaned_sentence)
        final_sentence = [word.lower() for word in tokenized_sentence]
        preprocessed_dataset.append(final_sentence)
        
    return preprocessed_dataset

In [9]:
# train set
tokenized_questions_train = preprocess_dataset(questions_train)
tokenized_answers_train = preprocess_dataset(answers_train)

# test set
tokenized_questions_test = preprocess_dataset(questions_test)
tokenized_answers_test = preprocess_dataset(answers_test)

In [10]:
# check occurances of words
vocab_occ = {}
for dataset in [tokenized_questions_train, tokenized_answers_train]:
    for sentence in dataset:
        for word in sentence:
            vocab_occ[word] = vocab_occ.get(word, 0) + 1
vocab_occ = sorted(vocab_occ.items(), key=lambda kv: kv[1])[::-1]

In [11]:
def build_dict(tokenized_questions, tokenized_answers, special_symbols):
    word2id = {}
    id2word = []
    
    for special_symbol in special_symbols:
        id2word.append(special_symbol)
        word2id[special_symbol] = id2word.index(special_symbol)
        
    vocab_set = set(word for dataset in [tokenized_questions, tokenized_answers]
                    for sentence in dataset
                    for word in sentence
                    if word not in special_symbols)
     
    for word in vocab_set:
        id2word.append(word)
        word2id[word] = id2word.index(word)
        
    return word2id, id2word

In [12]:
word2id, id2word = build_dict(tokenized_questions_train, tokenized_answers_train, special_symbols)

In [13]:
def replace_with_unk(dataset, word2id):
    replaced_dataset = []
    for sentence in dataset:
        for i, word in enumerate(sentence):
            if word not in word2id.keys():
                sentence[i] = '<UNK>'
        replaced_dataset.append(sentence)
        
    return replaced_dataset

In [15]:
tokenized_questions_test = replace_with_unk(tokenized_questions_test, word2id)
tokenized_answers_test = replace_with_unk(tokenized_answers_test, word2id)

In [16]:
# check oov
oov = []
for dataset in [tokenized_questions_test, tokenized_answers_test]:
    for sentence in dataset:
        for word in sentence:
            if word not in word2id.keys():
                oov.append(word)
set(oov)

set()

In [17]:
# number of <UNK> in test set
count = 0
for dataset in [tokenized_questions_test, tokenized_answers_test]:
    for sentence in dataset:
        for word in sentence:
            if word == '<UNK>':
                count += 1
                
count 

1946

In [18]:
def build_embeddings(word2id, embeddings, dim=300):
    vocab_size = len(word2id)
    embedding_matrix = np.random.normal(0, 1, (vocab_size, dim))
    
    for word, i in word2id.items():
        try:
            embedding_vector = embeddings.get_vector(word)
            embedding_matrix[i] = embedding_vector
        except:
            continue
            
    return embedding_matrix

In [19]:
customized_embeddings = build_embeddings(word2id, embeddings, 50)

In [None]:
# save customized_embeddings
# path = 'word_embeddings_50d.txt'

# np.savetxt(path, customized_embeddings, delimiter=' ')

In [None]:
# save word2id dictionary
# path = 'word2id.txt'

# out = open(path, 'w')
# for word, i in word2id.items():
#     print(word, i, sep=' ', file=out)
# out.close()

In [20]:
def sentence_to_ids(tokenized_sentence, word2id, padded_len):
    num_pad = max(0, padded_len - 1 - len(tokenized_sentence))
    sent = tokenized_sentence[:padded_len-1] + ['</S>']
    sent = sent + ['<PAD>']*num_pad
    sent_ids = [word2id[word] for word in sent]
    
    sent_len = min(len(tokenized_sentence)+1, padded_len)
    
    return sent_ids, sent_len

In [21]:
def ids_to_sentence(ids, id2word):
    return [id2word[i] for i in ids]

In [22]:
def batch_to_ids(sentences, word2id, max_len):
    max_len_in_batch = min(max(len(s) for s in sentences) + 1, max_len)
    batch_ids, batch_ids_len = [], []
    for sentence in sentences:
        ids, ids_len = sentence_to_ids(sentence, word2id, max_len_in_batch)
        batch_ids.append(ids)
        batch_ids_len.append(ids_len)
        
    return batch_ids, batch_ids_len

In [23]:
def generate_batches(samples, batch_size=32):
    X, Y = [], []
    for i, (x, y) in enumerate(samples, 1):
        X.append(x)
        Y.append(y)
        if i % batch_size == 0:
            yield X, Y
            X, Y = [], []
    if X and Y:
        yield X, Y

## Model

In [24]:
import tensorflow as tf

In [25]:
class Seq2SeqModel(object):
    pass

In [26]:
def declare_placeholders(self):
    # placeholders for input and its actual lengths
    self.input_batch = tf.placeholder(shape=(None, None), dtype=tf.int32, name='input_batch')
    self.input_batch_lengths = tf.placeholder(shape=(None, ), dtype=tf.int32, name='input_batch_lengths')
    
    # placeholders for groundtruth and its actual lenghts
    self.ground_truth = tf.placeholder(shape=(None, None), dtype=tf.int32, name='ground_truth')
    self.ground_truth_lengths = tf.placeholder(shape=(None, ), dtype=tf.int32, name='ground_truth_lengths')
    
    # placeholders for dropout_rate and learning_rate
    self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])
    self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[])

In [27]:
Seq2SeqModel.__declare_placeholders = classmethod(declare_placeholders)

In [28]:
def create_embeddings(self, embeddings_matrix):
    self.embeddings = tf.get_variable(name='embeddings', 
                                     shape=embeddings_matrix.shape,
                                     initializer=tf.constant_initializer(embeddings_matrix),
                                     trainable=False)
    self.input_batch_embedded = tf.nn.embedding_lookup(self.embeddings, self.input_batch)

In [29]:
Seq2SeqModel.__create_embeddings = classmethod(create_embeddings)

In [30]:
def build_encoder(self, hidden_size):
    forward_cell = tf.nn.rnn_cell.DropoutWrapper(
        tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_size),
        input_keep_prob=self.dropout_ph,
        output_keep_prob=self.dropout_ph,
        state_keep_prob=self.dropout_ph)
    
    backward_cell = tf.nn.rnn_cell.DropoutWrapper(
        tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_size),
        input_keep_prob=self.dropout_ph,
        output_keep_prob=self.dropout_ph,
        state_keep_prob=self.dropout_ph)
    
    output, final_state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=forward_cell,
        cell_bw=backward_cell,
        inputs=self.input_batch_embedded,
        sequence_length=self.input_batch_lengths,
        dtype=tf.float32)
    
    self.encoder_output = tf.concat([output[0], output[1]], axis=2)
    
    encoder_final_state_c = tf.concat([final_state[0].c, final_state[1].c], axis=1)
    encoder_final_state_h = tf.concat([final_state[0].h, final_state[1].h], axis=1)
    self.encoder_final_state = tf.contrib.rnn.LSTMStateTuple(c=encoder_final_state_c, h=encoder_final_state_h)

In [31]:
Seq2SeqModel.__build_encoder = classmethod(build_encoder)

In [32]:
def build_decoder(self, hidden_size, vocab_size, max_iter, start_symbol_id, end_symbol_id):
    batch_size = tf.shape(self.input_batch)[0]
    start_tokens = tf.fill([batch_size], start_symbol_id)
    ground_truth_as_input = tf.concat([tf.expand_dims(start_tokens, 1), self.ground_truth], 1)
    
    # Use the embedding layer defined before to lookup embedings for ground_truth_as_input
    self.ground_truth_embedded = tf.nn.embedding_lookup(self.embeddings, ground_truth_as_input)
    
    # Create TrainingHelper for the train stage
    train_helper = tf.contrib.seq2seq.TrainingHelper(self.ground_truth_embedded,
                                                     self.ground_truth_lengths)
        
    # Create GreedyEmbeddingHelper for the inference stage
    infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(self.embeddings, start_tokens, end_symbol_id)
    
    def decode(helper, scope, reuse=None):
        """Creates decoder and return the results of the decoding with a given helper."""
        
        with tf.variable_scope(scope, reuse=reuse):
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                num_units=hidden_size, 
                memory=self.encoder_output,
                memory_sequence_length=self.input_batch_lengths)
            
            cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_size*2, reuse=reuse)
 
            attention_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell, attention_mechanism, attention_layer_size=hidden_size)
            
            decoder_cell = tf.contrib.rnn.OutputProjectionWrapper(
                attention_cell, vocab_size, reuse=reuse)
            
            decoder_initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=batch_size)
            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=decoder_cell,
                helper=helper,
                initial_state=decoder_initial_state)
            
            outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=decoder,
                maximum_iterations=max_iter,
                output_time_major=False,
                impute_finished=True)
            
            return outputs
    
    self.train_outputs = decode(train_helper, 'decode')
    self.infer_outputs = decode(infer_helper, 'decode', reuse=True)

In [33]:
Seq2SeqModel.__build_decoder = classmethod(build_decoder)

In [34]:
def compute_loss(self):
    """Computes sequence loss (masked cross-entopy loss with logits)."""
    
    weights = tf.cast(tf.sequence_mask(self.ground_truth_lengths), dtype=tf.float32)
    
    self.loss = tf.contrib.seq2seq.sequence_loss(self.train_outputs.rnn_output,
                                                 self.ground_truth,
                                                 weights)

In [35]:
Seq2SeqModel.__compute_loss = classmethod(compute_loss)

In [36]:
def perform_optimization(self):
    self.train_op = tf.contrib.layers.optimize_loss(loss=self.loss,
                                                    optimizer='Adam',
                                                    learning_rate=self.learning_rate_ph,
                                                    clip_gradients=1.0,
                                                    global_step=tf.train.get_global_step())

In [37]:
Seq2SeqModel.__perform_optimization = classmethod(perform_optimization)

In [38]:
def init_model(self, embeddings_matrix, hidden_size, vocab_size, max_iter, 
               start_symbol_id, end_symbol_id, padding_symbol_id):
    self.__declare_placeholders()
    self.__create_embeddings(embeddings_matrix)
    self.__build_encoder(hidden_size)
    self.__build_decoder(hidden_size, vocab_size, max_iter, start_symbol_id, end_symbol_id)
    
    self.__compute_loss()
    self.__perform_optimization()
    
    self.train_predictions = self.train_outputs.sample_id
    self.infer_predictions = self.infer_outputs.sample_id

In [39]:
Seq2SeqModel.__init__ = classmethod(init_model)

In [40]:
def train_on_batch(self, session, X, X_seq_len, Y, Y_seq_len, learning_rate, dropout_keep_probability):
    feed_dict = {
            self.input_batch: X,
            self.input_batch_lengths: X_seq_len,
            self.ground_truth: Y,
            self.ground_truth_lengths: Y_seq_len,
            self.learning_rate_ph: learning_rate,
            self.dropout_ph: dropout_keep_probability
        }
    pred, loss, _ = session.run([
            self.train_predictions,
            self.loss,
            self.train_op], feed_dict=feed_dict)
    return pred, loss

In [41]:
Seq2SeqModel.train_on_batch = classmethod(train_on_batch)

In [42]:
def predict_for_batch(self, session, X, X_seq_len):
    feed_dict = {self.input_batch: X, self.input_batch_lengths: X_seq_len}
    pred = session.run([
            self.infer_predictions
        ], feed_dict=feed_dict)[0]
    return pred

def predict_for_batch_with_loss(self, session, X, X_seq_len, Y, Y_seq_len):
    feed_dict = {self.input_batch: X, 
                 self.input_batch_lengths: X_seq_len,
                 self.ground_truth: Y,
                 self.ground_truth_lengths: Y_seq_len}
    pred, loss = session.run([
            self.infer_predictions,
            self.loss,
        ], feed_dict=feed_dict)
    return pred, loss

In [43]:
Seq2SeqModel.predict_for_batch = classmethod(predict_for_batch)
Seq2SeqModel.predict_for_batch_with_loss = classmethod(predict_for_batch_with_loss)

In [52]:
tf.reset_default_graph()

model = Seq2SeqModel(
    embeddings_matrix=customized_embeddings,
    hidden_size=128,
    vocab_size=customized_embeddings.shape[0],
    max_iter=10, 
    start_symbol_id=word2id['<S>'],
    end_symbol_id=word2id['</S>'],
    padding_symbol_id=word2id['<PAD>'])

batch_size = 32
n_epochs = 10
learning_rate = 0.001
dropout_keep_probability = 0.5
max_len = 10
learning_rate_decay = 0.75
min_learning_rate = 0.0001

n_step = int(len(questions_train)/batch_size)

In [None]:
session = tf.Session()
session.run(tf.global_variables_initializer())

all_model_predictions = []
all_ground_truth = []

display_iter = 200
checkpoint = "model/50d/best_model.ckpt"
stop_early = 0
stop = 5
# validation_check = ((len(tokenized_questions_train))//batch_size//2)-1
summary_test_loss = []

train_set = list(zip(tokenized_questions_train, tokenized_answers_train))
test_set = list(zip(tokenized_questions_test, tokenized_answers_test))

for epoch in range(n_epochs):
    random.shuffle(train_set)
    random.shuffle(test_set)
       
    print('-'*30)
    print('Train: epoch', epoch + 1)
    total_train_time = 0
    for n_iter, (X_batch, Y_batch) in enumerate(generate_batches(train_set, batch_size)):
        start_time = time.time()      
        X_ids, X_sent_lens = batch_to_ids(X_batch, word2id, max_len)
        Y_ids, Y_sent_lens = batch_to_ids(Y_batch, word2id, max_len)
        
        predictions, loss = model.train_on_batch(
            session,
            X_ids,
            X_sent_lens,
            Y_ids,
            Y_sent_lens,
            learning_rate,
            dropout_keep_probability)
        
        end_time = time.time()
        batch_time = end_time - start_time
        total_train_time += batch_time
        if n_iter % display_iter == 0:
            print("Epoch: {:>3}/{}, Step: {:>4}/{}, Loss: {:>6.3f}, Seconds: {:>4.2f}"
                  .format(epoch+1, n_epochs, n_iter+1, n_step, loss, total_train_time))
#             print("Epoch: [%d/%d], step: [%d/%d], loss: %f" % (epoch+1, n_epochs, n_iter+1, n_step, loss))
    
    start_time = time.time()
    epoch_test_loss = []
    for n_iter, (X_batch, Y_batch) in enumerate(generate_batches(test_set, batch_size)):        
        X, X_sent_lens = batch_to_ids(X_batch, word2id, max_len)
        Y, Y_sent_lens = batch_to_ids(Y_batch, word2id, max_len)

        predictions, loss = model.predict_for_batch_with_loss(
            session,
            X,
            X_sent_lens,
            Y,
            Y_sent_lens)
        
        epoch_test_loss.append(loss)
        
    end_time = time.time()
    batch_time = end_time - start_time
    print('')
    print('Test: epoch', epoch+1, 'loss', np.mean(epoch_test_loss), 'Second:', batch_time)   
    for x, y, p in list(zip(X, Y, predictions))[:3]:
        print('X:', ' '.join(ids_to_sentence(x, id2word)))
        print('Y:', ' '.join(ids_to_sentence(y, id2word)))
        print('O:', ' '.join(ids_to_sentence(p, id2word)))
        print('')
    
    # reduce learning rate
    learning_rate *= learning_rate_decay
    learning_rate = max(learning_rate, min_learning_rate)
    
    summary_test_loss.append(np.mean(epoch_test_loss))
    if np.mean(epoch_test_loss) <= min(summary_test_loss):
        print('New Record!')
        print('')
        stop_early = 0
        saver = tf.train.Saver()
        saver.save(session, checkpoint)
    else:
        print('No Improvement')
        stop_early += 1
        if stop_early == stop:
            break
            
print('\n...training finished.')

------------------------------
Train: epoch 1
Epoch:   1/10, Step:    1/6232, Loss: 10.956, Seconds: 2.41
Epoch:   1/10, Step:  201/6232, Loss:  5.178, Seconds: 44.51
Epoch:   1/10, Step:  401/6232, Loss:  5.197, Seconds: 86.58
Epoch:   1/10, Step:  601/6232, Loss:  4.904, Seconds: 128.75
Epoch:   1/10, Step:  801/6232, Loss:  5.154, Seconds: 170.86
Epoch:   1/10, Step: 1001/6232, Loss:  4.590, Seconds: 212.93
Epoch:   1/10, Step: 1201/6232, Loss:  4.924, Seconds: 255.05
Epoch:   1/10, Step: 1401/6232, Loss:  4.818, Seconds: 297.14
Epoch:   1/10, Step: 1601/6232, Loss:  4.789, Seconds: 339.40
Epoch:   1/10, Step: 1801/6232, Loss:  4.689, Seconds: 381.54
Epoch:   1/10, Step: 2001/6232, Loss:  4.490, Seconds: 423.69
Epoch:   1/10, Step: 2201/6232, Loss:  4.254, Seconds: 465.80
Epoch:   1/10, Step: 2401/6232, Loss:  4.188, Seconds: 507.91
Epoch:   1/10, Step: 2601/6232, Loss:  4.323, Seconds: 550.02
Epoch:   1/10, Step: 2801/6232, Loss:  4.656, Seconds: 592.25
Epoch:   1/10, Step: 3001/62

Epoch:   4/10, Step: 2001/6232, Loss:  3.623, Seconds: 416.15
Epoch:   4/10, Step: 2201/6232, Loss:  3.651, Seconds: 457.73
Epoch:   4/10, Step: 2401/6232, Loss:  3.455, Seconds: 499.31
Epoch:   4/10, Step: 2601/6232, Loss:  3.724, Seconds: 540.91
Epoch:   4/10, Step: 2801/6232, Loss:  3.941, Seconds: 582.49
Epoch:   4/10, Step: 3001/6232, Loss:  3.731, Seconds: 624.03
Epoch:   4/10, Step: 3201/6232, Loss:  3.774, Seconds: 665.60
Epoch:   4/10, Step: 3401/6232, Loss:  3.993, Seconds: 707.18
Epoch:   4/10, Step: 3601/6232, Loss:  4.091, Seconds: 748.74
Epoch:   4/10, Step: 3801/6232, Loss:  4.154, Seconds: 790.36
Epoch:   4/10, Step: 4001/6232, Loss:  3.635, Seconds: 831.95
Epoch:   4/10, Step: 4201/6232, Loss:  3.897, Seconds: 873.51
Epoch:   4/10, Step: 4401/6232, Loss:  3.818, Seconds: 915.09
