Think of this as a scratchpad for writing our model.

We will check our model by training it and printing the loss for 100 steps. In the next notebook, we will see a complete model, with validation run at regular intervals.

In [1]:
import os
import tensorflow as tf
from tensorflow.python.ops import lookup_ops
from collections import namedtuple

#Folder that holds the training, validation and vocab files. Point this at your own copy
DATA_DIR = 'ubuntu-data'

#Training split: context, response and flag files
train_context_file, train_response_file, train_flag_file = (
    os.path.join(DATA_DIR, name)
    for name in ('train.context', 'train.response', 'train.flag')
)

#Validation split: context, response and flag files
valid_context_file, valid_response_file, valid_flag_file = (
    os.path.join(DATA_DIR, name)
    for name in ('valid.context', 'valid.response', 'valid.flag')
)

#Vocabulary file
vocab_file = os.path.join(DATA_DIR, 'vocab.txt')

This should not seem new. We saw this in the [dataset_ops notebook](https://github.com/vineetm/tensorflow-notes/blob/master/siamese/notebooks/dataset_ops.ipynb)

In [2]:
#One batch from the input pipeline. Next to the five data fields there is an
#extra field, `init` (listed first): the initializer op of the iterator
class DataIterator(namedtuple('DataIterator', ['init',
                                               'context', 'len_context',
                                               'response', 'len_response',
                                               'flag'])):
    """Named tuple: (init, context, len_context, response, len_response, flag)."""

def text_to_word_indexes(text_file, vocab_table):
    """Build a dataset yielding one sequence of word indexes per line of `text_file`.

    Every line is split into words with `tf.string_split`; the words are then
    turned into indexes through `vocab_table.lookup`.
    """
    def split_words(line):
        #string_split is given a list of strings, so wrap the single line in one
        return tf.string_split([line]).values

    def words_to_indexes(tokens):
        return vocab_table.lookup(tokens)

    lines = tf.data.TextLineDataset(text_file)
    return lines.map(split_words).map(words_to_indexes)

def create_dataset_iterator(vocab_table, context_file, response_file, flag_file, batch_size,
                            max_context_len=160):
    """Build the input pipeline for one data split and return its batch tensors.

    Args:
        vocab_table: word -> index lookup table. The table passed in by the
            caller is the one used; no new table is created here.
        context_file: text file with one context per line.
        response_file: text file with one response per line.
        flag_file: text file with one numeric flag per line.
        batch_size: number of examples per batch.
        max_context_len: only the last `max_context_len` tokens of each
            context are kept (default 160).

    Returns:
        A DataIterator whose `init` field is the iterator initializer op and
        whose remaining fields are the tensors of the next batch: context,
        len_context, response, len_response and flag.
    """
    #Context: sentence -> word indexes, truncated to the last max_context_len tokens
    context_dataset = text_to_word_indexes(context_file, vocab_table)
    context_dataset = context_dataset.map(lambda words: words[-max_context_len:])

    #Response: sentence -> word indexes
    response_dataset = text_to_word_indexes(response_file, vocab_table)

    #Flag: each line is a string holding a number, convert it to a float
    flag_dataset = tf.data.TextLineDataset(flag_file)
    flag_dataset = flag_dataset.map(lambda sentence: tf.string_to_number(sentence))

    #Join the three datasets line by line
    dataset = tf.data.Dataset.zip((context_dataset, response_dataset, flag_dataset))

    #Add the length of context and response next to the sequences themselves
    dataset = dataset.map(lambda context, response, flag: (context, tf.size(context), response, tf.size(response), flag))

    #Sequences have unknown length ([None]) and get padded per batch; lengths and flag are scalars
    padded_shapes = (tf.TensorShape([None]), tf.TensorShape([]),
                     tf.TensorShape([None]), tf.TensorShape([]),
                     tf.TensorShape([]))
    dataset = dataset.padded_batch(batch_size, padded_shapes=padded_shapes)
    iterator = dataset.make_initializable_iterator()

    context, len_context, response, len_response, flag = iterator.get_next()

    return DataIterator(iterator.initializer, context, len_context, response, len_response, flag)

In [3]:
#Word -> index lookup table. Words missing from the vocab file map to index 0 `UNK`
vocab_table = lookup_ops.index_table_from_file(vocab_file, default_value=0)

#Input pipeline over the training files, in batches of 16
iterator = create_dataset_iterator(
    vocab_table,
    context_file=train_context_file,
    response_file=train_response_file,
    flag_file=train_flag_file,
    batch_size=16
)

In [4]:
#V: size of `vocab.txt`
#d: width used for the word embeddings, the LSTM cell and the matrix M
V, d = 30430, 128

First, we create a word embeddings Tensor. Think of it as index -> vector

Next, we convert word indexes in *context* to vectors

In [5]:
# Embedding matrix of shape V x d: row i is the vector for word index i
W = tf.get_variable('word_embeddings', shape=(V, d))

# Look up the vector of every context word. Shape: batch_size x T x d
context = tf.nn.embedding_lookup(params=W, ids=iterator.context)

Now, we convert context word vectors to a single fixed length vector using RNN.

* We first define the RNN cell. We are using an LSTM cell
* Next we use dynamic_rnn. We ignore the intermediate outputs and are only interested in the final state
* The final state thus gives us `c`

In [6]:
# A single LSTM cell with d units
rnn_cell = tf.contrib.rnn.BasicLSTMCell(num_units=d)

with tf.variable_scope('rnn'):
    # Per-step outputs are discarded, only the final state is kept
    _, state_context = tf.nn.dynamic_rnn(
        rnn_cell,
        context,
        sequence_length=iterator.len_context,
        dtype=tf.float32
    )

# The `h` part of the final LSTM state is the fixed-length context vector
c = state_context.h

We apply the same operation to the response to get `r`

* Here, we are re-using RNN weights.

In [7]:
# Look up the vector of every response word. Shape: batch_size x T x d
response = tf.nn.embedding_lookup(params=W, ids=iterator.response)

# Same scope name with reuse=True, so the variables created for the context RNN are shared
with tf.variable_scope('rnn', reuse=True):
    _, state_response = tf.nn.dynamic_rnn(
        rnn_cell,
        response,
        sequence_length=iterator.len_response,
        dtype=tf.float32
    )

# The `h` part of the final LSTM state is the fixed-length response vector
r = state_response.h

* Now, we define the correlation weights `M`. Based on the recommendation in the [original paper](http://www.cs.toronto.edu/~lcharlin/papers/ubuntu_dialogue_dd17.pdf), we initialize this as an identity matrix!
* We further compute logits

In [8]:
# Trainable d x d matrix, starting out as the identity
M = tf.Variable(tf.eye(d), name='M')

# One score per example: multiply c element-wise with (r M), then sum over axis 1
logits = tf.reduce_sum(
    tf.multiply(c, tf.matmul(r, M)),
    axis=1
)

Next, we compute loss with respect to `flag`

In [9]:
# Sigmoid cross-entropy of each logit against its flag
batch_loss = tf.nn.sigmoid_cross_entropy_with_logits(
    logits=logits,
    labels=iterator.flag
)

# Mean over the batch is the training loss
loss = tf.reduce_mean(batch_loss)

In [10]:
# Everything that will be trained
params = tf.trainable_variables()
print('Trainable params: %s'%params)

# Gradients of the loss, clipped to a global norm of 5.0
gradients = tf.gradients(loss, params)
clipped_gradients, grad_norm = tf.clip_by_global_norm(gradients, clip_norm=5.0)

# Adam with learning rate 0.001 applies the clipped gradients
opt = tf.train.AdamOptimizer(learning_rate=0.001)
train_step = opt.apply_gradients(zip(clipped_gradients, params))

Trainable params: [<tf.Variable 'word_embeddings:0' shape=(30430, 128) dtype=float32_ref>, <tf.Variable 'rnn/rnn/basic_lstm_cell/kernel:0' shape=(256, 512) dtype=float32_ref>, <tf.Variable 'rnn/rnn/basic_lstm_cell/bias:0' shape=(512,) dtype=float32_ref>, <tf.Variable 'M:0' shape=(128, 128) dtype=float32_ref>]


In [13]:
#Train for 100 steps, printing the loss after every step
with tf.Session() as sess:
    #Same order as before: iterator first, then lookup tables, then variables
    sess.run(iterator.init)
    sess.run(tf.tables_initializer())
    sess.run(tf.global_variables_initializer())

    fetches = [loss, train_step]
    for step in range(100):
        #train_step is fetched only for its side effect, its result is dropped
        loss_val, _ = sess.run(fetches)
        print('Step:{} Loss:{}'.format(step, loss_val))



Step:0 Loss:0.6931110620498657
Step:1 Loss:0.693158745765686
Step:2 Loss:0.6929095387458801
Step:3 Loss:0.6928654909133911
Step:4 Loss:0.692034900188446
Step:5 Loss:0.6983200311660767
Step:6 Loss:0.6959814429283142
Step:7 Loss:0.6930999755859375
Step:8 Loss:0.6922240257263184
Step:9 Loss:0.6931871771812439
Step:10 Loss:0.692741870880127
Step:11 Loss:0.6931243538856506
Step:12 Loss:0.6931833028793335
Step:13 Loss:0.6928664445877075
Step:14 Loss:0.6929497718811035
Step:15 Loss:0.6936433911323547
Step:16 Loss:0.6932743191719055
Step:17 Loss:0.6933149099349976
Step:18 Loss:0.692735493183136
Step:19 Loss:0.6932310461997986
Step:20 Loss:0.6932319402694702
Step:21 Loss:0.6925898790359497
Step:22 Loss:0.6929774284362793
Step:23 Loss:0.6920992136001587
Step:24 Loss:0.6928004026412964
Step:25 Loss:0.6943953037261963
Step:26 Loss:0.694351315498352
Step:27 Loss:0.6948292851448059
Step:28 Loss:0.69340580701828
Step:29 Loss:0.6929185390472412
Step:30 Loss:0.6934899091720581
Step:31 Loss:0.6926593780