In [1]:
import tensorflow as tf

import numpy as np

In [2]:
tf.__version__

'1.3.0'

In [3]:
dialogues_path = "./data/movie_lines.txt"

### Read data

In [4]:
SOS_TOKEN = "<s>"
EOS_TOKEN = "</s>"

In [5]:
dialogue_lines = list()
with open(dialogues_path) as dialogues_file:
    for line in dialogues_file:
        line = line.strip().lower()
        split_line = line.split(' +++$+++ ')
        dialogue_lines.append(SOS_TOKEN + " " + split_line[4] + " " + EOS_TOKEN)

In [6]:
dialogue_lines[:10]

['<s> they do not! </s>',
 '<s> they do to! </s>',
 '<s> i hope so. </s>',
 '<s> she okay? </s>',
 "<s> let's go. </s>",
 '<s> wow </s>',
 "<s> okay -- you're gonna need to learn how to lie. </s>",
 '<s> no </s>',
 '<s> i\'m kidding.  you know how sometimes you just become this "persona"?  and you don\'t know how to quit? </s>',
 '<s> like my fear of wearing pastels? </s>']

### Get language stats

In [80]:
from nltk.tokenize import word_tokenize

In [81]:
def text_to_sequences(dialogue_lines):
    
    word_to_index_map = {SOS_TOKEN: 1, EOS_TOKEN: 2}
    index_to_word_map = {1: SOS_TOKEN, 2: EOS_TOKEN}
    index = 3
    max_sequence_length = 0
    
    dialogue_sequences = list()
    
    for line in dialogue_lines:
        dialogue_sequence = list()
        tokens = word_tokenize(line)
        for token in tokens:
            if token in word_to_index_map:
                dialogue_sequence.append(word_to_index_map[token])
            else:
                word_to_index_map[token] = index
                index_to_word_map[index] = token
                dialogue_sequence.append(index)
                index += 1
        
        if max_sequence_length < len(dialogue_sequence):
            max_sequence_length = len(dialogue_sequence)
        
        dialogue_sequences.append(np.asarray(dialogue_sequence))
                
    return np.asarray(dialogue_sequences), word_to_index_map, index_to_word_map, max_sequence_length

In [82]:
sequences, word_to_index_map, index_to_word_map, max_sequence_length = text_to_sequences(dialogue_lines)

In [83]:
max_sequence_length

57

In [113]:
padded_sequences = list()
for dialogue_sequence in sequences:
    padded_sequence = \
        np.concatenate((dialogue_sequence, 
                        np.zeros(max_sequence_length - len(dialogue_sequence))))
    padded_sequence = tf.convert_to_tensor(padded_sequence, dtype=tf.int32)
    padded_sequences.append(padded_sequence)
    
padded_sequences = tf.convert_to_tensor(padded_sequences)

In [115]:
padded_sequences.shape

TensorShape([Dimension(100), Dimension(57)])

In [116]:
padded_sequences.dtype

tf.int32

In [117]:
# word_to_index_map, index_to_word_map

In [118]:
VOCAB_SIZE = len(word_to_index_map)
EMBEDDING_SIZE = 50
LSTM_SIZE = 100

In [119]:
VOCAB_SIZE

375

### Define the computational graph in Tensorflow

In [132]:
from tensorflow.contrib.rnn import BasicLSTMCell, static_rnn
from tensorflow.contrib.layers import embed_sequence

In [143]:
tf.reset_default_graph()
batch_size = 20

In [147]:
x = embed_sequence(padded_sequences, vocab_size=VOCAB_SIZE, embed_dim=EMBEDDING_SIZE)
print(x.shape)

x = tf.unstack(x, max_sequence_length, 1)
print(x[0].shape, len(x))

lstm_cell = BasicLSTMCell(128)
outputs, states = static_rnn(lstm_cell, x, dtype=tf.float32)
print(outputs.shape, states.shape)

(100, 57, 50)
(100, 50) 57


ValueError: Input graph and Layer graph are not the same: Tensor("unstack_12:0", shape=(100, 50), dtype=float32) is not from the passed-in graph.

In [146]:
# # Define loss and optimizer
# cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=encoded, labels=y))
# optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# # Evaluate model
# correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y,1))
# accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# # Initializing the variables
# init = tf.global_variables_initializer()

In [None]:
with tf.Session() as sess:
    sess.run(init)
    step = 1
    # Keep training until reach max iterations
    while step * batch_size < training_iters:
        batch_x, batch_y = mnist.train.next_batch(batch_size)
        # Run optimization op (backprop)
        sess.run(optimizer, feed_dict={x: batch_x, y: batch_y})
        if step % display_step == 0:
            # Calculate batch accuracy
            acc = sess.run(accuracy, feed_dict={x: batch_x, y: batch_y})
            # Calculate batch loss
            loss = sess.run(cost, feed_dict={x: batch_x, y: batch_y})
            print "Iter " + str(step*batch_size) + ", Minibatch Loss= " + \
                  "{:.6f}".format(loss) + ", Training Accuracy= " + \
                  "{:.5f}".format(acc)
        step += 1
    print "Optimization Finished!"

    # Calculate accuracy for 128 mnist test images
    test_len = 128
    test_data = mnist.test.images[:test_len].reshape((-1, n_steps, n_input))
    test_label = mnist.test.labels[:test_len]
    print "Testing Accuracy:", \
        sess.run(accuracy, feed_dict={x: test_data, y: test_label})