In [None]:
import tensorflow as tf
import numpy as np

In [None]:
tf.__version__

In [None]:
# Location of the raw dialogue file. The cell that reads it expects one record
# per line, with ' +++$+++ '-separated fields and the utterance text in the
# fifth field.
dialogues_path = "./data/movie_lines.txt"

### Read data

In [None]:
# End-of-sequence marker: appended to every utterance when the data is read,
# and given the fixed id 1 by text_to_sequences.
EOS_TOKEN = "</s>"

In [None]:
# Read every utterance from the corpus file, lower-cased and terminated by
# EOS_TOKEN.
#
# * The file is decoded as ISO-8859-1: the movie-lines corpus contains bytes
#   that are not valid UTF-8, so relying on the platform default encoding can
#   raise UnicodeDecodeError. ISO-8859-1 maps every byte to a character and
#   therefore never fails.
# * Records whose text field is empty lose their trailing separator when
#   stripped and split into fewer than five fields; those records (and blank
#   lines) are skipped instead of raising IndexError.
dialogue_lines = list()
with open(dialogues_path, encoding="iso-8859-1") as dialogues_file:
    for line in dialogues_file:
        split_line = line.strip().lower().split(' +++$+++ ')
        if len(split_line) < 5:
            # No utterance text in this record; nothing to learn from it.
            continue
        dialogue_lines.append(split_line[4] + " " + EOS_TOKEN)

In [None]:
dialogue_lines[:10]

### Get language stats

In [None]:
# from nltk.tokenize import word_tokenize

In [None]:
def text_to_sequences(dialogue_lines, eos_token=None):
    """Convert whitespace-tokenised lines into arrays of integer word ids.

    Ids are handed out in order of first appearance, starting at 3. The
    end-of-sequence token always has id 1; ids 0 and 2 are never assigned by
    this function (0 is the value the notebook pads with).

    Args:
        dialogue_lines: iterable of strings; each is split on whitespace.
        eos_token: token that receives id 1. Defaults to the module-level
            ``EOS_TOKEN`` when omitted.

    Returns:
        A 4-tuple ``(sequences, word_to_index_map, index_to_word_map,
        max_sequence_length)``:

        * ``sequences`` - 1-D object array with one integer id array per
          input line (lines may differ in length).
        * ``word_to_index_map`` - dict mapping token -> id.
        * ``index_to_word_map`` - dict mapping id -> token.
        * ``max_sequence_length`` - token count of the longest line
          (0 when there are no lines).
    """
    if eos_token is None:
        eos_token = EOS_TOKEN

    word_to_index_map = {eos_token: 1}
    index_to_word_map = {1: eos_token}
    next_index = 3
    max_sequence_length = 0

    dialogue_sequences = []

    for line in dialogue_lines:
        dialogue_sequence = []
        for token in line.split():
            index = word_to_index_map.get(token)
            if index is None:
                # First time this token is seen: allocate the next free id.
                index = next_index
                word_to_index_map[token] = index
                index_to_word_map[index] = token
                next_index += 1
            dialogue_sequence.append(index)

        max_sequence_length = max(max_sequence_length, len(dialogue_sequence))

        # Explicit integer dtype so an empty line does not become a float array.
        dialogue_sequences.append(np.asarray(dialogue_sequence, dtype=int))

    # Fill an object array slot by slot. Calling np.asarray on a list of
    # arrays with differing lengths raises ValueError on recent NumPy, and
    # silently produces a 2-D array when all lengths happen to match.
    sequences = np.empty(len(dialogue_sequences), dtype=object)
    for position, dialogue_sequence in enumerate(dialogue_sequences):
        sequences[position] = dialogue_sequence

    return sequences, word_to_index_map, index_to_word_map, max_sequence_length

In [None]:
sequences, word_to_index_map, index_to_word_map, max_sequence_length = text_to_sequences(dialogue_lines)

In [None]:
max_sequence_length

In [None]:
# Right-pad every sequence with zeros up to max_sequence_length.
#
# The padding is done in a single int32 NumPy matrix and converted to a tensor
# once. Converting each row separately would add one constant op per dialogue
# line to the default graph, and padding with np.zeros' default float64 would
# send the ids through a float round trip before the int32 cast.
padded_matrix = np.zeros((len(sequences), max_sequence_length), dtype=np.int32)
for row, dialogue_sequence in enumerate(sequences):
    padded_matrix[row, :len(dialogue_sequence)] = dialogue_sequence

padded_sequences = tf.convert_to_tensor(padded_matrix)

In [None]:
print(len(sequences), sequences[0].shape)

In [None]:
# Longest sequence, measured directly from the encoded data.
MAX_SEQUENCE_LENGTH = max(map(len, sequences))
print(MAX_SEQUENCE_LENGTH)

In [None]:
# word_to_index_map, index_to_word_map

In [None]:
# VOCAB_SIZE is the size of the id space (largest id + 1), not the number of
# distinct words. text_to_sequences gives EOS the id 1 and numbers all other
# words from 3 upwards, so the largest id is len(word_to_index_map) + 1.
# Using len(word_to_index_map) here would leave the two highest ids outside
# the embedding table and the logits dimension.
VOCAB_SIZE = max(word_to_index_map.values()) + 1
print("VOCAB_SIZE: " + str(VOCAB_SIZE))
# Dimensionality of the learned word embeddings.
EMBEDDING_SIZE = 50

## Build computational graph

### Define the computational graph in TensorFlow

In [None]:
# Builds the TF1 graph for a single-layer LSTM model over word ids:
# embedding -> LSTM (with output dropout) -> linear projection to VOCAB_SIZE
# logits -> sequence cross-entropy loss -> Adam with element-wise gradient
# clipping.
#
# Start from an empty default graph so that re-running this cell does not add
# a second copy of every op. Tensors created in earlier cells (for example
# padded_sequences) belong to the discarded graph.
tf.reset_default_graph()

# Initialize input placeholders
# Shape [1, None]: batch size is fixed at 1, sequence length is variable.
input_text = tf.placeholder(tf.int32, [1, None], name='input')
targets = tf.placeholder(tf.int32, [1, None], name='targets')
print(input_text)
print(targets)

# Dynamic (run-time) shape of the input; used below to size the loss weights.
input_text_shape = tf.shape(input_text)

    
# Build the RNN cell
lstm = tf.contrib.rnn.BasicLSTMCell(num_units=128)
# NOTE: the keep probability is a constant, so dropout on the cell output is
# active on every run of this graph, not only during training.
drop_cell = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=0.75)
# A one-element list, i.e. a single-layer stack.
cell = tf.contrib.rnn.MultiRNNCell([drop_cell] * 1)

# Set the initial state
# (disabled: no initial state is passed to dynamic_rnn below)
# initial_state = cell.zero_state(input_text_shape[0], tf.float32)
# initial_state = tf.identity(initial_state, name='initial_state')

# Create word embedding as input to RNN
# Every id fed through input_text must be smaller than VOCAB_SIZE.
embed = tf.contrib.layers.embed_sequence(input_text, VOCAB_SIZE, EMBEDDING_SIZE)
print(embed)

# Build RNN
# No initial_state is given, so dtype must be supplied.
outputs, final_state = tf.nn.dynamic_rnn(cell, embed, dtype=tf.float32)
# Re-export the final state under a fixed name.
final_state = tf.identity(final_state, name='final_state')
print(outputs)
print(final_state)

# Take RNN output and make logits
# activation_fn=None makes this a plain linear projection to VOCAB_SIZE scores.
logits = tf.contrib.layers.fully_connected(outputs, VOCAB_SIZE, activation_fn=None)
print(logits)

# Calculate the probability of generating each word
# NOTE: despite the name 'probs', hardmax is used here rather than softmax
# (the softmax version is the commented-out line).
# probs = tf.nn.softmax(logits, name='probs')
probs = tf.contrib.seq2seq.hardmax(logits, name='probs')

# Define loss function
# The weights tensor is all ones with the same [batch, time] shape as the
# input, so every time step contributes equally to the loss.
# NOTE(review): for next-word prediction the targets fed here should be the
# inputs shifted by one step - confirm against the code that builds feed_dict.
cost = tf.contrib.seq2seq.sequence_loss(logits, targets, tf.ones([input_text_shape[0], input_text_shape[1]]))

# Learning rate optimizer
optimizer = tf.train.AdamOptimizer(0.01)

# Gradient clipping to avoid exploding gradients
# Each gradient element is clipped to [-1, 1]; variables that receive no
# gradient from the cost are left out.
gradients = optimizer.compute_gradients(cost)
capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
train_op = optimizer.apply_gradients(capped_gradients)

In [None]:
# Train one sequence at a time (the placeholders have batch size 1).
#
# The network is trained to predict the NEXT token: the input is the sequence
# without its last token and the target is the sequence without its first
# token. Feeding the same array as both input and target would only teach the
# model to copy the token it has just been given.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for epoch in range(1000):
        # Loss of the last training step of this epoch; stays None if no
        # sequence was long enough to train on.
        train_loss = None

        for sequence in sequences:
            token_ids = np.asarray(sequence).reshape(1, -1)
            if token_ids.shape[1] < 2:
                # A single token has no "next token" to predict.
                continue

            feed_dict = {
                input_text: token_ids[:, :-1],
                targets: token_ids[:, 1:]
            }
            train_loss, _ = sess.run([cost, train_op], feed_dict)

        if epoch % 10 == 0:
            print('train_loss: ', train_loss)