In [1]:
import keras
import numpy as np

Using TensorFlow backend.


In [2]:
# Display the installed Keras version so the run can be reproduced.
keras.__version__

'2.0.7'

In [3]:
# Location of the dialogue corpus; each record in it is split on ' +++$+++ '
# when the file is read further down.
dialogues_path = "./data/movie_lines.txt"

## Read Data

In [4]:
from keras.preprocessing.text import Tokenizer

In [5]:
# End-of-sentence marker appended to every dialogue line; generation output is
# cut at the first occurrence of this token.
EOS_TOKEN = "~e"

In [6]:
# Collect the lower-cased utterance text (fifth ' +++$+++ '-separated field)
# of every record, with the end-of-sentence marker appended.
dialogue_lines = []
with open(dialogues_path) as dialogues_file:
    for raw_line in dialogues_file:
        fields = raw_line.strip().lower().split(' +++$+++ ')
        # Records with fewer than five fields have no utterance field;
        # they are skipped silently.
        if len(fields) < 5:
            continue
        dialogue_lines.append(fields[4] + " " + EOS_TOKEN)

In [7]:
# Peek at the first ten parsed lines to check the parsing and the EOS suffix.
dialogue_lines[:10]

['they do not! ~e',
 'they do to! ~e',
 'i hope so. ~e',
 'she okay? ~e',
 "let's go. ~e",
 'wow ~e',
 "okay -- you're gonna need to learn how to lie. ~e",
 'no ~e',
 'i\'m kidding.  you know how sometimes you just become this "persona"?  and you don\'t know how to quit? ~e',
 'like my fear of wearing pastels? ~e']

In [8]:
# '~' is not among the filtered characters, so the "~e" end-of-sentence marker
# appended to every line keeps its '~' and survives as its own token.
keras_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}\t\n')

In [9]:
# Build the tokenizer's vocabulary (word_index) from every parsed line.
keras_tokenizer.fit_on_texts(dialogue_lines)

In [10]:
# One slot more than the number of known words: index 0 is the value used for
# padding by pad_sequences below, and Keras' Tokenizer numbers words from 1.
VOCAB_SIZE = len(keras_tokenizer.word_index) + 1
print(VOCAB_SIZE)
# Used both as the embedding width and as the unit count of the encoder LSTM.
EMBEDDING_DIM = 500

55857


In [11]:
# keras_tokenizer.word_index

In [12]:
# Only the first NUM_TRAIN_SAMPLES lines are used for training. Slice *before*
# tokenizing so the remaining lines are not converted just to be thrown away;
# each line is converted independently, so the result is unchanged.
NUM_TRAIN_SAMPLES = 20000
text_sequences = keras_tokenizer.texts_to_sequences(dialogue_lines[:NUM_TRAIN_SAMPLES])

In [13]:
# Token count of the longest training line; every sequence is padded to this
# length before being fed to the network.
MAX_SEQUENCE_LENGTH = max(map(len, text_sequences))
print(MAX_SEQUENCE_LENGTH)

207


## Build Neural Network

In [14]:
from keras import backend as K
from keras.engine.topology import Layer
from keras.layers import Input, Dense, RepeatVector, LSTM, Conv1D, Masking, Embedding
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences

In [15]:
# Pad every sequence at the end with 0 up to MAX_SEQUENCE_LENGTH (anything
# longer would also be cut at the end).
x_train = pad_sequences(
    text_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    truncating='post',
    value=0
)

In [None]:
# Sanity check: (number of samples, MAX_SEQUENCE_LENGTH).
x_train.shape

(20000, 207)

In [None]:
# One-hot encode the padded token ids to serve as training targets:
# x_train_rev[i, t, k] == 1 exactly when x_train[i, t] == k (padding is class 0).
#
# The tensor is allocated once and filled with a single vectorized assignment
# instead of building one Python-level vector per token and copying the lot
# again with np.asarray.
#
# NOTE(review): the result has shape (samples, MAX_SEQUENCE_LENGTH, VOCAB_SIZE).
# With the sizes printed above (20000 x 207 x 55857) that is ~2.3e11 float64
# values, which will not fit in memory. Consider integer targets with
# 'sparse_categorical_crossentropy' (requires changing the model's loss too).
num_samples, sequence_length = x_train.shape
x_train_rev = np.zeros((num_samples, sequence_length, VOCAB_SIZE))
sample_idx = np.arange(num_samples)[:, np.newaxis]
step_idx = np.arange(sequence_length)[np.newaxis, :]
x_train_rev[sample_idx, step_idx, x_train] = 1

In [None]:
# Sanity check: (number of samples, MAX_SEQUENCE_LENGTH, VOCAB_SIZE).
x_train_rev.shape

In [None]:
def get_seq2seq_model():
    """Build and compile the sequence autoencoder.

    Architecture: token ids -> Embedding -> bidirectional LSTM ->
    RepeatVector(MAX_SEQUENCE_LENGTH) -> bidirectional LSTM returning one
    output per timestep -> per-timestep softmax over VOCAB_SIZE classes.

    Reads the module-level constants VOCAB_SIZE, EMBEDDING_DIM and
    MAX_SEQUENCE_LENGTH. Every intermediate tensor is printed as it is
    created so the layer output shapes can be checked.

    Returns:
        A compiled keras ``Model`` (adam optimizer, categorical
        cross-entropy loss, accuracy metric); it is trained against
        one-hot targets.
    """
    # The Embedding layer consumes integer token ids, so the input is declared
    # as int32. Its length comes from MAX_SEQUENCE_LENGTH, the same constant
    # used for the Embedding's input_length, rather than from the global
    # training array.
    main_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='main_input')
    print(main_input)

    # mask_zero=True: id 0 is the padding value written by pad_sequences.
    embed_1 = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM,
                        mask_zero=True, input_length=MAX_SEQUENCE_LENGTH)(main_input)
    print(embed_1)

    # Encoder.
    lstm_1 = Bidirectional(LSTM(EMBEDDING_DIM, name='lstm_1'))(embed_1)
    print(lstm_1)

    # Feed the encoder output to the decoder at every output timestep.
    repeat_1 = RepeatVector(MAX_SEQUENCE_LENGTH, name='repeat_1')(lstm_1)
    print(repeat_1)

    # Decoder: one output per timestep.
    lstm_3 = Bidirectional(LSTM(100, return_sequences=True, name='lstm_3'))(repeat_1)
    print(lstm_3)

    # Distribution over the vocabulary at each timestep.
    softmax_1 = TimeDistributed(Dense(VOCAB_SIZE, activation='softmax'))(lstm_3)
    print(softmax_1)

    model = Model(main_input, softmax_1)
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model

In [None]:
# Build and compile the network (also prints each layer's output tensor).
seq2seq_model = get_seq2seq_model()

In [None]:
# Autoencoder-style training: the targets are the one-hot encoding of the very
# same input sequences. verbose=0 means no per-epoch progress is printed.
seq2seq_model.fit(x_train, x_train_rev, batch_size=32, epochs=50, verbose=0)

In [None]:
# Reconstruct the training inputs: one softmax vector of size VOCAB_SIZE per
# timestep of every sample.
predictions = seq2seq_model.predict(x_train)

In [None]:
# Reverse lookup (token index -> word) built from the tokenizer's vocabulary.
# inv_map is kept as a second name for the same dictionary.
index2word_map = dict((index, word) for word, index in keras_tokenizer.word_index.items())
inv_map = index2word_map

In [None]:
def sequence_to_str(sequence, index_to_word=None):
    """Decode a sequence of per-timestep score vectors into words.

    The training targets are one-hot at the token index itself, so the
    position of the largest score *is* the token index; no offset is applied.
    Index 0 is the padding value and has no word, so those timesteps are
    skipped.

    Args:
        sequence: iterable of score vectors, one per timestep.
        index_to_word: optional mapping from token index to word. Defaults to
            the module-level ``index2word_map``.

    Returns:
        List of decoded words in timestep order, padding steps omitted.

    Raises:
        KeyError: if a non-zero predicted index is missing from the mapping.
    """
    if index_to_word is None:
        index_to_word = index2word_map

    word_list = list()
    for element in sequence:
        index = int(np.argmax(element))
        if index == 0:
            # Padding class: nothing to emit for this timestep.
            continue
        word_list.append(index_to_word[index])

    return word_list

In [None]:
# Show every training line next to the model's reconstruction of it.
# Each stored line ends with " " + EOS_TOKEN; that suffix is not displayed.
eos_suffix_length = len(EOS_TOKEN) + 1
for actual_line, prediction in zip(dialogue_lines, predictions):
    predicted_word_list = sequence_to_str(prediction)
    print("Actual: " + actual_line[:len(actual_line) - eos_suffix_length])

    # Keep the predicted words up to (not including) the first EOS marker.
    generated_words = list()
    for word in predicted_word_list:
        if word == EOS_TOKEN:
            break
        generated_words.append(word)
    print("Generated: " + " ".join(generated_words))

    # Blank line after every pair, whether or not an EOS marker was predicted.
    print("")