### Import Statements

In [1]:
import keras.preprocessing.text as t
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import os
import pickle

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Clean the data (`movie_lines.txt`): extract just the dialogue text and build a dictionary mapping each dialogue id to its dialogue

In [2]:
# Build (or load from cache) the two artifacts derived from the raw corpus:
#   * data/just_movie_lines.txt  - one utterance per line, read again by the tokenizer cell
#   * data/dialogue_conversation - pickled dict mapping dialogue id -> utterance text
dialogue_conversation_exists = os.path.exists(os.path.join('data', 'dialogue_conversation'))
movie_lines_exists = os.path.exists(os.path.join('data', 'movie_lines.txt'))
just_movie_lines_exists = os.path.exists(os.path.join('data', 'just_movie_lines.txt'))

# Regenerate when either *output* is missing. The guard previously looked at the
# raw input file instead of just_movie_lines.txt, so a missing text file was
# never rebuilt and a missing raw file triggered a rebuild that could not succeed.
if not (dialogue_conversation_exists and just_movie_lines_exists):
    with open(os.path.join('data', 'movie_lines.txt'), 'r') as f:
        # The final element after the split is dropped (the remainder after the last newline).
        raw_movie_lines = f.read().split('\n')[:-1]
    dialogue_conversation = {}

    with open(os.path.join('data', 'just_movie_lines.txt'), 'w') as f:
        for line in raw_movie_lines:
            # Fields are separated by ' +++$+++ ': the first is the dialogue id,
            # the last is the utterance text.
            line = line.split(' +++$+++ ')
            dialogue_id = line[0]
            conversation = line[-1]
            f.write(conversation + '\n')
            dialogue_conversation[dialogue_id] = conversation

    with open(os.path.join('data', 'dialogue_conversation'), 'wb') as f:
        # `True` is interpreted as pickle protocol 1; kept so existing caches stay compatible.
        pickle.dump(dialogue_conversation, f, True)
else:
    # NOTE: pickle.load executes arbitrary code; only load caches produced locally by this notebook.
    with open(os.path.join('data', 'dialogue_conversation'), 'rb') as f:
        dialogue_conversation = pickle.load(f)

### Extract the Embedding Indices from Pre-trained model

In [3]:
# Map each word in the pre-trained GloVe file to its float32 coefficient vector.
# The parsed dictionary is cached as a pickle so the text file is only parsed once.
embeddings_index = {}
if not os.path.exists(os.path.join('data', 'embeddings_index')):
    # `with` guarantees the file is closed even if a line fails to parse.
    with open(os.path.join('glove.6B', 'glove.6B.100d.txt')) as f:
        for line in f:
            # First whitespace-separated field is the word, the rest are the coefficients.
            values = line.split()
            word = values[0]
            coeffs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coeffs

    with open(os.path.join('data', 'embeddings_index'), 'wb') as f:
        # `True` is interpreted as pickle protocol 1; kept so existing caches stay compatible.
        pickle.dump(embeddings_index, f, True)
else:
    # NOTE: pickle.load executes arbitrary code; only load caches produced locally by this notebook.
    with open(os.path.join('data', 'embeddings_index'), 'rb') as f:
        embeddings_index = pickle.load(f)


### Tokenize the dataset to extract words

In [4]:
# First tokenizer pass: count word frequencies over the whole corpus and
# derive the vocabulary size from a minimum-frequency threshold.
with open(os.path.join('data','just_movie_lines.txt'), 'r') as f:
    lines = f.read().split('\n')[:-1]
min_count = 15
# No arguments here: the first positional parameter of Tokenizer is `num_words`,
# so passing `lines` (as was done before) handed the corpus in as the vocabulary cap.
tokenizer = t.Tokenizer()
tokenizer.fit_on_texts(lines)
# Number of distinct words that occur at least `min_count` times.
num_words = sum(1 for v in tokenizer.word_counts.values() if v >= min_count)

In [5]:
# Report the vocabulary size that survived the min_count threshold.
print('Current vocabulary after choosing only most frequent words', '{}'.format(num_words))

('Current vocabulary after choosing only most frequent words', '8424')


In [6]:

# Second tokenizer pass: rebuild the tokenizer with the vocabulary cap computed above.
tokenizer = t.Tokenizer(num_words=num_words)
# Fit on the same corpus lines so the tokenizer assigns an integer id to each word.
tokenizer.fit_on_texts(lines) 

# word_index is a dictionary mapping each word to its integer index.
word_index = tokenizer.word_index

### Word to Index and Index to Word Dictionary

In [7]:
# Keep only words whose tokenizer index is within the vocabulary cap and shift
# every kept index up by 3, leaving ids 0-3 free for the special tokens below.
word_to_index = {
    word: rank + 3
    for word, rank in word_index.items()
    if rank <= num_words
}
# Reverse lookup for the regular (non-special) words.
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Register the special tokens in both directions: <pad>=0, <bos>=1, <eos>=2, <unk>=3.
for idx, token in enumerate(('<pad>', '<bos>', '<eos>', '<unk>')):
    word_to_index[token] = idx
    index_to_word[idx] = token

### Extracting Conversations

In [8]:
# Build (or load from cache) a list of (context, target, current_line) tuples
# from the conversation listing and the id -> text dictionary.
conversations = []
conversations_exists = os.path.exists(os.path.join('data', 'conversations'))

if not conversations_exists:
    with open(os.path.join('data', 'movie_conversations.txt'), 'r') as f:
        raw_movie_conversations = f.read().split('\n')[:-1]

    # For every consecutive pair of lines in a conversation:
    #   con_a_1 is the previous line ('' at the start of a conversation),
    #   con_a_2 is the current line,
    #   con_a   is the previous line followed by the current line (the context),
    #   con_b   is the next line (the target).
    for conversation in raw_movie_conversations:
        # The last ' +++$+++ ' field holds the line ids as a bracketed, quoted,
        # comma-separated list; strip the punctuation and split it into ids.
        conversation = conversation.split(' +++$+++ ')[-1]
        conversation = conversation.replace('[', '')
        conversation = conversation.replace(']', '')
        conversation = conversation.replace('\'', '')
        conversation = conversation.split(', ')

        con_a_1 = ''
        for i in range(len(conversation)-1):

            con_a_2 = dialogue_conversation[conversation[i]]
            con_b = dialogue_conversation[conversation[i+1]]

            # Skip the sample if any of the three lines is longer than 50 words.
            if len(con_a_1.split()) <= 50 and len(con_a_2.split()) <= 50 and len(con_b.split()) <= 50:
                # At the start of a conversation con_a_1 is '', so con_a begins with a space.
                con_a = "{} {}".format(con_a_1, con_a_2)
                conversations.append((con_a, con_b, con_a_2))

            # The current line becomes the previous line even when the sample was skipped.
            con_a_1 = con_a_2

    with open(os.path.join('data', 'conversations'), 'wb') as f:
        # `True` is interpreted as pickle protocol 1; kept so existing caches stay compatible.
        pickle.dump(conversations, f, True)
else:
    # NOTE: pickle.load executes arbitrary code; only load caches produced locally by this notebook.
    with open(os.path.join('data', 'conversations'), 'rb') as f:
        conversations = pickle.load(f)

### Tokenize conversations and add padding ``<pad>``, ``<eos>``, ``<bos>`` and replace out of vocabulary words with ``<unk>``

Each sequence is truncated or padded to a maximum of 50 tokens, counted after the `<bos>` and `<eos>` markers have been added.

In [14]:
# pad_sequences is already imported in the first cell; only the word splitter is new here.
from keras.preprocessing.text import text_to_word_sequence

max_length = 50
vocab = word_to_index.keys()
# Id used for every out-of-vocabulary word.
unk_index = word_to_index['<unk>']

# These are not question and answers but a conversation. 
# Just for convenience sake, I used question and answer as variable names.
question = []
answer = []

question_path = os.path.join('data', 'question')
answer_path = os.path.join('data', 'answer')

question_exists = os.path.exists(question_path)
# This previously tested the 'question' path a second time, so a missing
# 'answer' cache was not detected and the load branch failed.
answer_exists = os.path.exists(answer_path)

if not (question_exists and answer_exists):
    for conv in conversations:
        # conv[0] is the context text, conv[1] the target text; wrap both in <bos>/<eos>.
        conversation_a = ['<bos>'] + text_to_word_sequence(conv[0]) + ['<eos>']
        conversation_b = ['<bos>'] + text_to_word_sequence(conv[1]) + ['<eos>']

        # Direct dictionary lookup with a default avoids scanning the key
        # collection for every token and maps unknown words to <unk>.
        conversation_a = [word_to_index.get(c, unk_index) for c in conversation_a]
        conversation_b = [word_to_index.get(c, unk_index) for c in conversation_b]

        # NOTE(review): truncation keeps the first max_length ids, so sequences
        # longer than max_length lose their trailing <eos> - confirm this is intended.
        question.append(conversation_a[:max_length])
        answer.append(conversation_b[:max_length])

    # Contexts are padded at the front, targets at the back.
    question = pad_sequences(question, max_length, padding='pre')
    answer = pad_sequences(answer, max_length, padding='post')

    # `True` is interpreted as pickle protocol 1; kept so existing caches stay compatible.
    with open(question_path, 'wb') as f:
        pickle.dump(question, f, True)
    with open(answer_path, 'wb') as f:
        pickle.dump(answer, f, True)
else:
    # NOTE: pickle.load executes arbitrary code; only load caches produced locally by this notebook.
    with open(question_path, 'rb') as f:
        question = pickle.load(f)
    with open(answer_path, 'rb') as f:
        answer = pickle.load(f)



### Preparing Embedding Matrix

In [10]:
# One 100-dimensional row per tokenizer index (plus one extra row); rows for
# words that are absent from embeddings_index stay all-zeros.
# NOTE(review): rows are indexed by the raw tokenizer ids in `word_index`, while
# the encoded sequences use `word_to_index` ids (shifted by +3 and capped at
# num_words) - confirm the consumer of this matrix accounts for that offset.
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]