### Import Statements

In [3]:
import keras.preprocessing.text as t
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import os

### Clean the data [movie_lines] to extract just the lines

In [4]:
# Read the raw corpus. Every row is a set of fields joined by ' +++$+++ ',
# and the utterance text is the last field of the row.
# NOTE(review): the file is opened with the platform default encoding --
# confirm that matches the encoding of movie_lines.txt.
with open(os.path.join('data','movie_lines.txt'), 'r') as raw_file:
    # Iterating the handle (instead of read().split('\n')[:-1]) closes the
    # file deterministically and keeps the final row even when the file
    # does not end with a newline.
    raw_movie_lines = [row.rstrip('\n') for row in raw_file]

# Write one utterance per line to just_movie_lines.txt.
with open(os.path.join('data','just_movie_lines.txt'), 'w') as f:
    for line in raw_movie_lines:
        fields = line.split(' +++$+++ ')
        utterance = fields[-1]
        f.write(utterance + '\n')

### Load the Embedding Index from the Pre-trained GloVe Model

In [5]:
# Build a lookup from word to its pre-trained coefficient vector.
# Each line of the embeddings file is a word followed by whitespace-separated
# numeric coefficients, which are parsed as float32.
embeddings_index = {}
# The context manager closes the file even if parsing fails part-way, and the
# explicit encoding keeps the read independent of the platform default.
with open(os.path.join('glove.6B', 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: nothing to parse (values[0] would raise IndexError).
            continue
        word = values[0]
        coeffs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coeffs

### Tokenize the dataset to extract words

In [7]:
# Load the cleaned utterances, one per line. The file is written with a
# trailing newline after every utterance, so the final split element is an
# empty string and is dropped.
with open(os.path.join('data','just_movie_lines.txt'), 'r') as lines_file:
    lines = lines_file.read().split('\n')[:-1]

# Words seen fewer than min_count times are excluded from the vocabulary.
min_count = 5

# First pass: fit an unrestricted tokenizer purely to obtain word frequencies.
# (Tokenizer's first positional parameter is num_words, so the corpus must be
# supplied through fit_on_texts rather than the constructor; otherwise
# word_counts stays empty and the count below is always 0.)
tokenizer = t.Tokenizer()
tokenizer.fit_on_texts(lines)

# Keras keeps only the (num_words - 1) most frequent words, so add 1 to retain
# every word that occurs at least min_count times.
num_words = sum(1 for count in tokenizer.word_counts.values() if count >= min_count) + 1

In [8]:

# Second pass: tokenizer whose vocabulary is capped by num_words.
tokenizer = t.Tokenizer(num_words=num_words)
# Assigns an id to each word in the lines according to word count.
tokenizer.fit_on_texts(lines)
# Converts each line of text to a sequence of word ids; words outside the
# vocabulary are left out. The result is kept in `sequences` so the conversion
# is not computed and then thrown away.
sequences = tokenizer.texts_to_sequences(lines)
# word_index is a dictionary mapping each word to its id.
# NOTE(review): word_index appears to cover every fitted word regardless of
# num_words -- confirm against the Keras Tokenizer documentation.
word_index = tokenizer.word_index

### Preparing Embedding Matrix

In [10]:
# One 100-dimensional row per word id, initialised to zeros. The extra row
# (len(word_index) + 1) presumably accounts for ids starting at 1 -- TODO confirm.
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    pretrained_vector = embeddings_index.get(word)
    if pretrained_vector is None:
        # No pre-trained vector for this word: its row stays all-zeros.
        continue
    embedding_matrix[i] = pretrained_vector