# Processor

This file does the neural network business end. This'll have to be run every time we want to generate a new model.

In [28]:
import numpy as np
from numpy import array
from numpy import argmax
import math
# from numpy.random import rand
# from numpy.random import shuffle
from pickle import load
# from pickle import dump
# import re
# import os, sys, glob
# #Don't run these imports on your local machine!
import tensorflow as tf
# #Keras imports
# from keras.layers import LSTM, Dense, Activation, Input
# from keras import optimizers
# from keras.models import Sequential
# from keras.layers.embeddings import Embedding
# from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
# from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

from nltk.translate.bleu_score import corpus_bleu

# Load Data and Tokenize
First, we load the datasets using the load_sentences method defined in the cell below. We are going to load the full dataset (so we can calculate vocab and max_length sizes), and the train and test data.

Next, we tokenize the data. Tokenization is the process of mapping words to integers. We are actually going to create separate tokenizers for our input and response data. Why? Because right now, that makes the code run. We can experiment with using one tokenizer later.

In [2]:
######################
# Load data
######################
def load_sentences(filename):
    """Load and return the pickled object stored at ``filename``.

    The file is opened in binary mode inside a ``with`` block so the handle
    is closed as soon as unpickling finishes (or fails).

    NOTE: unpickling can execute arbitrary code, so only point this at
    pickle files produced by this project.
    """
    with open(filename, "rb") as handle:
        return load(handle)

# Directory that holds the pickled utterance/response pairs.
filepath = "../../resources/"
# Load the full corpus first, then its train and test partitions.
dataset, train, test = (
    load_sentences(filepath + pickle_name)
    for pickle_name in ("utt-resp-both.pkl", "utt-resp-train.pkl", "utt-resp-test.pkl")
)
# Quick sanity check on the split sizes.
print("Train entries: ", len(train))
print("Test entries: ", len(test))

Train entries:  27000
Test entries:  3000


In [3]:
######################
# Tokenizer methods
######################
def create_tokenizer(lines):
    """Build a Keras ``Tokenizer`` and fit its vocabulary on ``lines``.

    lines: the texts passed to ``Tokenizer.fit_on_texts``.
    Returns the fitted tokenizer.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted
def max_length(lines):
    """Return the largest whitespace-separated word count found in ``lines``."""
    word_counts = [len(text.split()) for text in lines]
    return max(word_counts)

In [4]:
######################
# Tokenize
######################
# NOTE(review): here column 0 is treated as the utterance and column 1 as the
# response, but the encoding cell further down feeds column 1 to the utterance
# tokenizer and column 0 to the response tokenizer. Confirm the column order of
# the pickled pairs and make the two cells agree.

# --- utterance side: tokenizer, vocabulary size, longest phrase ---
utterance_tokenizer = create_tokenizer(dataset[:, 0])
# +1 because Keras word indices start at 1; index 0 is left for padding
utterance_vocab_size = 1 + len(utterance_tokenizer.word_index)
utterance_length = max_length(dataset[:, 0])

# --- response side: same three quantities ---
response_tokenizer = create_tokenizer(dataset[:, 1])
response_vocab_size = 1 + len(response_tokenizer.word_index)
response_length = max_length(dataset[:, 1])

# report the statistics that size the model's input and output layers
print("Utterance vocabulary size: %d" % utterance_vocab_size)
print("Utterance max length: %d" % utterance_length)
print("Response vocabulary size: %d" % response_vocab_size)
print("Response max length: %d" % response_length)

Utterance vocabulary size: 9770
Utterance max length: 83
Response vocabulary size: 8164
Response max length: 88


# Encoding
We need to encode each utterance-response sequence to integers, and pad each encoding to the maximum phrase length (so that every sequence of encoded integers is the same length).

We need the encodings to be the same length because we are going to use a word embedding for the input sequences and one hot encode the output sequences.

In [32]:
######################
# Encoding methods
######################
def encode_input(tokenizer, length, lines):
    """Convert ``lines`` to integer sequences and post-pad them to ``length``.

    tokenizer: a fitted tokenizer exposing ``texts_to_sequences``.
    length:    the ``maxlen`` handed to ``pad_sequences``.
    lines:     the texts to encode.
    Returns whatever ``pad_sequences`` returns for those sequences.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding="post",
    )
#this method one-hot encodes the output (responses). 
#we do this because we want the model to predict the probability of each word in the vocabulary as an output.

def old_encode_output(sequences, vocab_size):
    """Legacy one-hot encoder, kept for reference only.

    The original author flagged this version as error-prone; prefer
    ``encode_output``. Each row of ``sequences`` is one-hot encoded with
    ``to_categorical`` and the results are stacked and reshaped to
    (sequences.shape[0], sequences.shape[1], vocab_size).
    """
    # to_categorical turns a vector of class ids into a binary class matrix
    one_hot_rows = [to_categorical(row, num_classes=vocab_size) for row in sequences]
    stacked = array(one_hot_rows)
    n_rows, n_steps = sequences.shape[0], sequences.shape[1]
    return stacked.reshape(n_rows, n_steps, vocab_size)

def encode_output(sequences, vocab_size, dtype=np.float32):
    """One-hot encode a 2-D array of word indices.

    Parameters
    ----------
    sequences : 2-D array of ints, shape (n_sequences, n_timesteps)
        Word indices, each in ``range(vocab_size)``.
    vocab_size : int
        Number of classes, i.e. the length of every one-hot vector.
    dtype : numpy dtype, optional
        Element type of the result. Defaults to float32, which needs half the
        memory of the float64 buffer used previously. The result is still
        n_sequences * n_timesteps * vocab_size elements, so encode only as
        many sequences as fit in RAM.

    Returns
    -------
    numpy.ndarray of shape (n_sequences, n_timesteps, vocab_size) where
    ``out[i, t, sequences[i, t]] == 1`` and every other entry is 0.
    """
    sequences = np.asarray(sequences, dtype="int")
    n_sequences, n_timesteps = sequences.shape
    output_array = np.zeros((n_sequences, n_timesteps, vocab_size), dtype=dtype)
    # Broadcast (row, timestep) coordinates against the class ids so every
    # "hot" cell is set in one vectorised assignment, with no per-row temporaries.
    rows = np.arange(n_sequences)[:, None]
    steps = np.arange(n_timesteps)[None, :]
    output_array[rows, steps, sequences] = 1
    return output_array

In [35]:
######################
# Encode data
######################
# Number of training pairs that get one-hot encoded. The dense target array has
# n_train * response_length * response_vocab_size entries, so this must stay
# small enough to fit in RAM (an earlier run of this cell ended in MemoryError).
n_train = min(5000, len(train))

# NOTE(review): the tokenizers were fitted with utterance = column 0 and
# response = column 1, but this cell feeds column 1 to the utterance tokenizer
# and column 0 to the response tokenizer. Left unchanged; confirm the column
# order of the pickled pairs.

#training data
print("Training utterance:")
train_utterance = encode_input(utterance_tokenizer, utterance_length, train[:, 1])
print("Training response (input):")
train_response_ids = encode_input(response_tokenizer, response_length, train[:, 0])
# Shuffle inputs and targets with the SAME permutation so that pair i stays
# pair i, then keep the first n_train pairs. (Shuffling only the responses, as
# before, broke the utterance/response pairing.)
order = np.random.permutation(len(train_response_ids))[:n_train]
train_utterance = train_utterance[order]
print("Training response (output):")
# One 3-D array (n_train, timesteps, vocab), with the same leading dimension as
# train_utterance, so model.fit receives matching inputs and targets.
train_response = encode_output(train_response_ids[order], response_vocab_size)
print("Training done.")

#test data
print("Testing utterance:")
test_utterance = encode_input(utterance_tokenizer, utterance_length, test[:, 1])
print("Testing response (input):")
test_response_ids = encode_input(response_tokenizer, response_length, test[:, 0])
print("Testing response (output):")
test_response = encode_output(test_response_ids, response_vocab_size)

Training utterance:
Training response (input):
Training response (output):


MemoryError: 

# Create model
We will create an encoder-decoder LSTM.

# What is a timestep?
A timestep is a Keras thing. Here is the formal definition:

The specified number of timesteps defines the number of input variables (X) used to predict the next time step (y).

So, basically: a timestep is the "memory" of an LSTM: it's how many inputs we are remembering (I think). In this case, we are using the max_length of an utterance/response as our timestep. This means that for every predicted word, we are taking into account every other word we have predicted so far. Likewise, when we train, we are learning weights for a word based on every previous word in a sentence (this is what we want for an encoder-decoder model!!)

In [14]:
######################
# Methods to create model
######################
def create_model(input_vocab, output_vocab, input_timesteps, output_timesteps, n_units):
    """Assemble the (uncompiled) encoder-decoder LSTM as a Keras ``Sequential``.

    input_vocab / output_vocab:           vocabulary sizes of the two sides.
    input_timesteps / output_timesteps:   padded sequence lengths of the two sides.
    n_units:                              embedding width and LSTM hidden size.
    """
    # Encoder: embed the input ids (0 is masked) and summarise them in one vector.
    encoder_layers = [
        Embedding(input_vocab, n_units, input_length=input_timesteps, mask_zero=True),
        LSTM(n_units),
    ]
    # Decoder: repeat that vector once per output step, run a second LSTM over
    # it, and apply a softmax over the output vocabulary at every step.
    decoder_layers = [
        RepeatVector(output_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(output_vocab, activation="softmax")),
    ]
    return Sequential(encoder_layers + decoder_layers)

In [15]:
######################
# Create and compile model
######################
#We can change the number of hidden units (right now its 256)
#increasing the number of hidden units will increase performance and training time
n_units = 256
#We can change the loss function (right now its categorical_crossentropy)
model = create_model(utterance_vocab_size, response_vocab_size, utterance_length, response_length, n_units)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['accuracy'])
#summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()
#Uncomment (together with the plot_model import) to write the model's shape to model.png;
#I thought we might want to use the image for our final presentation :)
#plot_model(model, to_file="model.png", show_shapes=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 83, 256)           2501120   
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 88, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 88, 256)           525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 88, 8164)          2098148   
Total params: 5,649,892
Trainable params: 5,649,892
Non-trainable params: 0
_________________________________________________________________
None


# Train the model
Right now the code below uses 50 epochs (the original default was 30) and a batch_size of 64. We can always up the number of epochs if we aren't getting good performance.

In [20]:
filename= "model.test5"
numEpochs = 50 #30 default
batchSize = 64 #64 default
# Save the model to `filename` every time the validation loss improves.
checkpoint = ModelCheckpoint(filename, monitor="val_loss", verbose=1, save_best_only=True, mode="min")
# best performance with 40 epochs, 64 batch size
model.fit(
    train_utterance,
    train_response,
    epochs=numEpochs,
    batch_size=batchSize,
    validation_data=(test_utterance, test_response),
    callbacks=[checkpoint],
    verbose=2,
)
# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

Train on 3000 samples, validate on 3000 samples
Epoch 1/1
 - 123s - loss: 3.0537 - acc: 0.8797 - val_loss: 0.9317 - val_acc: 0.8988

Epoch 00001: val_loss improved from inf to 0.93171, saving model to model.test5
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 83, 256)           2501120   
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 88, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 88, 256)           525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 88, 8164)          2098148   
Total params: 5,649,892
Trainable params: 5,649,892
Non-train

# Evaluate the model

In [21]:
#reload the datasets (just in case)
#same directory prefix as the first load, so this cell works from the same working directory
dataset = load_sentences(filepath + "utt-resp-both.pkl")
train = load_sentences(filepath + "utt-resp-train.pkl")
test = load_sentences(filepath + "utt-resp-test.pkl")
#create tokenizers
utterance_tokenizer = create_tokenizer(dataset[:, 0])
response_tokenizer = create_tokenizer(dataset[:, 1])
#define vocabulary sizes
utterance_vocab_size = len(utterance_tokenizer.word_index) + 1
response_vocab_size = len(response_tokenizer.word_index) + 1
#define max_lengths
utterance_length = max_length(dataset[:, 0])
response_length = max_length(dataset[:, 1])
#datasets
train_utt = encode_input(utterance_tokenizer, utterance_length, train[:, 1])
#the test inputs must come from the test split (this used to re-encode train)
test_utt = encode_input(utterance_tokenizer, utterance_length, test[:, 1])

In [24]:
######################
# Evaluation methods
######################
def get_word(integer, tokenizer):
    """Reverse-lookup: return the word whose tokenizer index is ``integer``.

    integer:   a word index (a plain int or a NumPy integer).
    tokenizer: an object with a ``word_index`` dict (word -> index) and,
               optionally, an ``index_word`` dict (index -> word).
    Returns the word, or None when no word has that index.
    """
    # Use the ready-made reverse map when the tokenizer provides one: this is a
    # single dict lookup instead of a scan over the whole vocabulary per word.
    index_word = getattr(tokenizer, "index_word", None)
    if index_word:
        return index_word.get(integer)
    # Fallback for tokenizers without a reverse map: linear scan.
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
def get_prediction(model, tokenizer, source):
    """Predict a response for ``source`` and return it as a string of words.

    Only the first item of ``model.predict(source)`` is decoded. At each
    timestep the highest-scoring index is mapped back to a word with
    ``get_word``; decoding stops at the first index that has no word.
    """
    step_scores = model.predict(source, verbose=0)[0]
    words = []
    for scores in step_scores:
        token = get_word(argmax(scores), tokenizer)
        if token is None:
            # no word for this index: treat it as the end of the sentence
            break
        words.append(token)
    return " ".join(words)
def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Predict a response for every encoded source and print BLEU-1..4.

    model:       trained model passed on to ``get_prediction``.
    tokenizer:   tokenizer used to turn predicted indices back into words.
    sources:     2-D array of encoded inputs, one row per example.
    raw_dataset: the raw text pairs; row i is unpacked as (target, source)
                 and must correspond to ``sources[i]``.

    The first 10 source / target / prediction triples are printed so the
    output can be eyeballed; a low BLEU score would not be surprising.
    """
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # the model expects a batch dimension, so make this row a batch of one
        source = source.reshape((1, source.shape[0]))
        # decode with the tokenizer the caller passed in (a global was used here before)
        translation = get_prediction(model, tokenizer, source)
        raw_target, raw_source = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_source, raw_target, translation))
        # corpus_bleu wants, per hypothesis, a LIST of reference token lists;
        # here there is exactly one reference per example
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score
    bleu_settings = (
        ('BLEU-1', (1.0, 0, 0, 0)),
        ('BLEU-2', (0.5, 0.5, 0, 0)),
        ('BLEU-3', (0.3, 0.3, 0.3, 0)),
        ('BLEU-4', (0.25, 0.25, 0.25, 0.25)),
    )
    for label, weights in bleu_settings:
        print('%s: %f' % (label, corpus_bleu(actual, predicted, weights=weights)))

In [29]:
######################
# Evaluate
######################
model = load_model("model.test5")
# The training targets were encoded with response_tokenizer, so the indices the
# model predicts have to be decoded with that same tokenizer.
#evalute on training data (this should be pretty good)
print("Model on training data:")
evaluate_model(model, response_tokenizer, train_utt, train)
#evaluate on test data
print("Model on test data:")
evaluate_model(model, response_tokenizer, test_utt, test)

Model on training data:
src=[sd	That was a great one.], target=[sd	I'll have to write that down.], predicted=[]
src=[^2	That's your weekend.], target=[sd	and that's kind of the our social for the,], predicted=[]
src=[sd	and then } she had to go back in a couple of times,], target=[b	Yeah.], predicted=[]
src=[b	Yeah.], target=[sd	when, previous secretary of, educa , Bennett, I think his name was, became the drug czar, for, President Bush, he was going to focus on this area and do something about it], predicted=[]
src=[sv	Well, you are not from that area originally, I can tell.], target=[sd	But, I didn't.], predicted=[]
src=[b	Huh uh.], target=[sd	and she stays home, too, also.], predicted=[]
src=[sv	but sometimes, it's baby sitting.], target=[b	Yeah.], predicted=[]
src=[^h	Well, let's see,], target=[sd	and I feel like a native.], predicted=[]
src=[sd	I never had known anyone to play one before.], target=[b	Uh huh.], predicted=[]
src=[sd	Now spaghetti's such an easy one.], target=[sd	I d

KeyboardInterrupt: 