# Hermes
A sequence-to-sequence chatbot trained on Facebook Messenger data.

In [56]:
from bs4 import BeautifulSoup
import urllib
import pickle

import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential, load_model
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, Dropout, recurrent
from keras.layers.wrappers import Bidirectional
from keras.callbacks import Callback
from keras import backend as K

from random import sample
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [53]:
class Configuration(object):
    """Empty namespace object; settings are attached to an instance as attributes."""


# Special vocabulary markers
EOS = "<EOS>"
UNKNOWN_TOKEN = "<UNKNOWN>"

CONFIG = Configuration()

# Sequence limits, counted in timesteps
CONFIG.max_output_len = 10  # output sequence: 9 slots for words, 1 for <EOS>
CONFIG.max_input_len = 10  # input sequence

# Network hyperparameters
CONFIG.initialization = "he_normal"  # Gaussian initializer scaled by fan-in (He et al.)
CONFIG.hidden_size = 50  # original note reads "500" -- presumably the full-size setting; TODO confirm
CONFIG.amount_of_dropout = 0.1

# Training schedule
CONFIG.epochs = 500
CONFIG.batch_size = 100

# Scrape Facebook Messenger messages

In [9]:
# Read the exported Messenger HTML from disk.
# The context manager guarantees the file handle is closed, and plain open()
# works on both Python 2 and Python 3 (urllib.urlopen does not exist on Python 3).
with open('data/messages.htm', 'rb') as html_file:
    page = html_file.read()
soup = BeautifulSoup(page, "html5lib")

In [10]:
# Start from the <div> reached via soup.body.div and take the node that follows it.
# NOTE(review): this traversal depends on the exact layout of the exported HTML
# (next_sibling returns whatever node comes next) -- confirm against
# data/messages.htm if the export format changes.
contents_div = soup.body.div.next_sibling
# Get div of room groupchat
# (reached by descending two <div> levels from contents_div)
groupchat_div = contents_div.div.div

In [11]:
# Collect the text of every <p> element found inside the group chat div
groupchat_p = groupchat_div.find_all("p")
groupchat_messages = [paragraph.text for paragraph in groupchat_p]

In [12]:
# TODO: Clean
# Single-argument print(...) calls produce the same text under the Python 2
# print statement and the Python 3 print function; the bare `print x, y`
# form is a syntax error on Python 3.
print("Number of messages: {}".format(len(groupchat_messages)))
print(groupchat_messages[:10])

Number of messages: 9257
[u'leggo', u'hebrew bible leffo', u'looks like culture and belief is all that is left', u'Ez', u'workload is only ~9.4 hours per week we can do this guys', u'graduate level probability', u"stat 210 let's go", u'yah', u'different instructor though', u'according to my.harvard']


In [44]:
# Keep only messages with fewer than CONFIG.max_output_len space-separated tokens
filtered = [
    message
    for message in groupchat_messages
    if len(message.split(" ")) < CONFIG.max_output_len
]

groupchat_messages = filtered
print(len(groupchat_messages))

8255


In [40]:
# Create pickle dump of all messages
# The with-block closes the file once the dump is written, so the data is
# flushed to disk and the handle is not leaked.
with open('data/groupchat.p', 'wb') as dump_file:
    pickle.dump(groupchat_messages, dump_file)

# Prepare Data

In [14]:
# Build the vocabulary: every distinct space-separated token across all messages
tokens = list({
    token
    for message in groupchat_messages
    for token in message.split(' ')
})

In [48]:
# Build (question, answer) pairs: each message is the question and the message
# that follows it is the answer. Both are stored as lists of tokens.
questions = [message.split(" ") for message in groupchat_messages[:-1]]
answers = [message.split(" ") for message in groupchat_messages[1:]]

In [35]:
class Language(object):
    """Vocabulary with one-hot encoding and decoding of token sequences.

    Index 0 is reserved for the EOS token; every other token (UNKNOWN_TOKEN
    included) is assigned an index >= 1 in sorted order.
    """

    def __init__(self, tokens, maxlen):
        """Build the vocabulary.

        Args:
            tokens: iterable of known tokens. It is not modified.
            maxlen: number of timesteps in an encoded sequence, including the
                slot taken by the EOS marker.
        """
        # Work on a private set so the caller's list is left untouched;
        # building several Language objects from the same list must not
        # keep appending UNKNOWN_TOKEN to it.
        vocabulary = set(tokens)
        vocabulary.add(UNKNOWN_TOKEN)
        # EOS is pinned to index 0 below; drop it here so it is not listed twice.
        vocabulary.discard(EOS)

        # Reserve index 0 for the EOS token
        self.tokens = [EOS] + sorted(vocabulary)
        self.tokens_indices = dict((t, i) for i, t in enumerate(self.tokens))
        self.indices_tokens = dict(enumerate(self.tokens))
        self.maxlen = maxlen

    @property
    def size(self):
        """The number of unique tokens (EOS and UNKNOWN_TOKEN included)"""
        return len(self.tokens)

    def encode(self, l):
        """Encode a list of tokens as one-hot.

        Args:
            l: list of tokens; may be empty. Tokens missing from the
                vocabulary are encoded as UNKNOWN_TOKEN.

        Returns:
            Boolean array of shape (maxlen, size). Row i marks token i, the
            row after the last token marks EOS, remaining rows are all False.

        Raises:
            IndexError: if the tokens plus the EOS marker do not fit in maxlen rows.
        """
        n_tokens = len(l)
        if n_tokens >= self.maxlen:
            raise IndexError(
                "cannot encode %d tokens plus EOS into %d timesteps"
                % (n_tokens, self.maxlen))

        # Plain `bool` instead of the `np.bool` alias, which newer NumPy removed.
        X = np.zeros((self.maxlen, self.size), dtype=bool)
        unknown_index = self.tokens_indices[UNKNOWN_TOKEN]
        for i, item in enumerate(l):
            X[i, self.tokens_indices.get(item, unknown_index)] = True
        # Insert EOS token. Indexing by n_tokens (not the loop variable) keeps
        # this correct for an empty input, where the loop never runs.
        X[n_tokens, self.tokens_indices[EOS]] = True
        return X

    def decode(self, X):
        """Decode array of predicted token indices into a list of predicted tokens.

        Decoding stops at the first EOS index; the EOS itself is not included.
        """
        result = []
        for x in X:
            token = self.indices_tokens[x]
            if token == EOS:
                break
            result.append(token)

        return result

    def most_likely_seq_probs(self, X):
        """Return vector of probabilities for the most likely sequence.

        Args:
            X: iterable of per-timestep probability vectors.

        Returns:
            1-D float array holding, for each timestep before the first one
            whose most likely token is EOS, the probability of its most
            likely token.
        """
        eos_index = self.tokens_indices[EOS]
        probs = []

        for x in X:
            # Check for end-of-line token and stop if found
            if x.argmax() == eos_index:
                break
            probs.append(x.max())

        # dtype=float matches the float64 result of the previous np.append chain
        return np.array(probs, dtype=float)

def generate_sample_weights(Y):
    """Build per-timestep sample weights for the targets Y.

    Returns an array of shape (len(Y), CONFIG.max_output_len) holding 1 at
    (i, j) when Y[i, j] contains any truthy value and 0 otherwise.
    """
    n_steps = CONFIG.max_output_len
    weights = np.zeros((len(Y), n_steps))

    # position runs over every (sample, timestep) pair
    for position in np.ndindex(len(Y), n_steps):
        if np.any(Y[position]):
            weights[position] = 1

    return weights

In [49]:
# One-hot encode all inputs and outputs
language = Language(tokens, CONFIG.max_output_len)
X = np.array([language.encode(question) for question in questions])
# Encode each answer itself. The previous expression encoded `question` here:
# on Python 2 that is the loop variable leaked from the line above (the last
# question), so every target was identical; on Python 3 the name is undefined
# inside this comprehension.
Y = np.array([language.encode(answer) for answer in answers])

In [52]:
# Train-validation split
# A fixed random_state makes the split identical on every run, so validation
# metrics from different runs are measured on the same held-out examples.
SPLIT_SEED = 42
(X_train, X_val,
 Y_train, Y_val,
 questions_train, questions_val,
 answers_train, answers_val) = train_test_split(
    X, Y, questions, answers, test_size=0.1, random_state=SPLIT_SEED)
# Generate sample weights
sample_weights = generate_sample_weights(Y_train)

# Model Definition

In [66]:
def generate_model(output_dim):
    """Build, compile and summarize the encoder-decoder network.

    Args:
        output_dim: size of the last axis of both the input and the output
            (used in the input shape and as the width of the final Dense layer).

    Returns:
        The compiled Sequential model.
    """
    print('Building model...')

    def lstm(return_sequences, **extra):
        # Every LSTM shares the hidden size and the kernel initializer
        return recurrent.LSTM(
            CONFIG.hidden_size,
            return_sequences=return_sequences,
            kernel_initializer=CONFIG.initialization,
            **extra
        )

    def dropout():
        return Dropout(CONFIG.amount_of_dropout)

    sequence_shape = (None, output_dim)
    model = Sequential()

    # Encoder: two stacked LSTMs; the second one returns only its final output
    model.add(lstm(True, input_shape=sequence_shape))
    model.add(dropout())
    model.add(lstm(False, input_shape=sequence_shape))
    model.add(dropout())

    # Feed the encoder output to the decoder once per output timestep
    model.add(RepeatVector(CONFIG.max_output_len))

    # Decoder: two stacked LSTMs, each emitting a full sequence
    model.add(lstm(True))
    model.add(dropout())
    model.add(lstm(True))
    model.add(dropout())

    # Per-timestep distribution over tokens
    token_scores = Dense(output_dim, kernel_initializer=CONFIG.initialization)
    model.add(TimeDistributed(token_scores))
    model.add(Activation('softmax'))

    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
        sample_weight_mode="temporal",
    )
    print('Model successfully built')
    model.summary()
    return model

def print_predictions(model, output_language, X, X_text, Y_text):
    """Print the model's guess next to the expected answer for each example.

    Args:
        model: object exposing predict(batch, verbose=0), returning
            per-timestep token probabilities with a leading batch axis.
        output_language: object exposing decode(indices) and
            most_likely_seq_probs(probabilities).
        X: encoded inputs, one entry per example.
        X_text: token lists of the questions, aligned with X.
        Y_text: token lists of the expected answers, aligned with X.
    """
    print()
    for rowX, rowX_text, rowY_text in zip(X, X_text, Y_text):
        # Run the network once per example and take the class indices from
        # the same probabilities (argmax over the token axis). This replaces
        # a second forward pass through predict_classes, which recent Keras
        # releases no longer provide.
        preds_probs = model.predict(rowX[np.newaxis], verbose=0)
        preds = np.argmax(preds_probs, axis=-1)

        q = ' '.join(rowX_text)
        correct = ' '.join(rowY_text)
        guess = ' '.join(output_language.decode(preds[0]))
        confidences = output_language.most_likely_seq_probs(preds_probs[0])

        print('Question:', q)
        print('Answer:', correct)
        print('Guess:', guess)
        print('Confidences', confidences)
        print('---')
    print()

# Train

In [67]:
class OnEpochEndCallback(Callback):
    """Print predictions for a random handful of validation examples after every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Sample up to 50 validation examples and print the model's guesses for them."""
        # Never request more examples than the validation set holds;
        # random.sample raises ValueError when asked for too many.
        n_examples = min(50, len(X_val))
        mask = sample(range(len(X_val)), n_examples)

        X_sample = X_val[mask]
        # Index the Python lists directly: the token lists have different
        # lengths, so they do not form a regular NumPy array.
        X_text_sample = [questions_val[i] for i in mask]
        # The expected answers come from answers_val; previously questions_val
        # was used for both, so the question was printed as the "Answer" too.
        Y_text_sample = [answers_val[i] for i in mask]

        # Reuse the module-level `language` built during data preparation
        # rather than rebuilding the same vocabulary on every epoch.
        print_predictions(self.model, language, X_sample, X_text_sample, Y_text_sample)

In [None]:
# Initialize model
model = generate_model(language.size)

# Callbacks: print sample predictions each epoch; stop once val_loss stalls
ON_EPOCH_END_CALLBACK = OnEpochEndCallback()
STOP_CALLBACK = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.00001,
    patience=5,
)

# Fit on the training split, validating against the held-out split
history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=CONFIG.batch_size,
    epochs=CONFIG.epochs,
    validation_data=(X_val, Y_val),
    sample_weight=sample_weights,
    callbacks=[ON_EPOCH_END_CALLBACK, STOP_CALLBACK],
)

Building model...
Model successfully built
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_10 (LSTM)               (None, None, 50)          1659800   
_________________________________________________________________
dropout_9 (Dropout)          (None, None, 50)          0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 50)                20200     
_________________________________________________________________
dropout_10 (Dropout)         (None, 50)                0         
_________________________________________________________________
repeat_vector_3 (RepeatVecto (None, 10, 50)            0         
_________________________________________________________________
lstm_12 (LSTM)               (None, 10, 50)            20200     
_________________________________________________________________
dropout_11 (Dropout)         (Non