# Building a Chat Bot with Python


## PT 1

* Load the Data
* Explore the Data Format
* Create a vocabulary

In [None]:
import pickle
import numpy as np

In [None]:
# Load the pickled training samples from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open this file if it comes from a trusted source.
with open('train_qa.txt','rb') as f:
    train_data = pickle.load(f)

In [None]:
# Load the pickled test samples from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open this file if it comes from a trusted source.
with open('test_qa.txt','rb') as f:
    test_data = pickle.load(f)

In [None]:
type(train_data)

In [None]:
len(train_data), len(test_data)

In [None]:
# Show the first training sample in readable form: token lists are joined
# back into a sentence, plain strings are printed unchanged.
for item in train_data[0]:
    if isinstance(item, list):
        print(' '.join(item))
    elif isinstance(item, str):
        print(item)

In [None]:
all_data = test_data + train_data

In [None]:
vocab = set()

# Collect every distinct token that appears in any story or question.
# update() adds to the set in place instead of building a new set twice
# per sample, as the previous union()-and-rebind form did.
for story, question, answer in all_data:
    vocab.update(story, question)
    

In [None]:
vocab

In [None]:
vocab.add('no')
vocab.add('yes')

In [None]:
vocab

In [None]:
# +1 placeholder for keras pad sequence
vocab_len = len(vocab)+1
vocab_len

In [None]:
# find Longest Story
all_story_lengths = [len(data[0]) for data in all_data]

In [None]:
max_story_len = np.max(all_story_lengths)
np.max(all_story_lengths), np.argmax(all_story_lengths)

In [None]:
# find Longest Question
all_question_lengths = [len(data[1]) for data in all_data]

max_question_len = np.max(all_question_lengths)
# Display the longest question length and the index of the sample that has it.
# argmax has to run over the list of lengths; running it over the scalar
# maximum would always yield 0.
max_question_len, np.argmax(all_question_lengths)

## PT 2

using keras preprocessing for padding sequences and tokenizing data

* understand how to vectorize the data

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# `filters` is documented as a string of characters to strip; the empty
# string disables filtering so punctuation tokens such as '.' and '?' stay
# in the vocabulary.
tokenizer = Tokenizer(filters='')
# Fit on a sorted copy of the vocabulary: iterating the raw set yields an
# order that can change between interpreter sessions, which would assign
# different indices to the same words on every run.
tokenizer.fit_on_texts(sorted(vocab))

In [None]:
tokenizer.word_index

In [None]:
# Separate the (story, question, answer) training triples into three
# parallel lists, one per field.
train_story_text = [story for story, _question, _answer in train_data]
train_question_text = [question for _story, question, _answer in train_data]
train_answers = [answer for _story, _question, answer in train_data]

In [None]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [None]:
train_story_text[:2]

In [None]:
train_story_seq[:2]

In [None]:
def vectorize_stories(data,
                      word_index=tokenizer.word_index,
                      max_story_length=max_story_len,
                      max_question_length=max_question_len
                     ):
    """Turn (story, question, answer) triples into padded index arrays.

    Args:
        data: iterable of (story, question, answer) where story and question
            are lists of word tokens and answer is a single word.
        word_index: mapping from lower-cased word to integer index.
        max_story_length: length every story sequence is padded/truncated to.
        max_question_length: length every question sequence is
            padded/truncated to.

    Returns:
        A tuple (stories, questions, answers): the padded story index
        sequences, the padded question index sequences, and an array with
        one one-hot row of width len(word_index) + 1 per answer.

    Raises:
        KeyError: if a token or answer is missing from word_index.
    """
    # stories = X, questions = Xq, correct answers (yes/no) = Y
    X = []
    Xq = []
    Y = []

    for story, query, answer in data:
        # Map each word to its integer index; lookups are lower-cased.
        x = [word_index[word.lower()] for word in story]
        xq = [word_index[word.lower()] for word in query]

        # One-hot answer vector; the extra slot mirrors the +1 that the
        # vocabulary size reserves for the padding placeholder.
        y = np.zeros(len(word_index) + 1)
        # Lower-cased for consistency with the story/question lookups.
        y[word_index[answer.lower()]] = 1

        X.append(x)
        Xq.append(xq)
        Y.append(y)

    # Pad with the lengths passed by the caller rather than the module-level
    # globals, so the max_*_length arguments actually take effect.
    return (
        pad_sequences(X, maxlen=max_story_length),
        pad_sequences(Xq, maxlen=max_question_length),
        np.array(Y)
    )

In [None]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)

inputs_test, queries_test, answers_test = vectorize_stories(test_data)




In [None]:
answers_test

In [None]:
inputs_test

In [None]:
inputs_train

In [None]:
answers_test

In [None]:
tokenizer.word_index['yes']

In [None]:
tokenizer.word_index['no']

In [None]:
sum(answers_test)

## PT 3

* build encoders: Input Encoder M, Input Encoder C, and Question Encoder
* build Neural Network
* complete the network

In [None]:
# from tensorflow.keras.models import Sequential, Model
# from keras.layers.embeddings import Embedding
# from tensorflow.keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM


In [None]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout
from keras.layers import add, dot, concatenate
from keras.layers import LSTM

In [None]:
# Placeholder shape = (batch_size, max_story_len); only the per-sample shape
# is passed to Input, the batch axis is left unspecified.

input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

In [None]:
# vocab_len
vocab_size = len(vocab)+1



In [None]:
# input encoder M
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
input_encoder_m.add(Dropout(0.3))

# Output
# (samples, story_maxlen, embedding_dim)


In [None]:

# input encoder C
# Unlike the other encoders, the embedding width here is max_question_len
# so that the encoded story can later be added element-wise to `match`.
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size, output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))

# Output
# (samples, story_maxlen, max_question_len)

In [None]:
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=max_question_len))
question_encoder.add(Dropout(0.3))

# (samples, query_maxlen, embedding_dim)

In [None]:
# Encoded <--- Encoder(Input)
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [None]:
# Dot product over the embedding axis (axis 2 of both tensors): scores every
# story position against every question position, giving
# (samples, story_maxlen, query_maxlen). The scores then go through softmax.
match = dot([input_encoded_m, question_encoded], axes=(2,2))
match = Activation('softmax')(match)

In [None]:
# Add the match scores to the C-encoded story, then swap the last two axes
# so the result is (samples, query_maxlen, story_maxlen).
response = add([match, input_encoded_c])
response = Permute((2,1))(response)

In [None]:
answer = concatenate([response,question_encoded])

In [None]:
answer

In [None]:
answer = LSTM(32)(answer)

In [None]:
answer = Dropout(0.5)(answer)

In [None]:
answer = Dense(vocab_size)(answer)

In [None]:
answer = Activation('softmax')(answer)

In [None]:
model = Model([input_sequence,question],answer)

In [None]:
model.compile(optimizer='rmsprop', 
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [None]:
model.summary()

## PT 4

* (show how to) fit/train model (load a pre-trained network)
* plot training history
* evaluate model
* write custom stories and questions

In [None]:
help(model.fit)

In [None]:
# Train on the vectorised training set and validate on the test set after
# every epoch; the returned History object is used for plotting below.
history = model.fit([inputs_train, queries_train],
                    answers_train,
                    batch_size=256, 
                    epochs=3, 
                    validation_data=([inputs_test, queries_test],answers_test)
                   )

In [None]:
# !conda install matplotlib -y

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
print(history.history.keys())
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
model.save('mybrandnewmodel.h5')

In [None]:
model.load_weights('chatbot_10.h5')

In [None]:
pred_results = model.predict(([inputs_test,queries_test]))

In [None]:
pred_results[0]

In [None]:
test_data[0]

In [None]:
val_max = np.argmax(pred_results[0])

In [None]:
# Map the winning output index back to its word. Indices in word_index are
# unique, so the first match is the only one. If nothing matches, k is set
# to None instead of being left undefined.
k = next(
    (word for word, index in tokenizer.word_index.items() if index == val_max),
    None,
)

In [None]:
k

In [None]:
pred_results[0][val_max]

In [None]:
my_story = 'John left the kitchen . Sandra dropped the football in the garden .'

In [None]:
my_story.split()

In [None]:
my_question = "Is the football in the garden ?"
my_question.split()

In [None]:
mydata = [(my_story.split(), my_question.split(), 'yes')]

In [None]:
mydata

In [None]:
my_story, my_ques, my_ans = vectorize_stories(mydata)

In [None]:
my_ans

In [None]:
pred_results = model.predict(([my_story,my_ques]))

In [None]:
val_max = np.argmax(pred_results[0])

In [None]:
# Map the winning output index back to its word. Indices in word_index are
# unique, so the first match is the only one. If nothing matches, k becomes
# None rather than silently keeping a value from an earlier cell.
k = next(
    (word for word, index in tokenizer.word_index.items() if index == val_max),
    None,
)

k

In [None]:
pred_results

In [None]:
pred_results[0][val_max]