In [1]:
# importing the required dependencies

import pickle
import numpy as np
# from keras.preprocessing.sequence import pad_sequences
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer


In [2]:
# Loading Test and Train Data

with open("train_qa.txt", "rb") as fp:   
    train_data =  pickle.load(fp)
with open("test_qa.txt", "rb") as fp:   
    test_data =  pickle.load(fp)

In [4]:
print(f"Type of train data : {type(train_data)} and len of train data {len(train_data)}")
print(f"Type of test data : {type(test_data)} and len of test data {len(test_data)}")

Type of train data : <class 'list'> and len of train data 10000
Type of test data : <class 'list'> and len of test data 1000


In [5]:
# checking the data 

print(train_data[0])
print(' '.join(train_data[0][0]), ' '.join(train_data[0][1]) , train_data[0][2])

(['Mary', 'moved', 'to', 'the', 'bathroom', '.', 'Sandra', 'journeyed', 'to', 'the', 'bedroom', '.'], ['Is', 'Sandra', 'in', 'the', 'hallway', '?'], 'no')
Mary moved to the bathroom . Sandra journeyed to the bedroom . Is Sandra in the hallway ? no


In [7]:
# combining the data 

all_data = test_data + train_data


In [8]:
# Setting up vocabulary of all words

# Create a set that holds the vocab words
vocab = set()

In [13]:
# Creating all the unique words from the combined data

for story, question , answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))

# also adding the answer from the data manually
vocab.add('no')
vocab.add('yes')

print(vocab)
print(len(vocab)) # total numbers of words

{'there', 'garden', '.', 'dropped', 'up', 'kitchen', 'Is', 'picked', 'travelled', 'bedroom', 'journeyed', 'apple', 'hallway', 'bathroom', 'football', 'to', '?', 'took', 'grabbed', 'in', 'John', 'Mary', 'moved', 'Daniel', 'left', 'yes', 'office', 'went', 'got', 'no', 'Sandra', 'the', 'back', 'put', 'down', 'milk', 'discarded'}
37


In [15]:
# finding the maximum len 
max_story_len = max([len(data[0]) for data in all_data]) # for story
max_question_len = max([len(data[1]) for data in all_data]) # for question
print(max_story_len)
print(max_question_len)

156
6


In [17]:
# Vectorizing the data

# from keras.preprocessing.sequence import pad_sequences
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer

vocab_size = len(vocab) + 1 # Reserve one space for padding

In [19]:
# integer encode sequences of words
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)
print(tokenizer.word_index)

{'there': 1, 'garden': 2, '.': 3, 'dropped': 4, 'up': 5, 'kitchen': 6, 'is': 7, 'picked': 8, 'travelled': 9, 'bedroom': 10, 'journeyed': 11, 'apple': 12, 'hallway': 13, 'bathroom': 14, 'football': 15, 'to': 16, '?': 17, 'took': 18, 'grabbed': 19, 'in': 20, 'john': 21, 'mary': 22, 'moved': 23, 'daniel': 24, 'left': 25, 'yes': 26, 'office': 27, 'went': 28, 'got': 29, 'no': 30, 'sandra': 31, 'the': 32, 'back': 33, 'put': 34, 'down': 35, 'milk': 36, 'discarded': 37}


In [20]:
# appending to the list story , question , answere seperately
train_story_text = []
train_question_text = []
train_answers_text = []

for story,question,answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    train_answers_text.append(answer)

In [23]:
print(train_question_text)
print(train_question_text[15])

[['Is', 'Sandra', 'in', 'the', 'hallway', '?'], ['Is', 'Daniel', 'in', 'the', 'bathroom', '?'], ['Is', 'Daniel', 'in', 'the', 'office', '?'], ['Is', 'Daniel', 'in', 'the', 'bedroom', '?'], ['Is', 'Daniel', 'in', 'the', 'bedroom', '?'], ['Is', 'Mary', 'in', 'the', 'bedroom', '?'], ['Is', 'Sandra', 'in', 'the', 'office', '?'], ['Is', 'Sandra', 'in', 'the', 'bathroom', '?'], ['Is', 'Sandra', 'in', 'the', 'bathroom', '?'], ['Is', 'Mary', 'in', 'the', 'kitchen', '?'], ['Is', 'Sandra', 'in', 'the', 'office', '?'], ['Is', 'Mary', 'in', 'the', 'hallway', '?'], ['Is', 'Mary', 'in', 'the', 'hallway', '?'], ['Is', 'Mary', 'in', 'the', 'hallway', '?'], ['Is', 'Mary', 'in', 'the', 'garden', '?'], ['Is', 'Sandra', 'in', 'the', 'office', '?'], ['Is', 'Sandra', 'in', 'the', 'bathroom', '?'], ['Is', 'Sandra', 'in', 'the', 'kitchen', '?'], ['Is', 'Mary', 'in', 'the', 'bedroom', '?'], ['Is', 'Mary', 'in', 'the', 'kitchen', '?'], ['Is', 'Daniel', 'in', 'the', 'bedroom', '?'], ['Is', 'Sandra', 'in', 'the',

In [24]:
# Also getting integers sequuence
train_story_seq = tokenizer.texts_to_sequences(train_story_text)
print(train_story_seq)

[[22, 23, 16, 32, 14, 3, 31, 11, 16, 32, 10, 3], [22, 23, 16, 32, 14, 3, 31, 11, 16, 32, 10, 3, 22, 28, 33, 16, 32, 10, 3, 24, 28, 33, 16, 32, 13, 3], [22, 23, 16, 32, 14, 3, 31, 11, 16, 32, 10, 3, 22, 28, 33, 16, 32, 10, 3, 24, 28, 33, 16, 32, 13, 3, 31, 28, 16, 32, 6, 3, 24, 28, 33, 16, 32, 14, 3], [22, 23, 16, 32, 14, 3, 31, 11, 16, 32, 10, 3, 22, 28, 33, 16, 32, 10, 3, 24, 28, 33, 16, 32, 13, 3, 31, 28, 16, 32, 6, 3, 24, 28, 33, 16, 32, 14, 3, 24, 8, 5, 32, 15, 1, 3, 24, 28, 16, 32, 10, 3], [22, 23, 16, 32, 14, 3, 31, 11, 16, 32, 10, 3, 22, 28, 33, 16, 32, 10, 3, 24, 28, 33, 16, 32, 13, 3, 31, 28, 16, 32, 6, 3, 24, 28, 33, 16, 32, 14, 3, 24, 8, 5, 32, 15, 1, 3, 24, 28, 16, 32, 10, 3, 21, 9, 16, 32, 27, 3, 31, 28, 16, 32, 2, 3], [31, 29, 32, 15, 1, 3, 22, 28, 16, 32, 10, 3], [31, 29, 32, 15, 1, 3, 22, 28, 16, 32, 10, 3, 24, 29, 32, 12, 1, 3, 31, 9, 16, 32, 13, 3], [31, 29, 32, 15, 1, 3, 22, 28, 16, 32, 10, 3, 24, 29, 32, 12, 1, 3, 31, 9, 16, 32, 13, 3, 31, 23, 16, 32, 2, 3, 22, 9, 1

In [25]:
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=max_story_len,max_question_len=max_question_len):

    # X = STORIES
    X = []
    # Xq = QUERY/QUESTION
    Xq = []
    # Y = CORRECT ANSWER
    Y = []
    
    
    for story, query, answer in data:
        
        # Grab the word index for every word in story
        x = [word_index[word.lower()] for word in story]
        # Grab the word index for every word in query
        xq = [word_index[word.lower()] for word in query]
        
        # Grab the Answers, Index 0 is reserved so we're going to use + 1
        y = np.zeros(len(word_index) + 1)
        
        # Now that y is all zeros and we know its just Yes/No , we can use numpy logic to create this assignment
    
        y[word_index[answer]] = 1
        
        # Append each set of story,query, and answer to their respective holding lists
        X.append(x)
        Xq.append(xq)
        Y.append(y)
        
    # Finally, pad the sequences based on their max length so the RNN can be trained on uniformly long sequences.
        
    # RETURN TUPLE FOR UNPACKING
    return (pad_sequences(X, maxlen=max_story_len),pad_sequences(Xq, maxlen=max_question_len), np.array(Y))

In [26]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)

In [28]:
# checking ....
for x in queries_train:
    print(x)

[ 7 31 20 32 13 17]
[ 7 24 20 32 14 17]
[ 7 24 20 32 27 17]
[ 7 24 20 32 10 17]
[ 7 24 20 32 10 17]
[ 7 22 20 32 10 17]
[ 7 31 20 32 27 17]
[ 7 31 20 32 14 17]
[ 7 31 20 32 14 17]
[ 7 22 20 32  6 17]
[ 7 31 20 32 27 17]
[ 7 22 20 32 13 17]
[ 7 22 20 32 13 17]
[ 7 22 20 32 13 17]
[ 7 22 20 32  2 17]
[ 7 31 20 32 27 17]
[ 7 31 20 32 14 17]
[ 7 31 20 32  6 17]
[ 7 22 20 32 10 17]
[ 7 22 20 32  6 17]
[ 7 24 20 32 10 17]
[ 7 31 20 32 14 17]
[ 7 31 20 32 10 17]
[ 7 24 20 32 27 17]
[ 7 24 20 32  6 17]
[ 7 31 20 32 14 17]
[ 7 31 20 32 27 17]
[ 7 21 20 32 27 17]
[ 7 31 20 32 27 17]
[ 7 31 20 32 13 17]
[ 7 21 20 32 14 17]
[ 7 21 20 32 10 17]
[ 7 22 20 32 13 17]
[ 7 21 20 32 10 17]
[ 7 24 20 32 14 17]
[ 7 31 20 32 13 17]
[ 7 22 20 32  6 17]
[ 7 22 20 32 14 17]
[ 7 31 20 32 27 17]
[ 7 22 20 32 14 17]
[ 7 31 20 32  2 17]
[ 7 22 20 32 27 17]
[ 7 21 20 32 14 17]
[ 7 21 20 32  2 17]
[ 7 21 20 32  2 17]
[ 7 24 20 32 13 17]
[ 7 22 20 32 27 17]
[ 7 24 20 32 14 17]
[ 7 22 20 32 13 17]
[ 7 24 20 32  6 17]


In [31]:
# doing for test data
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

In [32]:
tokenizer.word_index['yes']
tokenizer.word_index['no']

30

In [33]:
# creating the model

from keras.models import Sequential, Model
from keras.layers import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout
from keras.layers import add, dot, concatenate
from keras.layers import LSTM

input_sequence = Input((max_story_len,))
question = Input((max_question_len,))
vocab_size = len(vocab) + 1

input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size,output_dim=64))
input_encoder_m.add(Dropout(0.3))

input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size,output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))

question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,
                               output_dim=64,
                               input_length=max_question_len))
question_encoder.add(Dropout(0.3))

input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

match = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)
response = add([match, input_encoded_c])
response = Permute((2, 1))(response)  

answer = concatenate([response, question_encoded])
answer = LSTM(32)(answer)
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer)
answer = Activation('softmax')(answer)

model = Model([input_sequence, question], answer)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])

In [35]:
print(model.summary())

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 156)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 6)]          0           []                               
                                                                                                  
 sequential (Sequential)        (None, None, 64)     2432        ['input_1[0][0]']                
                                                                                                  
 sequential_2 (Sequential)      (None, 6, 64)        2432        ['input_2[0][0]']                
                                                                                              

In [36]:
# Only Training for 10 epochs
history = model.fit([inputs_train, queries_train], answers_train,batch_size=32,epochs=10,validation_data=([inputs_test, queries_test], answers_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [38]:
pred_results = model.predict(([inputs_test, queries_test]))



In [46]:
# Testing the model

story =' '.join(word for word in test_data[15][0])
query = ' '.join(word for word in test_data[15][1])

print(story)
print(query)
print("Answer :",test_data[15][2])

John journeyed to the hallway . John got the apple there .
Is John in the hallway ?
Answer : yes


In [47]:
#Generate prediction from model
val_max = np.argmax(pred_results[15])

for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key

print("Predicted answer is: ", k)
print("Probability of certainty was: ", pred_results[15][val_max])

Predicted answer is:  yes
Probability of certainty was:  0.51312137
