In [2]:
import pickle
import numpy as np

In [3]:
with open('train_qa.txt','rb') as f:
    train_data = pickle.load(f)

In [4]:
with open('test_qa.txt','rb') as f:
    test_data = pickle.load(f)

In [5]:
type(test_data)

list

In [6]:
#TRAIN-TEST SPLIT ALREADY DONE
#LIST OF TUPLES

In [7]:
' '.join(train_data[0][0])#DATA; QUERY; ANSWER

'Mary moved to the bathroom . Sandra journeyed to the bedroom .'

In [8]:
#CREATE A VOCABULARY

all_data = test_data + train_data

#SET
vocab = set()

for story, question, answer in all_data: #tuple unpacking
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))

In [9]:
vocab.add('no')
vocab.add('yes')

In [10]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [11]:
#WE ARE LIMITED TO THESE WORDS WHEN ASKING A QUERY
vocab_len = len(vocab) + 1 #PLACEHOLDER WHEN USING KERAS

In [12]:
#HOW LONG IS THE LONGEST STORY AND LONGEST QUESTION

In [13]:
#LONGEST STORY
all_story_lens = [len(data[0]) for data in all_data]

max_story_len = max(all_story_lens)

In [14]:
#LONGEST QUESTION
max_question_len = max([len(data[1]) for data in all_data])

In [15]:
### PART 2: USING KERAS WITH THIS DATA

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

2023-06-08 02:31:42.418902: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [17]:
from keras.preprocessing.text import Tokenizer

In [18]:
#INSTANTIATE: NO PUNCTUATION FILTERS
tokenizer = Tokenizer(filters=[])

#CREATE DICTIONARY: WORD:ID_NUMBER
tokenizer.fit_on_texts(vocab)

In [19]:
#VECTORISATION FOR STORY, QUESTION, ANSWERS

In [20]:
train_story_text = []
train_question_text = []
train_answers = []

In [21]:
for story, question, answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    train_answers.append(answer)
    

In [22]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)
train_question_seq = tokenizer.texts_to_sequences(train_question_text)
train_answers_seq = tokenizer.texts_to_sequences(train_answers)

In [23]:
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=max_story_len, max_question_len=max_question_len):
    
    #VECTORISE INTO PADDED SEQUENCES
    
    #STORIES
    X = []
    
    #QUESTIONS
    Xq  = []
    
    #CORRECT ANSWER [YES/NO]
    Y = []
    
    for story, query, answer in data:
        #FOR EACH STORY; LIST OF WORD INDICES
        x = [word_index[word.lower()] for word in story]
        
        #FOR EACH QUESTION; LIST OF WORD INDICES
        xq = [word_index[word.lower()] for word in query]
        
        #ANSWER
        y = np.zeros(len(word_index)+1)
        
        #USE NUMPY LOGIC [YES/NO]
        y[word_index[answer]] = 1
        
        X.append(x)
        Xq.append(xq)
        Y.append(y)
        
        
    return (pad_sequences(X, maxlen=max_story_len), pad_sequences(Xq,maxlen=max_question_len), np.array(Y))



In [24]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)

In [25]:
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

In [26]:
tokenizer.word_index['yes']

18

In [27]:
tokenizer.word_index['no']

34

In [28]:
sum(answers_test)

array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0., 497.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0., 503.,   0.,   0.,   0.])

In [29]:
##PART 3: CREATING THE MODEL WITH KERAS

In [30]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding

ModuleNotFoundError: No module named 'keras.layers.embeddings'

In [31]:
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM

In [32]:
from tensorflow.keras.layers import Embedding


In [33]:
#CREATE PLACEHOLDERS USING INPUT

#PLACEHOLDER shape = (max_story_len, batch_size)
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

In [34]:
vocab_size = vocab_len

In [35]:
#INPUT ENCODER M

#INPUT GETS EMBEDDED TO A SEQUENCE OF VECTORS
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
input_encoder_m.add(Dropout(0.3))#30% OF NEURONS TURNED OFF TO AVOID OVERFITTING

2023-06-08 02:31:54.748230: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [36]:
#INPUT ENCODER C

#INPUT GETS EMBEDDED TO A SEQUENCE OF VECTORS
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size, output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))#30% OF NEURONS TURNED OFF TO AVOID OVERFITTING

In [37]:
#M - > SAMPLES, STORY_MAXLEN, EMBEDDING_DIM
#C -> SAMPLES, STORY_MAXLEN, MAX_QUESTION_LEN
#Q -> SAMPLES, QUERY_MAXLEN, EMBEDDING_DIM

In [38]:
#QUESTION ENCODER
#INPUT GETS EMBEDDED TO A SEQUENCE OF VECTORS
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=max_question_len))
question_encoder.add(Dropout(0.3))#30% OF NEURONS TURNED OFF TO AVOID OVERFITTING

In [39]:
#PASS IN THE PLACEHOLDERS INTO THE ENCODERS


#ENCODED <--- ENCODER(INPUT)
#RESULTS
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [40]:
match = dot([input_encoded_m,question_encoded], axes = (2,2))
match = Activation('softmax')(match)

In [41]:
#ADD THIS MATCH MATRIX WITH SECOND INPUT VECTOR SEQUENCE
response = add([match, input_encoded_c])
response = Permute((2,1))(response)

In [42]:
#CONCATENATE THE MATCH MATRIX WITH QUESTION VECTOR SEQUENCE
answer = concatenate([response, question_encoded])
answer #CORRECT: (NONE, 6, 220)

<KerasTensor: shape=(None, 6, 220) dtype=float32 (created by layer 'concatenate')>

In [43]:
#REDUCE WITH A RNN: CHOOSE A LSTM
answer = LSTM(32)(answer)

In [44]:
#PREVENT OVERFITTING: DROPOUT
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer) #SAMPLES,VOCAB_SIZE #YES/NO

In [45]:
#turn into 0/1 with softmax
answer = Activation('softmax')(answer)

In [46]:
model = Model([input_sequence,question],answer)

In [47]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics = ['accuracy'])

In [48]:
##PART - 4: TRAIN, EVALUATE THE MODEL

In [49]:
history = model.fit([inputs_train, queries_train], answers_train, batch_size=32,epochs=250,validation_data = ([inputs_test, queries_test],answers_test))

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250


Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78/250
Epoch 79/250
Epoch 80/250
Epoch 81/250
Epoch 82/250
Epoch 83/250
Epoch 84/250
Epoch 85/250
Epoch 86/250
Epoch 87/250
Epoch 88/250
Epoch 89/250
Epoch 90/250
Epoch 91/250
Epoch 92/250
Epoch 93/250
Epoch 94/250
Epoch 95/250
Epoch 96/250
Epoch 97/250
Epoch 98/250
Epoch 99/250
Epoch 100/250
Epoch 101/250
Epoch 102/250
Epoch 103/250
Epoch 104/250
Epoch 105/250
Epoch 106/250
Epoch 107/250
Epoch 108/250
Epoch 109/250
Epoch 110/250
Epoch 111/250
Epoch 112/250
Epoch 113/250
Epoch 114/250


Epoch 115/250
Epoch 116/250
Epoch 117/250
Epoch 118/250
Epoch 119/250
Epoch 120/250
Epoch 121/250
Epoch 122/250
Epoch 123/250
Epoch 124/250
Epoch 125/250
Epoch 126/250
Epoch 127/250
Epoch 128/250
Epoch 129/250
Epoch 130/250
Epoch 131/250
Epoch 132/250
Epoch 133/250
Epoch 134/250
Epoch 135/250
Epoch 136/250
Epoch 137/250
Epoch 138/250
Epoch 139/250
Epoch 140/250
Epoch 141/250
Epoch 142/250
Epoch 143/250
Epoch 144/250
Epoch 145/250
Epoch 146/250
Epoch 147/250
Epoch 148/250
Epoch 149/250
Epoch 150/250
Epoch 151/250
Epoch 152/250
Epoch 153/250
Epoch 154/250
Epoch 155/250
Epoch 156/250
Epoch 157/250
Epoch 158/250
Epoch 159/250
Epoch 160/250
Epoch 161/250
Epoch 162/250
Epoch 163/250
Epoch 164/250
Epoch 165/250
Epoch 166/250
Epoch 167/250
Epoch 168/250
Epoch 169/250
Epoch 170/250
Epoch 171/250


Epoch 172/250
Epoch 173/250
Epoch 174/250
Epoch 175/250
Epoch 176/250
Epoch 177/250
Epoch 178/250
Epoch 179/250
Epoch 180/250
Epoch 181/250
Epoch 182/250
Epoch 183/250
Epoch 184/250
Epoch 185/250
Epoch 186/250
Epoch 187/250
Epoch 188/250
Epoch 189/250
Epoch 190/250
Epoch 191/250
Epoch 192/250
Epoch 193/250
Epoch 194/250
Epoch 195/250
Epoch 196/250
Epoch 197/250
Epoch 198/250
Epoch 199/250
Epoch 200/250
Epoch 201/250
Epoch 202/250
Epoch 203/250
Epoch 204/250
Epoch 205/250
Epoch 206/250
Epoch 207/250
Epoch 208/250
Epoch 209/250
Epoch 210/250
Epoch 211/250
Epoch 212/250
Epoch 213/250
Epoch 214/250
Epoch 215/250
Epoch 216/250
Epoch 217/250
Epoch 218/250
Epoch 219/250
Epoch 220/250
Epoch 221/250
Epoch 222/250
Epoch 223/250
Epoch 224/250
Epoch 225/250
Epoch 226/250
Epoch 227/250


Epoch 228/250
Epoch 229/250
Epoch 230/250
Epoch 231/250
Epoch 232/250
Epoch 233/250
Epoch 234/250
Epoch 235/250
Epoch 236/250
Epoch 237/250
Epoch 238/250
Epoch 239/250
Epoch 240/250
Epoch 241/250
Epoch 242/250
Epoch 243/250
Epoch 244/250
Epoch 245/250
Epoch 246/250
Epoch 247/250
Epoch 248/250
Epoch 249/250
Epoch 250/250


In [50]:
#EVALUATING ON GIVEN TEST SET


#PREDICT RESULTS
pred_results = history.predict(([inputs_test, queries_test]))

AttributeError: 'History' object has no attribute 'predict'

In [None]:
#test_data -> list of tuples [story, question, answer]
#pred_results has probabilities for every single word

#max probability
val_max = np.argmax(pred_results[0])

#print the word
for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key

In [None]:
#ANSWER = 
k

In [None]:
#PROBABILITY OF BEING SURE OF THAT ANSWER
pred_results[0][val_max]

In [None]:
#FOR YOUR OWN STORIES; YOU CAN ONLY USE WORDS FROM THAT VOCABULARY

#SPACE BEFORE AND AFTER PUNCTUATION: SAME FORMAT AS TRAIN_DATA
my_story = "John left the kitches . Sandra dropped the football in tthe garden ."

In [None]:
my_question = 'Is the football in the garden ?' #SAME FORMAT AS TRAINING DATA

In [None]:
my_data = [(my_story.split(), my_question.split(), 'yes')] 

In [None]:
#VECTORIZED
my_story, my_ques, my_ans = vectorize_stories(my_data)

In [None]:
#PREDICT ONLY AFTER STORY AND QUESTION
pred_results = model.predict(([my_story, my_ques]))

In [None]:
val_max = np.argmax(pred_results[0])

In [None]:
for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key

In [None]:
k

In [None]:
pred_results[0][val_max]