In [31]:
import pickle
import numpy as np

with open('train_qa.txt','rb') as f:
    train_data=pickle.load(f)

In [3]:
with open('test_qa.txt','rb') as f:
    test_data=pickle.load(f)

In [4]:
type(test_data)

list

In [5]:
all_data=test_data+ train_data

In [6]:
len(all_data)

11000

In [7]:
vocab =set()
for story,question,answer in all_data:
    vocab=vocab.union(set(story))
    vocab=vocab.union(set(question))

In [8]:
vocab.add('no')
vocab.add('yes')

In [9]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [11]:
vocab_len=len(vocab)+1

In [12]:
vocab_len

38

In [13]:
#Longest story

all_story_lens=[len(data[0]) for data in all_data]
max_story_len=max(all_story_lens)

In [14]:
#Longest question
all_question_lens=[len(data[1]) for data in all_data]
max_question_len=max(all_question_lens)

In [18]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [19]:
tokenizer=Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)

In [20]:
tokenizer.word_index

{'took': 1,
 '?': 2,
 'moved': 3,
 'sandra': 4,
 'put': 5,
 'in': 6,
 'hallway': 7,
 'the': 8,
 'up': 9,
 'down': 10,
 'office': 11,
 'there': 12,
 'grabbed': 13,
 'got': 14,
 'left': 15,
 'football': 16,
 'garden': 17,
 'no': 18,
 'apple': 19,
 'went': 20,
 'milk': 21,
 'yes': 22,
 '.': 23,
 'to': 24,
 'mary': 25,
 'journeyed': 26,
 'discarded': 27,
 'kitchen': 28,
 'bedroom': 29,
 'picked': 30,
 'travelled': 31,
 'back': 32,
 'bathroom': 33,
 'dropped': 34,
 'john': 35,
 'is': 36,
 'daniel': 37}

In [21]:
train_story_text=[]
train_question_text=[]
train_answers=[]

In [23]:
for story,question,answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    train_answers.append(answer)

In [25]:
#train_story_text

In [26]:
train_story_seq =tokenizer.texts_to_sequences(train_story_text)

In [28]:
#train_story_seq

In [34]:
def vectorize_stories(data,word_index=tokenizer.word_index,max_story_len=max_story_len,max_question_len=max_question_len):
    #Stories=X
    X=[]
    #Questions=Xq
    Xq=[]
    #Y Correct answer(yes/no)
    Y=[]
    
    for story,query,answer in data:
        x=[word_index[word.lower()] for word  in story]
        xq=[word_index[word.lower()] for word in query]
        y=np.zeros(len(word_index)+1)
        y[word_index[answer]]=1
        
        X.append(x)
        Xq.append(xq)
        Y.append(y)
        
    return (pad_sequences(X,maxlen=max_story_len),pad_sequences(Xq,maxlen=max_question_len),np.array(Y))

In [35]:
inputs_train , queries_train , answers_train =vectorize_stories(train_data)

In [36]:
inputs_test , queries_test , answers_test =vectorize_stories(test_data)

Creating Model

In [37]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout
from keras.layers import add, dot, concatenate
from keras.layers import LSTM

In [38]:
input_sequence=Input((max_story_len,))
question=Input((max_question_len,))

In [39]:
vocab_size=len(vocab)+1

In [40]:
#INPUT ENCODER M
input_encoder_m=Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size,output_dim=64))
input_encoder_m.add(Dropout(0.3))

In [41]:
#INPUT ENCODER C
input_encoder_c=Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size,output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))

In [42]:
#question ENCODER C
question_encoder=Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,output_dim=64,input_length=max_question_len))
question_encoder.add(Dropout(0.3))

In [44]:
#Encoded<---encoder(input)
input_encoded_m =input_encoder_m(input_sequence)
input_encoded_c =input_encoder_c(input_sequence)
question_encoded=question_encoder(question)

In [45]:
# shape: `(samples, story_maxlen, query_maxlen)`
match = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)

In [46]:
# add the match matrix with the second input vector sequence
response = add([match, input_encoded_c])  # (samples, story_maxlen, query_maxlen)
response = Permute((2, 1))(response)  # (samples, query_maxlen, story_maxlen)

In [47]:
answer=concatenate([response,question_encoded])

In [48]:
answer

<KerasTensor: shape=(None, 6, 220) dtype=float32 (created by layer 'concatenate')>

In [50]:
answer=LSTM(32)(answer)

In [51]:
# Regularization with Dropout
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer)  # (samples, vocab_size)

In [52]:
# we output a probability distribution over the vocabulary
answer = Activation('softmax')(answer)

# build the final model
model = Model([input_sequence, question], answer)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])

In [53]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 156)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 6)]          0                                            
__________________________________________________________________________________________________
sequential (Sequential)         (None, None, 64)     2432        input_1[0][0]                    
__________________________________________________________________________________________________
sequential_2 (Sequential)       (None, 6, 64)        2432        input_2[0][0]                    
______________________________________________________________________________________________

In [54]:
# train
history = model.fit([inputs_train, queries_train], answers_train,batch_size=32,epochs=120,validation_data=([inputs_test, queries_test], answers_test))

Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78

Epoch 114/120
Epoch 115/120
Epoch 116/120
Epoch 117/120
Epoch 118/120
Epoch 119/120
Epoch 120/120


In [55]:
filename = 'chatbot_120_epochs.h5'
model.save(filename)



In [57]:
import matplotlib.pyplot as plt
%matplotlib inline
print(history.history.keys())
# summarize history for accuracy
#plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])


KeyError: 'val_acc'

In [58]:
model.load_weights(filename)
pred_results = model.predict(([inputs_test, queries_test]))

In [59]:
#Generate prediction from model
val_max = np.argmax(pred_results[0])

for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key

print("Predicted answer is: ", k)
print("Probability of certainty was: ", pred_results[0][val_max])

Predicted answer is:  no
Probability of certainty was:  0.9999323


In [60]:
sq_pairs = [str(train_story_text[i])+" "+str(train_question_text[i]) for i in range(len(train_story_text))]

In [62]:
X_train=[inputs_train , queries_train] 
y_train=answers_train

In [63]:
X_test=[inputs_test , queries_test] 
y_test=answers_test

In [66]:
from sklearn.model_selection import train_test_split
X=sq_pairs
y=answers_test.tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [10000, 1000]

In [65]:
from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score, classification_report
svmmodel = SVC(kernel='linear', degree=3, C=1, decision_function_shape='ovo').fit(X_train, y_train)
accuracy = svmmodel.score(X_test, y_test)
f1score = f1_score(y_test, predictions, average='micro')
print(classification_report(y_test, predictions))
print("Accuracy of SVM with Polynomial Kernel:", accuracy)

ValueError: could not broadcast input array from shape (10000,156) into shape (10000)