<a href="https://colab.research.google.com/github/thiago2608santana/Natural_Language_Processing_with_Python/blob/main/Chat_Bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importar bibliotecas

In [1]:
import pickle
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM

# Carregar os dados

In [2]:
with open('/content/drive/MyDrive/NLP_with_Python_Datasets/train_qa.txt', 'rb') as f:
  train_data = pickle.load(f)

In [3]:
with open('/content/drive/MyDrive/NLP_with_Python_Datasets/test_qa.txt', 'rb') as f:
  test_data = pickle.load(f)

# Visualizar e entender os dados

In [4]:
type(train_data)

list

In [5]:
type(test_data)

list

In [6]:
len(train_data)

10000

In [7]:
len(test_data)

1000

**story**

In [8]:
' '.join(train_data[0][0])

'Mary moved to the bathroom . Sandra journeyed to the bedroom .'

**question**

In [9]:
' '.join(train_data[0][1])

'Is Sandra in the hallway ?'

**answer**

In [10]:
train_data[0][2]

'no'

# Preprocessamento dos dados

In [11]:
all_data = test_data + train_data

In [12]:
len(all_data)

11000

In [13]:
vocab = set()

for story, question, answer in all_data:
  vocab = vocab.union(set(story))
  vocab = vocab.union(set(question))

In [14]:
vocab.add('no')

In [15]:
vocab.add('yes')

In [16]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [17]:
vocab_size = len(vocab) + 1

In [18]:
vocab_size

38

**Longest story**

In [19]:
all_story_lens = [len(data[0]) for data in all_data]

In [20]:
max_story_len = max(all_story_lens)

In [21]:
max_story_len

156

**Longets question**

In [22]:
max_question_len = max([len(data[1]) for data in all_data])

In [23]:
max_question_len

6

# Tokenizar o dataset

In [24]:
tokenizer = Tokenizer(filters=[])

In [25]:
tokenizer.fit_on_texts(vocab)

In [26]:
tokenizer.word_index

{'put': 1,
 'in': 2,
 'hallway': 3,
 'back': 4,
 'took': 5,
 '.': 6,
 'journeyed': 7,
 'apple': 8,
 'dropped': 9,
 'left': 10,
 'moved': 11,
 'the': 12,
 'is': 13,
 'john': 14,
 'to': 15,
 'got': 16,
 'no': 17,
 'down': 18,
 'kitchen': 19,
 'there': 20,
 'bathroom': 21,
 'went': 22,
 'travelled': 23,
 'grabbed': 24,
 'up': 25,
 'yes': 26,
 'discarded': 27,
 'football': 28,
 'picked': 29,
 'office': 30,
 'daniel': 31,
 'milk': 32,
 'mary': 33,
 'garden': 34,
 'bedroom': 35,
 '?': 36,
 'sandra': 37}

In [27]:
train_story_text = []
train_question_text = []
train_answers = []

for story, question, answer in train_data:
  train_story_text.append(story)
  train_question_text.append(question)
  train_answers.append(answer)

In [28]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

# Função que vetoriza os dados (story, question, answer)

In [29]:
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=max_story_len, max_question_len=max_question_len):

  #Stories = X
  X = []
  #Questions = Xq
  Xq = []
  #Correct answer (yes/no) = Y
  Y = []

  for story, query, answer in data:

    x = [word_index[word.lower()] for word in story]
    xq = [word_index[word.lower()] for word in query]

    y = np.zeros(len(word_index)+1)
    y[word_index[answer]] = 1

    X.append(x)
    Xq.append(xq)
    Y.append(y)

  return (pad_sequences(X, maxlen=max_story_len), pad_sequences(Xq, maxlen=max_question_len), np.array(Y))

# Vetorizar os dados

In [30]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)

In [31]:
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

In [32]:
inputs_test

array([[ 0,  0,  0, ..., 12, 35,  6],
       [ 0,  0,  0, ..., 12, 34,  6],
       [ 0,  0,  0, ..., 12, 34,  6],
       ...,
       [ 0,  0,  0, ..., 12,  8,  6],
       [ 0,  0,  0, ..., 12, 34,  6],
       [ 0,  0,  0, ...,  8, 20,  6]], dtype=int32)

In [33]:
answers_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

**Verificar a quantidade de yes/no e checar a posição (index) correspondente. Tudo ok**

In [34]:
tokenizer.word_index['yes']

26

In [35]:
tokenizer.word_index['no']

17

In [36]:
sum(answers_test)

array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0., 503.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0., 497.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.])

**PLACEHOLDER shape=(max_story_len, batch_size)**

In [37]:
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

**Input Encoder M**

In [38]:
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
input_encoder_m.add(Dropout(0.3))
#Output: (samples, story_maxlen, embedding_dim)

**Input Encoder C**

In [39]:
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size, output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))
#Output: (samples, story_maxlen, max_question_len)

In [40]:
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=max_question_len))
question_encoder.add(Dropout(0.3))
#Output: (samples, query_maxlen, embedding_dim)

**ENCODED <--- ENCODER (INPUT)**

In [41]:
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [42]:
match = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)

In [43]:
response = add([match, input_encoded_c])
response = Permute((2, 1))(response)

In [44]:
answer = concatenate([response, question_encoded])

In [45]:
answer

<KerasTensor: shape=(None, 6, 220) dtype=float32 (created by layer 'concatenate')>

In [46]:
answer = LSTM(32)(answer)

In [47]:
answer = Dropout(0.5)(answer)

In [48]:
answer = Dense(vocab_size)(answer)
#(samples, vocab_size) # YES/NO

In [49]:
answer = Activation('softmax')(answer)

In [50]:
model = Model([input_sequence, question], answer)

In [51]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [52]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 156)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 6)]          0           []                               
                                                                                                  
 sequential (Sequential)        (None, None, 64)     2432        ['input_1[0][0]']                
                                                                                                  
 sequential_2 (Sequential)      (None, 6, 64)        2432        ['input_2[0][0]']                
                                                                                              