In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import os
import re
import numpy as np
from time import time
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical 

In [2]:
# Maximum sentence length, counted in word tokens. Question/answer pairs with
# more tokens than this are filtered out, and every token sequence is padded
# to this length before being fed to the model.
MAX_LENGTH = 13


In [3]:
# Download the Cornell Movie-Dialogs corpus archive and extract it.
path_to_zip = tf.keras.utils.get_file(
    'cornell_movie_dialogs.zip',
    origin='http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip',
    extract=True,
)

# The code below expects the extracted folder (its name contains a space) to
# sit in the same directory as the downloaded archive.
path_to_dataset = os.path.join(os.path.dirname(path_to_zip),
                               "cornell movie-dialogs corpus")

# Raw utterances, and the index that groups utterance IDs into conversations.
path_to_movie_lines = os.path.join(path_to_dataset, 'movie_lines.txt')
path_to_movie_conversations = os.path.join(path_to_dataset, 'movie_conversations.txt')

Downloading data from http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip


In [4]:
# Ordered (pattern, replacement) pairs used to expand English contractions.
# Order matters: specific forms ("won't", "can't") must be handled before the
# generic "n't" rule, and "n't" before the dropped-g rule "n'" (goin' -> going).
_CONTRACTION_RULES = (
  (r"i'm", "i am"),
  (r"he's", "he is"),
  (r"she's", "she is"),
  (r"it's", "it is"),
  (r"that's", "that is"),
  (r"what's", "what is"),
  (r"where's", "where is"),
  (r"how's", "how is"),
  (r"\'ll", " will"),
  (r"\'ve", " have"),
  (r"\'re", " are"),
  (r"\'d", " would"),
  (r"won't", "will not"),
  (r"can't", "cannot"),
  (r"n't", " not"),
  (r"n'", "ng"),
  (r"'bout", "about"),
)


def preprocess_sentence(sentence):
  """Normalize a raw utterance into lowercase, space-separated tokens.

  Steps, in order:
    1. lowercase and trim surrounding whitespace;
    2. put spaces around ``? . ! ,`` so punctuation becomes its own token;
    3. collapse runs of spaces / double quotes into a single space;
    4. expand common contractions (see ``_CONTRACTION_RULES``);
    5. replace every remaining character that is not a letter or one of
       ``? . ! ,`` with a space, then trim again.

  Args:
    sentence: raw text of one utterance.

  Returns:
    The cleaned sentence as a ``str`` (may be empty).
  """
  sentence = sentence.lower().strip()
  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
  sentence = re.sub(r'[" "]+', " ", sentence)
  # expanding contractions ("what's" now correctly becomes "what is")
  for pattern, replacement in _CONTRACTION_RULES:
    sentence = re.sub(pattern, replacement, sentence)
  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)
  return sentence.strip()

In [5]:
def load_conversations():
  """Build (question, answer) sentence pairs from the Cornell corpus files.

  Reads ``path_to_movie_lines`` to map each line ID to its text, then walks
  ``path_to_movie_conversations`` and pairs every utterance with the one that
  follows it in the same conversation. Both sides are cleaned with
  ``preprocess_sentence``; answers are additionally wrapped in
  ``<START> ... <END>`` markers.

  Returns:
    A tuple ``(inputs, outputs)`` of two equally long lists of strings.
  """
  field_separator = ' +++$+++ '

  # Line ID (field 0) -> utterance text (field 4).
  with open(path_to_movie_lines, errors='ignore') as lines_file:
    raw_lines = lines_file.readlines()
  id2line = {}
  for raw_line in raw_lines:
    fields = raw_line.replace('\n', '').split(field_separator)
    id2line[fields[0]] = fields[4]

  with open(path_to_movie_conversations, 'r') as conversations_file:
    raw_conversations = conversations_file.readlines()

  inputs, outputs = [], []
  for raw_conversation in raw_conversations:
    fields = raw_conversation.replace('\n', '').split(field_separator)
    # Field 3 is a textual list such as "['L22', 'L33']": drop the brackets,
    # split on ", " and strip the quote around each ID.
    line_ids = [quoted_id[1:-1] for quoted_id in fields[3][1:-1].split(', ')]
    # Every consecutive pair of utterances is one (question, answer) sample.
    for question_id, answer_id in zip(line_ids, line_ids[1:]):
      inputs.append(preprocess_sentence(id2line[question_id]))
      outputs.append('<START> ' + preprocess_sentence(id2line[answer_id]) + ' <END>')
  return inputs, outputs




In [6]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

questions, answers = load_conversations()

# Keep only the pairs in which both the question and the answer have at most
# MAX_LENGTH words.
questions_short, answers_short = [], []
for que, ans in zip(questions, answers):
  if len(text_to_word_sequence(que)) > MAX_LENGTH:
    continue
  if len(text_to_word_sequence(ans)) > MAX_LENGTH:
    continue
  questions_short.append(que)
  answers_short.append(ans)

# Cap the dataset at the first 15000 pairs.
questions_short = questions_short[:15000]
answers_short = answers_short[:15000]

print("Questions in dataset: {}".format(len(questions_short)))
print("Answers in dataset: {}".format(len(answers_short)))

Questions in dataset: 15000
Answers in dataset: 15000


In [7]:
# Fit a single vocabulary shared by questions and answers.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(questions_short + answers_short)
# One extra slot on top of the known words; the padded sequences use 0 as filler.
VOCAB_SIZE = 1 + len(tokenizer.word_index)
print('Vocabulary size : {}'.format(VOCAB_SIZE))

Vocabulary size : 8619


In [None]:
import pickle

# Persist the fitted tokenizer so it can be reloaded from 'tokenizer.pickle'.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)
# To restore it:
#   with open('tokenizer.pickle', 'rb') as handle:
#       tokenizer = pickle.load(handle)

In [9]:
# Questions: words -> integer ids -> fixed-width rows, zero-padded on the right.
tokenized_questions = tokenizer.texts_to_sequences(questions_short)
encoder_input_data = pad_sequences(
    tokenized_questions, maxlen=MAX_LENGTH, padding='post')
print(encoder_input_data.shape)

# Answers go through the same conversion and serve as the decoder input.
tokenized_answers = tokenizer.texts_to_sequences(answers_short)
decoder_input_data = pad_sequences(
    tokenized_answers, maxlen=MAX_LENGTH, padding='post')
print(decoder_input_data.shape)

(15000, 13)
(15000, 13)


In [10]:
# Decoder targets: each answer without its leading 'start' token, i.e. the
# decoder input shifted one step to the left.
decoder_output = [answer_ids[1:] for answer_ids in tokenized_answers]
padded_answers = pad_sequences(decoder_output, maxlen=MAX_LENGTH, padding='post')
# One-hot encode every target id over the vocabulary.
decoder_output_data = to_categorical(padded_answers, VOCAB_SIZE)

In [11]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras import Model
from tensorflow.keras.models import load_model

In [12]:
# Encoder: embeds the question tokens and keeps only the final LSTM state
# (hidden state h and cell state c); its per-step outputs are not used.
enc_inputs = Input(shape=(None,))
enc_embedding = Embedding(VOCAB_SIZE, 200, mask_zero=True)(enc_inputs)
enc_outputs, state_h, state_c = LSTM(200, return_state=True)(enc_embedding)
enc_states = [state_h, state_c]
# Decoder: captures the sequential dependencies between the words of the
# answer, starting from the encoder's final internal state, which acts as
# the context of the question.
dec_inputs = Input(shape=(None,))
dec_embedding = Embedding(VOCAB_SIZE, 200, mask_zero=True)(dec_inputs)
dec_lstm = LSTM(200, return_state=True, return_sequences=True)
dec_outputs, _, _ = dec_lstm(dec_embedding,  
                             initial_state=enc_states)
# The decoder output goes through a softmax Dense layer with one unit per
# vocabulary entry.
dec_dense = Dense(VOCAB_SIZE, activation='softmax')
output = dec_dense(dec_outputs)
model = Model([enc_inputs, dec_inputs], output)
# The prediction of this network for one time step looks like this:
# y_pred = [0.05, 0.95, 0...]
# and the expected one-hot encoded target looks like this:
# y_true = [0, 1, 0...]
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy')
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 200)    1723800     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 200)    1723800     ['input_2[0][0]']                
                                                                                              

In [None]:
# Train the seq2seq model: inputs are the padded questions plus the padded
# answers, targets are the one-hot answers with the leading 'start' removed.
# NOTE(review): no cell visible in this notebook saves the trained model,
# although a later cell loads 'chatbot.h5' — confirm where that file comes from.
model.fit([encoder_input_data, decoder_input_data], decoder_output_data,  batch_size=16,  epochs=300)

In [17]:
# Rebind `model` to the network stored in 'chatbot.h5'.
# NOTE(review): make_inference_models() below is built from the in-memory
# objects dec_lstm, dec_dense, dec_inputs, enc_inputs and enc_states, not from
# this loaded model — confirm that the loaded weights are actually the ones
# used for inference.
model = load_model('chatbot.h5')

In [18]:
def make_inference_models():
    """Derive stand-alone encoder and decoder models for step-by-step decoding.

    Both models reuse objects created when the training graph was defined
    (``enc_inputs``, ``enc_states``, ``dec_inputs``, ``dec_embedding``,
    ``dec_lstm`` and ``dec_dense``).

    Returns:
        ``(enc_model, dec_model)`` where
        * ``enc_model`` maps a question to the encoder state ``[h, c]``;
        * ``dec_model`` maps ``[decoder tokens, h, c]`` to
          ``[word probabilities, new h, new c]``.
    """
    # Placeholders through which the caller supplies the LSTM state (h, then c).
    state_inputs = [Input(shape=(200,)), Input(shape=(200,))]

    # Run the decoder LSTM on the decoder embedding, starting from the supplied
    # state instead of the encoder output.
    step_outputs, next_h, next_c = dec_lstm(dec_embedding,
                                            initial_state=state_inputs)

    # The Dense softmax layer turns the LSTM outputs into per-word scores.
    word_scores = dec_dense(step_outputs)
    decoder = Model(inputs=[dec_inputs, *state_inputs],
                    outputs=[word_scores, next_h, next_c])

    # The encoder takes a question (a zero-padded sequence of integers) and
    # returns its final state.
    encoder = Model(inputs=enc_inputs, outputs=enc_states)

    return encoder, decoder


enc_model, dec_model = make_inference_models()

In [15]:
import pickle

# Restore the tokenizer from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('tokenizer.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)
VOCAB_SIZE = 1 + len(tokenizer.word_index)
print(VOCAB_SIZE)

# Rebind the inference models to the ones stored on disk.
# NOTE(review): no cell visible in this notebook writes these two files —
# confirm where they are produced.
enc_model = load_model('enc_model.h5')
dec_model = load_model('dec_model.h5')

8619


In [16]:
def str_to_tokens(sentence: str):
    """Convert a raw user sentence into one padded row of token ids.

    The sentence is run through the same pipeline as the training questions:
    ``preprocess_sentence`` (lowercasing, punctuation spacing, contraction
    expansion) followed by the fitted ``tokenizer``. Splitting on whitespace
    alone would leave punctuation glued to words ("hello?") and contractions
    unexpanded ("what's"), so those words would never match the vocabulary.

    Args:
        sentence: text typed by the user.

    Returns:
        The array produced by ``pad_sequences`` for this single sentence,
        zero-padded on the right to ``MAX_LENGTH``. Words the tokenizer does
        not map to an id are left out (assuming it was created without an
        ``oov_token``, as in this notebook).
    """
    cleaned = preprocess_sentence(sentence)
    # texts_to_sequences takes a batch of texts; we pass a batch of one.
    token_batch = tokenizer.texts_to_sequences([cleaned])
    return pad_sequences(token_batch, maxlen=MAX_LENGTH, padding='post')

In [17]:

# Encode the user's question into the initial decoder state [h, c].
states_values = enc_model.predict(str_to_tokens(input('You : ')))
# The decoder is primed with a one-token sequence holding the 'start' marker.
empty_target_seq = np.zeros((1, 1))
empty_target_seq[0, 0] = tokenizer.word_index['start']
decoded_translation = ''
# Greedy decoding, bounded by the number of steps rather than by the number of
# words emitted: if the decoder keeps predicting an index with no word (such
# as the padding index 0), a word-count check alone would never terminate.
for step in range(MAX_LENGTH + 1):
    # feed the state vectors and 1-word target sequence
    # to the decoder to produce predictions for the next word
    dec_outputs, h, c = dec_model.predict([empty_target_seq]
                                          + states_values)
    # pick the most probable next word
    sampled_word_index = int(np.argmax(dec_outputs[0, -1, :]))
    # direct id -> word lookup; None when the id has no word
    sampled_word = tokenizer.index_word.get(sampled_word_index)
    # stop as soon as the end-of-sequence word 'end' is generated
    if sampled_word == 'end':
        break
    if sampled_word is not None:
        decoded_translation += ' {}'.format(sampled_word)
    # prepare next iteration: feed back the sampled token and the new state
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = sampled_word_index
    states_values = [h, c]
print("Bot: " + decoded_translation)

You : hello
Bot:  hey that is me that is talking it
