<a href="https://colab.research.google.com/github/utsavdatta-git/keras_chatbots/blob/master/Simple_Baseline_Chatbot_with_Bi_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import all the libraries
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import json
import re
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
import numpy as np
import pandas as pd
import nltk
from gensim.parsing.preprocessing import preprocess_string,strip_punctuation, strip_numeric, strip_multiple_whitespaces, strip_non_alphanum
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('punkt')
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding, TimeDistributed
from tensorflow.keras import preprocessing , utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences 

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Using TensorFlow backend.


In [0]:
def setup_google_drive():
  """Mount the user's Google Drive inside the Colab runtime.

  The Colab helper is imported lazily so that merely defining this function
  does not require the `google.colab` package to be importable.
  """
  from google.colab import drive as colab_drive
  mount_point = '/content/drive'
  colab_drive.mount(mount_point)

In [0]:
#-----------------------------------------------------------------------------
# Function name: text_preprocess
# Purpose: This function pre-processes an input text through a series of pre-processing steps.
#          It has been implemented as a function to be used as a common function
#          to pre-process texts at any stage of the assignment
# Input: a string of text (any other object is converted with str())
# Output: preprocessed list of words
#-----------------------------------------------------------------------------
# Common English contractions and their expansions. Built once at cell execution
# instead of on every call, because text_preprocess runs once per data row.
# The input is lower-cased before lookup, so the capitalised "I'..." keys are
# never matched; they are kept only for completeness.
_CONTRACTION_DICT = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", 
                    "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                    "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", 
                    "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", 
                    "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", 
                    "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", 
                    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", 
                    "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
                    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", 
                    "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                    "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                    "so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", 
                    "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", 
                    "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", 
                    "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", 
                    "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", 
                    "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", 
                    "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", 
                    "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", 
                    "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                    "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}

# Punctuation that may cling to either end of a whitespace-delimited token.
# The apostrophe is deliberately absent so keys such as "'cause" still match.
_EDGE_PUNCTUATION = '.,!?;:"()[]{}'

def text_preprocess(text):
  """Normalise a piece of text and split it into a list of clean words.

  Steps:
    1. Convert to str, strip surrounding whitespace and lower-case.
    2. Replace typographic apostrophes with the ASCII apostrophe so that
       contractions typed as "don’t" are recognised.
    3. Expand known contractions. Punctuation attached to a token
       (e.g. "can't?") is ignored for the lookup.
    4. Remove punctuation, digits, repeated whitespace and any remaining
       non-alphanumeric characters, then split on whitespace.

  Args:
    text: the text to clean; non-string values are passed through str().

  Returns:
    list of str: the cleaned words in their original order.
  """
  # Strip leading and trailing spaces and de-capitalize
  text = str(text).strip().lower()

  # Map curly quotes to "'" so the contraction lookup below can match them.
  text = text.replace("’", "'").replace("‘", "'")

  # Expand contractions. When a token matches after trimming edge punctuation
  # the whole token is replaced by its expansion; the dropped punctuation
  # would have been removed by the filters below anyway.
  expanded_tokens = []
  for token in text.split():
    core = token.strip(_EDGE_PUNCTUATION)
    expanded_tokens.append(_CONTRACTION_DICT.get(core, token))
  text_cleaned = " ".join(expanded_tokens)

  # Removing punctuation, numbers, multiple spaces and non-alphanumerics.
  # strip_non_alphanum also removes any leftover typographic characters, so
  # no further per-token clean-up is needed after this step.
  CUSTOM_FILTERS = [strip_punctuation, strip_numeric, strip_multiple_whitespaces, strip_non_alphanum]
  return preprocess_string(text_cleaned, CUSTOM_FILTERS)

In [0]:
#-----------------------------------------------------------------------------
# Function name: read_conv_data
# Purpose: This function reads the conversation file which contains question and answer pairs
# Input: path (optional) - location of the tab-separated question/answer file
# Output: preprocessed questions, raw answers, preprocessed answers
#         and the set of distinct raw answers
#-----------------------------------------------------------------------------
def read_conv_data(path="/content/drive/My Drive/Cousera files/qna_chitchat_professional.tsv"):
  """Load the question/answer pairs and pre-process them.

  The first column of the file is treated as the question and the second as
  the answer; any further columns are ignored.

  Args:
    path: tab-separated file to read. Defaults to the location on the
      mounted Google Drive that the notebook was written against.

  Returns:
    tuple of
      questions (list of str): each question cleaned with text_preprocess
        and re-joined with single spaces,
      answers (list of raw values): the answer column exactly as read,
      preprocessed_answers (list of str): the answers cleaned the same way
        as the questions,
      distinct_answers (set): the unique raw answers.
  """
  df = pd.read_csv(path, sep="\t")

  # Select the columns by position explicitly. Integer keys on a row Series
  # with string labels (row[0]) rely on a deprecated positional fallback.
  raw_questions = df.iloc[:, 0].tolist()
  answers = df.iloc[:, 1].tolist()

  questions = [' '.join(text_preprocess(question)) for question in raw_questions]
  preprocessed_answers = [' '.join(text_preprocess(answer)) for answer in answers]
  distinct_answers = set(answers)
  return questions, answers, preprocessed_answers, distinct_answers

In [0]:
#-----------------------------------------------------------------------------
# Function name: create_question_answer_batches
# Purpose: This function modifies questions and answers to make them fit to input into a neural network
# Input: none
# Output: word index, padded numeric questions, answer-to-number mapping,
#         number-to-answer mapping, one-hot labels and the fitted tokenizer
#-----------------------------------------------------------------------------
def create_question_answer_batches():
  """Turn the conversation data into model-ready arrays.

  Returns:
    tuple of
      word_index (dict): token -> integer index from the fitted tokenizer,
      data_questions (ndarray): questions as integer sequences, post-padded
        to the length of the longest question,
      answers_dict (dict): raw answer -> class id,
      num_to_answers_dict (dict): class id -> raw answer,
      labels_one_hot (ndarray): one-hot class labels, one row per question,
      tokenizer (Tokenizer): the fitted tokenizer, needed again at chat time.

  Raises:
    ValueError: if no question/answer pairs were loaded.
  """
  questions, answers, preprocessed_answers, distinct_answers = read_conv_data()
  if not questions:
    raise ValueError("No question/answer pairs were loaded; cannot build training data.")

  tokenizer = Tokenizer(oov_token="<OOV>")
  tokenizer.fit_on_texts(questions+preprocessed_answers)
  sequences_question = tokenizer.texts_to_sequences(questions)
  word_index = tokenizer.word_index
  print('Found %s unique tokens.' % len(word_index))

  # input_data
  max_question_len = max(len(x.split()) for x in questions)
  data_questions = pad_sequences(sequences_question, maxlen=max_question_len,padding='post')

  # output_data
  # Enumerate the answers in a sorted order. Iterating the set directly gives
  # an order that can change between interpreter runs, which would silently
  # change which class id means which answer.
  answers_dict={}
  num_to_answers_dict={}
  for num, ans in enumerate(sorted(distinct_answers, key=str)):
    answers_dict[ans] = num
    num_to_answers_dict[num] = ans
  labels = [answers_dict[answer] for answer in answers]
  # State the class count explicitly so the label width always equals the
  # number of distinct answers.
  labels_one_hot = utils.to_categorical(labels, num_classes=len(answers_dict))
  return word_index, data_questions, answers_dict, num_to_answers_dict, labels_one_hot, tokenizer

In [0]:
#-----------------------------------------------------------------------------
# Function name: create_glove_embed_index
# Purpose: This function creates a word to vector model from a pre-trained GloVe model
# Input: path (optional) - location of the GloVe text file
# Output: glove word embedding dictionary
#-----------------------------------------------------------------------------
def create_glove_embed_index(path='/content/drive/My Drive/Cousera files/glove.6B.100d.txt'):
  """Read a GloVe text file into a {word: vector} dictionary.

  Each non-blank line is expected to hold a token followed by its
  whitespace-separated coefficients.

  Args:
    path: GloVe text file to read. Defaults to the location on the mounted
      Google Drive that the notebook was written against.

  Returns:
    dict: word (str) -> numpy float32 array of coefficients.
  """
  embeddings_index = {}
  # Read as UTF-8 explicitly rather than relying on the platform default, and
  # use a context manager so the handle is closed even if a line fails to parse.
  with open(path, encoding='utf-8') as f:
    for line in f:
      values = line.split()
      if not values:
        # Blank line (e.g. a trailing newline): nothing to parse.
        continue
      word = values[0]
      coefs = np.asarray(values[1:], dtype='float32')
      embeddings_index[word] = coefs

  print('Found %s word vectors.' % len(embeddings_index))
  return embeddings_index

In [0]:
#-----------------------------------------------------------------------------
# Function name: create_embedding_matrix
# Purpose: Build the matrix that maps every word index to its GloVe vector
# Input: vocab_size, embedding_dim, word_index
# Output: word embedding matrix
#-----------------------------------------------------------------------------
def create_embedding_matrix(vocab_size, embedding_dim, word_index):
  """Return a (vocab_size, embedding_dim) array of pre-trained word vectors.

  Row i holds the GloVe vector of the word whose index in `word_index` is i.
  Rows for words that have no GloVe vector stay all zeros.
  """
  matrix = np.zeros((vocab_size, embedding_dim))
  glove_vectors = create_glove_embed_index()
  for token in word_index:
    vector = glove_vectors.get(token)
    if vector is None:
      # No pre-trained vector for this word: leave its row as zeros.
      continue
    matrix[word_index[token]] = vector
  return matrix

In [0]:
#-----------------------------------------------------------------------------
# Function name: create_model
# Purpose: create a keras model to train the chatbot
# Input: vocab_size, LSTM_nodes, embedding_dim, word_index, num_classes (optional)
# Output: the compiled model
#-----------------------------------------------------------------------------
def create_model(vocab_size, LSTM_nodes, embedding_dim, word_index, num_classes=None):
  """Build and compile the Bi-LSTM answer classifier.

  The network is a frozen embedding layer initialised from the matrix returned
  by create_embedding_matrix, a bidirectional LSTM, and a softmax layer with
  one unit per answer class.

  Args:
    vocab_size: number of rows of the embedding matrix.
    LSTM_nodes: number of units in each direction of the LSTM.
    embedding_dim: length of each word vector.
    word_index: token -> integer index mapping from the tokenizer.
    num_classes: number of answer classes (width of the softmax layer).
      When omitted, the width of the notebook-level `labels_one_hot` array is
      used, which keeps existing four-argument calls working.

  Returns:
    The compiled tf.keras model (RMSprop, categorical cross-entropy,
    accuracy metric).
  """
  if num_classes is None:
    # Backward-compatible fallback: older callers rely on the global
    # `labels_one_hot` having been created before this function is called.
    num_classes = labels_one_hot.shape[1]

  embedding_matrix = create_embedding_matrix(vocab_size, embedding_dim, word_index)
  model = tf.keras.Sequential([
                              tf.keras.layers.Embedding(input_dim = vocab_size, 
                                output_dim = embedding_dim,
                                weights = [embedding_matrix],
                                trainable = False),
                              tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(LSTM_nodes)),
                              tf.keras.layers.Dense(num_classes, activation="softmax")
  ])
  model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy', metrics=["accuracy"])
  return model

In [0]:
#-----------------------------------------------------------------------------
# Function name: start_chat
# Purpose: start the chat using the trained model
# Input: tokenizer, max_length, model, num_to_answers_dict
# Output: none
#-----------------------------------------------------------------------------
def start_chat(tokenizer, max_length, model, num_to_answers_dict):
  """Run an interactive console chat until the user types "Exit".

  Args:
    tokenizer: the tokenizer that was fitted on the training questions.
    max_length: sequence length the model was trained with; user input is
      post-padded to this length.
    model: the trained classifier; its predict() output is interpreted as
      one score per answer class.
    num_to_answers_dict: class id -> answer text.
  """
  while True:
    user_conv = input("User: ")
    # Accept the exit keyword regardless of case or surrounding whitespace.
    if user_conv.strip().lower() == "exit":
      break
    # Clean the input exactly like the training questions were cleaned;
    # otherwise contractions and punctuation produce tokens the tokenizer has
    # never seen and they all collapse to the out-of-vocabulary index.
    cleaned_conv = ' '.join(text_preprocess(user_conv))
    sequenced_new_data = tokenizer.texts_to_sequences([cleaned_conv])
    padded_new_data = pad_sequences(sequenced_new_data, padding="post", maxlen=max_length)
    # Take the arg-max of the predicted scores instead of calling
    # predict_classes, which is not available in later tf.keras releases.
    scores = model.predict(padded_new_data)
    pred = int(np.argmax(scores, axis=-1)[0])
    response = num_to_answers_dict[pred]
    print("Bot: "+response)

In [10]:
# Put it all together: mount Drive, build the training arrays, create the
# model and train it.
# Setup
setup_google_drive()
# NOTE: create_model reads `labels_one_hot` as a global, so the name assigned
# on the next line must not be changed.
word_index, data_questions, answers_dict, num_to_answers_dict, labels_one_hot, tokenizer = create_question_answer_batches()
# Define constants
# One more row than there are words; presumably index 0 is the padding value
# and carries no word (TODO confirm against the Keras Tokenizer docs).
vocab_size = len(word_index)+1
# Must equal the vector length of the GloVe file that is loaded
# (glove.6B.100d.txt).
embedding_dim = 100
LSTM_nodes = 64
batch_size = 32
epochs = 100
conv_model = create_model(vocab_size, LSTM_nodes, embedding_dim, word_index)
# Start training
conv_model.fit(data_questions, labels_one_hot, batch_size=batch_size, epochs=epochs) 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Found 2625 unique tokens.
Found 400000 word vectors.
Train on 9796 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100

<tensorflow.python.keras.callbacks.History at 0x7fb148e14198>

In [11]:
# Chat with the trained model; type "Exit" to stop. The second argument is the
# padded question length used during training.
start_chat(tokenizer, data_questions.shape[1], conv_model, num_to_answers_dict)

User: Hello dear! How are you?
Bot: I'm digital. In other words, I'm not human.
User: I know but you are really nice
Bot: Noted.
User: what are you upto?
Bot: I'm digital. In other words, I'm not human.
User: Do you eat?
Bot: I don't have a body.
User: Do you have any problems?
Bot: I don't have family.
User: I like you very very much
Bot: Thanks.
User: Do you like me?
Bot: I do like you.
User: Is Australia a great country?
Bot: Ok.
User: are all robots cruel?
Bot: Not at all.
User: what kind of chatbot are you?
Bot: I'm digital. In other words, I'm not human.
User: Is life good?
Bot: I'm happy to hear that.
User: that was a question
Bot: Excellent.
User: Exit
