# **BUILDING A CHATBOT WITH DEEP NLP**

---




## **IMPORTING THE LIBRARIES**

In [1]:
import numpy as np
import tensorflow as tf
import re
import time

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **STEP 1: DATA PREPROCESSING**

### **IMPORTING THE DATASET**

In [3]:
# Base directory that is prefixed to the dataset file names when they are opened.
path = '/content/drive/My Drive'

In [4]:
# Read both corpus files into lists of raw text lines (split on '\n').
# The `with` blocks close each file handle as soon as it has been read;
# bytes that cannot be decoded as UTF-8 are dropped (errors = 'ignore').
with open(path + '/movie_lines.txt', encoding = 'utf-8', errors = 'ignore') as lines_file:
  lines = lines_file.read().split('\n')
with open(path + '/movie_conversations.txt', encoding = 'utf-8', errors = 'ignore') as conversations_file:
  conversations = conversations_file.read().split('\n')

In [5]:
# Preview the first ten raw entries of the movie-lines file.
lines[:10]

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.',
 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?',
 "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.",
 'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow',
 "L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.",
 'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No',
 'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?']

In [6]:
# Preview the first ten raw entries of the conversations file.
conversations[:10]

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L271', 'L272', 'L273', 'L274', 'L275']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L276', 'L277']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L280', 'L281']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L363', 'L364']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L365', 'L366']"]

### **CREATING A DICTIONARY THAT MAPS EACH LINE ID TO ITS TEXT**

In [7]:
# Map each line id (first field) to its text (fifth field).
# Records are split on the ' +++$+++ ' separator; any record that does not
# yield exactly five fields is skipped.
id2line = {
  fields[0]: fields[4]
  for fields in (raw_line.split(' +++$+++ ') for raw_line in lines)
  if len(fields) == 5
}

### **CREATING A LIST OF ALL OF THE CONVERSATIONS**

In [8]:
# For every conversation record except the last entry of the list, take the
# final ' +++$+++ ' field, strip its surrounding brackets, remove quotes and
# spaces, and split the remainder on commas to obtain the list of line ids.
conversations_ids = [
  record.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "").split(',')
  for record in conversations[:-1]
]

In [9]:
# Preview the first ten parsed conversations (lists of line ids).
conversations_ids[:10]

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206'],
 ['L207', 'L208'],
 ['L271', 'L272', 'L273', 'L274', 'L275'],
 ['L276', 'L277'],
 ['L280', 'L281'],
 ['L363', 'L364'],
 ['L365', 'L366']]

### **GET THE QUESTIONS AND THE ANSWERS SEPARATELY**

In [10]:
# Every pair of consecutive lines in a conversation becomes one
# (question, answer) example: the earlier line is the question and the line
# that follows it is the answer.
questions = []
answers = []

for utterance_ids in conversations_ids:
  for prompt_id, reply_id in zip(utterance_ids, utterance_ids[1:]):
    questions.append(id2line[prompt_id])
    answers.append(id2line[reply_id])

In [11]:
# Preview the first twenty questions.
questions[:20]

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
 "Well, I thought we'd start with pronunciation, if that's okay with you.",
 'Not the hacking and gagging and spitting part.  Please.',
 "You're asking me out.  That's so cute. What's your name again?",
 "No, no, it's my fault -- we didn't have a proper introduction ---",
 'Cameron.',
 "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.",
 'Why?',
 'Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.',
 'Gosh, if only we could find Kat a boyfriend...',
 "C'esc ma tete. This is my head",
 "Right.  See?  You're ready for the quiz.",
 "I don't want to know how to say that though.  I want to know useful things. Like where the good stores are.  How much does champagne cost?  Stuff like Chat.  I have n

In [12]:
# Preview the first twenty answers.
answers[:20]

["Well, I thought we'd start with pronunciation, if that's okay with you.",
 'Not the hacking and gagging and spitting part.  Please.',
 "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?",
 'Forget it.',
 'Cameron.',
 "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.",
 'Seems like she could get a date easy enough...',
 'Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.',
 "That's a shame.",
 'Let me see what I can do.',
 "Right.  See?  You're ready for the quiz.",
 "I don't want to know how to say that though.  I want to know useful things. Like where the good stores are.  How much does champagne cost?  Stuff like Chat.  I have never in my life had to point out my head to someone.",
 "That's because it's such a nice one.",
 'Forget French.',
 "Well, there's someone I think might be --",
 'Where?',
 "I 

### **CLEANING ALL THE TEXTS FUNCTION**

In [13]:
def clean_text(text):
    """Lower-case `text`, expand a fixed set of contractions and strip punctuation.

    The substitutions are applied one after another, in the order listed in
    the table, so earlier rewrites are visible to later patterns. Characters
    that are not part of the final punctuation class (for example `!` and the
    apostrophes of contractions that have no rule) are left untouched.

    Args:
        text: the raw sentence.

    Returns:
        The cleaned sentence as a new string.
    """
    # (pattern, replacement) pairs; order matters.
    substitutions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"[-()\"#/@;:<>{}+=~|.?,]", ""),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

### **CLEANING THE QUESTIONS**

In [14]:
# Run every raw question through clean_text, keeping the original order.
clean_questions = [clean_text(raw_question) for raw_question in questions]

In [15]:
# Preview the first ten cleaned questions.
clean_questions[:10]

['can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again',
 'well i thought we would start with pronunciation if that is okay with you',
 'not the hacking and gagging and spitting part  please',
 'you are asking me out  that is so cute what is your name again',
 "no no it's my fault  we didn't have a proper introduction ",
 'cameron',
 'the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i cannot date until she does',
 'why',
 'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something',
 'gosh if only we could find kat a boyfriend']

### **CLEANING THE ANSWERS**

In [16]:
# Run every raw answer through clean_text, keeping the original order.
clean_answers = [clean_text(raw_answer) for raw_answer in answers]

In [17]:
# Preview the first ten cleaned answers.
clean_answers[:10]

['well i thought we would start with pronunciation if that is okay with you',
 'not the hacking and gagging and spitting part  please',
 "okay then how 'bout we try out some french cuisine  saturday  night",
 'forget it',
 'cameron',
 'the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i cannot date until she does',
 'seems like she could get a date easy enough',
 'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something',
 'that is a shame',
 'let me see what i can do']

### **REMOVING THE INFREQUENT WORDS BY BUILDING A WORD-COUNT CORPUS**

**CREATING A DICTIONARY THAT MAPS EACH WORD TO ITS NUMBER OF OCCURRENCES**

In [18]:
# Count how often each whitespace-separated word occurs across all cleaned
# questions first and then all cleaned answers. Words are inserted in
# first-seen order.
word2count = {}

for sentences in (clean_questions, clean_answers):
  for sentence in sentences:
    for word in sentence.split():
      word2count[word] = word2count.get(word, 0) + 1

### **CREATING TWO DICTIONARIES THAT MAP THE QUESTION WORDS AND THE ANSWER WORDS TO UNIQUE INTEGERS**

**TOKENISATION AND FILTERING THE NON FREQUENT WORDS**

In [19]:
# Keep only the words that occur at least `threshold` times and number them
# 0, 1, 2, ... in the order in which they appear in word2count. Both
# vocabularies are built from the same counts, so they start out with the
# same content, but they are two separate dictionary objects.
threshold = 20
frequent_words = [word for word, count in word2count.items() if count >= threshold]

questionsWords2int = {word: index for index, word in enumerate(frequent_words)}
answersWords2int = {word: index for index, word in enumerate(frequent_words)}

# Number of retained words (the value the running counter ends on).
word_number = len(frequent_words)

**ADDING THE LAST TOKENS TO THESE DICTIONARIES**

In [20]:
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']

# Each token gets the id `len(vocabulary) + 1` at the moment it is inserted.
# setdefault keeps an id that was already assigned, so running this cell a
# second time does not renumber the tokens (a plain assignment would then give
# all four tokens the same id, because the dictionary no longer grows).
for vocabulary in (questionsWords2int, answersWords2int):
    for token in tokens:
        vocabulary.setdefault(token, len(vocabulary) + 1)

### **CREATING THE INVERSE DICTIONARY OF THE ANSWERSWORDS2INT DICTIONARY**

This builds the inverse mapping of the `answersWords2int` dictionary, from each integer back to its word.

In [21]:
# Invert answersWords2int: integer id -> word.
answersInt2Words = {index: word for word, index in answersWords2int.items()}

### **ADDING THE `<EOS>` TOKEN TO THE END OF EVERY ANSWER IN THE LIST**

In [22]:
# Append the end-of-sequence marker to every cleaned answer, in place.
# The endswith guard makes the cell safe to run more than once: without it a
# second run would append the marker again. clean_text lower-cases its input,
# so a freshly cleaned answer cannot already end with the upper-case marker.
for i in range(len(clean_answers)):
  if not clean_answers[i].endswith(' <EOS>'):
    clean_answers[i] += ' <EOS>'

In [23]:
# Preview the first ten answers with the end-of-sequence marker attached.
clean_answers[:10]

['well i thought we would start with pronunciation if that is okay with you <EOS>',
 'not the hacking and gagging and spitting part  please <EOS>',
 "okay then how 'bout we try out some french cuisine  saturday  night <EOS>",
 'forget it <EOS>',
 'cameron <EOS>',
 'the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i cannot date until she does <EOS>',
 'seems like she could get a date easy enough <EOS>',
 'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something <EOS>',
 'that is a shame <EOS>',
 'let me see what i can do <EOS>']

### **TRANSLATING ALL THE QUESTIONS AND ANSWERS TO INTEGERS AND REPLACING ALL THE WORDS WHICH WERE FILTERED OUT BY `<OUT>`**

In [24]:
# Encode each cleaned question as a list of integer ids; any word that is not
# in the vocabulary is replaced by the id of '<OUT>'.
questions_to_int = [
  [
    questionsWords2int[word] if word in questionsWords2int else questionsWords2int['<OUT>']
    for word in sentence.split()
  ]
  for sentence in clean_questions
]

In [25]:
# Encode each cleaned answer as a list of integer ids; any word that is not
# in the vocabulary is replaced by the id of '<OUT>'.
answers_to_int = [
  [
    answersWords2int[word] if word in answersWords2int else answersWords2int['<OUT>']
    for word in sentence.split()
  ]
  for sentence in clean_answers
]

### **SORTING THE QUESTIONS AND ANSWERS BY THE LENGTH OF THE QUESTIONS**

In [26]:
# Collect the encoded (question, answer) pairs grouped by question length,
# shortest first; pairs with the same question length keep their original
# relative order. Questions outside the 1..24 range are dropped.
sorted_clean_questions = []
sorted_clean_answers = []

# Min length = 1, Max length = 24
for target_length in range(1, 25):
  for encoded_question, encoded_answer in zip(questions_to_int, answers_to_int):
    if len(encoded_question) == target_length:
      sorted_clean_questions.append(encoded_question)
      sorted_clean_answers.append(encoded_answer)

In [27]:
# Preview the ten shortest encoded questions.
sorted_clean_questions[:10]

[[47], [62], [123], [147], [135], [39], [175], [39], [182], [183]]

In [28]:
# Preview the encoded answers that belong to the ten shortest questions.
sorted_clean_answers[:10]

[[15,
  48,
  25,
  47,
  18,
  49,
  50,
  15,
  51,
  52,
  45,
  53,
  8824,
  54,
  52,
  55,
  41,
  56,
  18,
  57,
  58,
  59,
  60,
  61,
  8823],
 [8824,
  63,
  60,
  64,
  65,
  66,
  67,
  68,
  69,
  60,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  60,
  78,
  79,
  52,
  74,
  80,
  81,
  8823],
 [102, 8823],
 [1529, 77, 101, 1550, 33, 149, 608, 8823],
 [27, 153, 227, 3, 6453, 8823],
 [26, 27, 7, 160, 253, 65, 1280, 97, 65, 613, 8823],
 [1387, 134, 8823],
 [27, 239, 133, 194, 226, 74, 8823],
 [196, 8823],
 [20, 27, 124, 612, 32, 45, 1512, 47, 8823]]

## **STEP 2: BUILDING THE NLP SEQUENCE TO SEQUENCE MODEL**

### **CREATING PLACEHOLDERS FOR THE INPUTS AND THE TARGETS**

In [29]:
def model_inputs():
  """Create the four placeholders that feed the model.

  Returns:
    A tuple `(inputs, targets, lr, keep_prob)`:
      * 'input' and 'target' are int32 placeholders of rank 2 whose two
        dimensions are both left unspecified;
      * 'learning_rate' and 'keep_prob' are float32 placeholders created
        without an explicit shape.
  """
  rank2_unknown = [None, None]
  input_ids = tf.placeholder(tf.int32, rank2_unknown, name = 'input')
  target_ids = tf.placeholder(tf.int32, rank2_unknown, name = 'target')
  learning_rate = tf.placeholder(tf.float32, name = 'learning_rate')
  dropout_keep = tf.placeholder(tf.float32, name = 'keep_prob')
  return input_ids, target_ids, learning_rate, dropout_keep

### **PREPROCESSING THE TARGETS INTO BATCHES OF SOME SIZE**

In [30]:
def preprocess_targets(targets, word2int, batch_size):
  """Prefix each target row with '<SOS>' and drop each row's last column.

  Args:
    targets: rank-2 tensor of target ids (it is sliced along two axes).
    word2int: vocabulary dictionary; only its '<SOS>' entry is read.
    batch_size: number of rows to keep and to build the '<SOS>' column for.

  Returns:
    The concatenation, along axis 1, of a `[batch_size, 1]` column filled
    with the '<SOS>' id and the targets without their last column.
  """
  # One '<SOS>' id per row.
  sos_column = tf.fill([batch_size, 1], word2int['<SOS>'])
  # Rows 0..batch_size, columns 0..-1 (exclusive), i.e. everything but the last column.
  targets_without_last = tf.strided_slice(targets, [0,0], [batch_size, -1], [1,1])
  return tf.concat([sos_column, targets_without_last], axis = 1)

### **CREATING THE ENCODING RNN LAYER { STACKED LSTM }**




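*   RNN Inputs: the embedded input sequences that are fed to the encoder
*   RNN Size: number of units in each LSTM cell
*   RNN Layers: number of stacked LSTM layers
*   Keep_Prob: keep probability used by the dropout wrapper
*   Sequence_Length: length of the questions in the batch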















In [31]:
def encoder_rnn(rnn_inputs, rnn_size, rnn_layers, keep_prob, sequence_length) :
  """Build the stacked bidirectional LSTM encoder and return its final state.

  Args:
    rnn_inputs: tensor passed to the RNN as its `inputs`.
    rnn_size: number of units given to `BasicLSTMCell`.
    rnn_layers: how many times the dropout-wrapped LSTM cell is stacked.
    keep_prob: keep probability applied to the cell inputs by `DropoutWrapper`.
    sequence_length: forwarded unchanged as the RNN's `sequence_length`.

  Returns:
    The second value returned by `tf.nn.bidirectional_dynamic_rnn` (the
    output states); the per-step outputs are discarded.
  """
  lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
  lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
  # Use the `rnn_layers` parameter: no `num_layers` name exists in this scope.
  encoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * rnn_layers)
  # Input and output size of the forward and backward cell must match, so the
  # same stacked cell is used for both directions. The keyword names expected
  # by bidirectional_dynamic_rnn are `cell_fw` / `cell_bw`.
  _, encoder_state = tf.nn.bidirectional_dynamic_rnn(cell_fw = encoder_cell,
                                                     cell_bw = encoder_cell,
                                                     sequence_length = sequence_length,
                                                     inputs = rnn_inputs,
                                                     dtype = tf.float32
                                                     )
  return encoder_state

### **DECODING THE OBSERVATIONS ON THE TRAINING SET**



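*   Encoder state: the state returned by `encoder_rnn`
*   Decoder cell: the RNN cell used by the decoder
*   Decoder embedded input: the embedded target sequences fed to the decoder
*   Sequence_length: length of the sequences in the batch
*   Decoding_Scope: the variable scope that wraps the decoder's TensorFlow variables
*   Output_function: function applied to the decoder output to produce the final predictions
*   Keep Prob: keep probability used for dropout on the decoder output
*   Batch_size: number of sequences per batch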


(Embedding: Mapping from discrete objects like words to real number vectors)







In [32]:
def decode_training_set(encoder_state, decoder_cell, decoder_embedded_input, sequence_length, decoding_scope, output_function, keep_prob, batch_size):
  """Run the attention decoder in training mode and project its output.

  Args:
    encoder_state: encoder state; only its first element is handed to the
      training decoder function.
    decoder_cell: RNN cell used by the decoder; its `output_size` sizes the
      attention tensors.
    decoder_embedded_input: inputs passed to `dynamic_rnn_decoder`.
    sequence_length: passed to `dynamic_rnn_decoder`.
    decoding_scope: variable scope passed as `scope`.
    output_function: callable applied to the dropped-out decoder output.
    keep_prob: keep probability for `tf.nn.dropout` on the decoder output.
    batch_size: first dimension of the attention-states tensor.

  Returns:
    `output_function(...)` applied to the decoder output after dropout.
  """
  cell_width = decoder_cell.output_size
  # NOTE(review): the attention states are an all-zero tensor of shape
  # [batch_size, 1, output_size] rather than encoder outputs - confirm intended.
  zero_attention_states = tf.zeros([batch_size, 1, cell_width])
  attn_keys, attn_values, attn_score_fn, attn_construct_fn = tf.contrib.seq2seq.prepare_attention(
      zero_attention_states,
      attention_option = "bahdanau",
      num_units = decoder_cell.output_size)
  # Decoder function used by the dynamic RNN decoder while training.
  train_decoder_fn = tf.contrib.seq2seq.attention_decoder_fn_train(
      encoder_state[0],
      attn_keys,
      attn_values,
      attn_score_fn,
      attn_construct_fn,
      name = "attn_dec_train")
  # Only the first of the three returned values is used.
  raw_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(
      decoder_cell,
      train_decoder_fn,
      decoder_embedded_input,
      sequence_length,
      scope = decoding_scope)
  return output_function(tf.nn.dropout(raw_decoder_output, keep_prob))

### **DECODING THE OBSERVATIONS IN THE TEST/VALIDATION STATE**

In [33]:
def decode_test_set(encoder_state, decoder_cell, decoder_embeddings_matrix, sos_id, eos_id, maximum_length, num_words, sequence_length, decoding_scope, output_function, keep_prob, batch_size) :
  """Run the attention decoder in inference mode and return its predictions.

  Args:
    encoder_state: encoder state; only its first element is handed to the
      inference decoder function.
    decoder_cell: RNN cell used by the decoder; its `output_size` sizes the
      attention tensors.
    decoder_embeddings_matrix: embeddings passed to the inference function.
    sos_id: id of the start-of-sequence token.
    eos_id: id of the end-of-sequence token.
    maximum_length: maximum length passed to the inference function.
    num_words: number of decoder symbols passed to the inference function.
    sequence_length: not used by this function; kept so the signature stays
      unchanged.
    decoding_scope: variable scope passed as `scope`.
    output_function: output projection passed to the inference function.
    keep_prob: not used by this function; kept so the signature stays unchanged.
    batch_size: first dimension of the attention-states tensor.

  Returns:
    The first of the three values returned by `dynamic_rnn_decoder`.
  """
  # `tf.zeros` (not `tf.zeroes`) is the TensorFlow constructor.
  attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
  attention_keys, attention_values, attention_score_function, attention_construct_function = tf.contrib.seq2seq.prepare_attention(attention_states, attention_option = "bahdanau", num_units = decoder_cell.output_size)
  # Testing for dynamic rnn decoder.
  # The encoder state is passed exactly once; passing it twice shifted every
  # following positional argument by one slot.
  test_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_inference(output_function,
                                                                            encoder_state[0],
                                                                            attention_keys,
                                                                            attention_values,
                                                                            attention_score_function,
                                                                            attention_construct_function,
                                                                            decoder_embeddings_matrix,
                                                                            sos_id,
                                                                            eos_id,
                                                                            maximum_length,
                                                                            num_words,
                                                                            name = "attn_dec_inf"
                                                                            )
  test_predictions, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(decoder_cell,
                                                                  test_decoder_function,
                                                                  scope = decoding_scope
                                                                  )
  return test_predictions

### **CREATING THE DECODING RNN LAYER**





In [34]:
def decoder_rnn(decoder_embedded_input, decoder_embedded_matrix, encoder_state, num_words, sequence_length, rnn_size, num_layers, word2int, keep_prob, batch_size) :
  """Build the decoder and return its training and test predictions.

  Inside the "decoding" variable scope this creates a stacked, dropout-wrapped
  LSTM cell and a fully connected output layer, decodes in training mode,
  switches the scope to variable reuse and then decodes in inference mode.

  Args:
    decoder_embedded_input: embedded inputs for the training decoder.
    decoder_embedded_matrix: embeddings matrix for the inference decoder.
    encoder_state: state produced by the encoder.
    num_words: number of outputs of the fully connected layer.
    sequence_length: passed to both decoders; `sequence_length - 1` is used
      as the inference decoder's maximum length.
    rnn_size: number of units given to `BasicLSTMCell`.
    num_layers: how many times the dropout-wrapped LSTM cell is stacked.
    word2int: vocabulary; its '<SOS>' and '<EOS>' entries are read.
    keep_prob: keep probability for dropout.
    batch_size: batch size passed to both decoders.

  Returns:
    A tuple `(training_predictions, test_predictions)`.
  """
  with tf.variable_scope("decoding") as decoding_scope : 
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
    decoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)
    # Initialize Weights
    weights = tf.truncated_normal_initializer(stddev = 0.1)
    # Get the biases
    biases = tf.zeros_initializer()
    # Fully connected layer with `num_words` outputs and no activation function.
    output_function = lambda x : tf.contrib.layers.fully_connected(x,
                                                                   num_words,
                                                                   None, 
                                                                   scope = decoding_scope,
                                                                   weights_initializer = weights,
                                                                   biases_initializer = biases
                                                                   )
    training_predictions = decode_training_set(encoder_state,
                                               decoder_cell,
                                               decoder_embedded_input,
                                               sequence_length,
                                               decoding_scope,
                                               output_function,
                                               keep_prob,
                                               batch_size
                                               )
    # Reuse the variables created above for the inference decoder.
    decoding_scope.reuse_variables()
    # decode_test_set takes twelve arguments; `sequence_length` sits between
    # `num_words` and `decoding_scope`. Leaving it out raised a TypeError.
    test_predictions = decode_test_set(encoder_state,
                                       decoder_cell,
                                       decoder_embedded_matrix,
                                       word2int['<SOS>'],
                                       word2int['<EOS>'],
                                       sequence_length - 1,  # maximum_length
                                       num_words,
                                       sequence_length,
                                       decoding_scope,
                                       output_function,
                                       keep_prob,
                                       batch_size
                                       )
  return training_predictions, test_predictions

### **BUILDING THE SEQUENCE TO SEQUENCE MODEL**

In [35]:
def seq2seq_model(inputs, targets, keep_prob, batch_size, sequence_length, answers_num_words, questions_num_words, encoder_embedding_size, decoder_embedding_size, rnn_size, num_layers, questionsWords2int) : 
  """Assemble the encoder and the decoder into the full sequence-to-sequence model.

  Args:
    inputs: input ids that are embedded and fed to the encoder.
    targets: target ids that are preprocessed and fed to the decoder.
    keep_prob: keep probability for dropout.
    batch_size: batch size used by target preprocessing and the decoder.
    sequence_length: sequence length passed to the encoder and the decoder.
    answers_num_words: `answers_num_words + 1` is the vocabulary size of the
      encoder embedding.
    questions_num_words: `questions_num_words + 1` is the number of rows of the
      decoder embeddings matrix; the value itself is passed to the decoder as
      its number of words.
    encoder_embedding_size: embedding dimension on the encoder side.
    decoder_embedding_size: embedding dimension on the decoder side.
    rnn_size: number of units per LSTM cell.
    num_layers: number of stacked LSTM layers.
    questionsWords2int: vocabulary passed to target preprocessing and the decoder.

  Returns:
    A tuple `(training_predictions, test_predictions)` from `decoder_rnn`.
  """
  # Putting together the encoder and the decoder
  encoder_embedded_input = tf.contrib.layers.embed_sequence(inputs, 
                                                            answers_num_words+1,
                                                            encoder_embedding_size,
                                                            initializer = tf.random_uniform_initializer(0,1)
                                                            )
  encoder_state = encoder_rnn(encoder_embedded_input, rnn_size, num_layers, keep_prob, sequence_length)
  preprocessed_targets = preprocess_targets(targets, questionsWords2int, batch_size)
  decoder_embeddings_matrix = tf.Variable(tf.random_uniform([questions_num_words+1, decoder_embedding_size], 0, 1))
  decoder_embedded_input = tf.nn.embedding_lookup(decoder_embeddings_matrix, preprocessed_targets)
  # Pass the names that actually exist in this scope: the matrix created above
  # (`decoder_embeddings_matrix`) and `questions_num_words`. The former
  # `decoder_embedded_matrix` and `num_words` were undefined here.
  training_predictions, test_predictions = decoder_rnn(decoder_embedded_input, 
                                                       decoder_embeddings_matrix, 
                                                       encoder_state, 
                                                       questions_num_words, 
                                                       sequence_length, 
                                                       rnn_size, 
                                                       num_layers, 
                                                       questionsWords2int,
                                                       keep_prob, 
                                                       batch_size
                                                       )
  return training_predictions, test_predictions