# Building a Chatbot

In [None]:
#Base reference https://tutorials.botsfloor.com/how-to-build-your-first-chatbot-c84495d4622d

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import time
tf.__version__

'1.12.0'

### Inspect and Load the Data

In [3]:
# Load movie conversation data
lines = open('movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')
conv_lines = open('movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

In [4]:
# Create a dictionary to map each line's id with its text
id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        id2line[_line[0]] = _line[4]

In [5]:
# Create a list of all of the conversations' lines' ids.
convs = [ ]
for line in conv_lines[:-1]:
    _line = line.split(' +++$+++ ')[-1][1:-1].replace("'","").replace(" ","")
    convs.append(_line.split(','))

In [6]:
# Sort the sentences into questions (inputs) and answers (targets)
questions = []
answers = []

for conv in convs:
    for i in range(len(conv)-1):
        questions.append(id2line[conv[i]])
        answers.append(id2line[conv[i+1]])

In [7]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''

    text = text.lower()
    
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    
    return text

In [8]:
# Clean the data
clean_questions = []
for question in questions:
    clean_questions.append(clean_text(question))
    
clean_answers = []    
for answer in answers:
    clean_answers.append(clean_text(answer))

In [9]:
# Find the length of sentences
lengths = []
for question in clean_questions:
    lengths.append(len(question.split()))
for answer in clean_answers:
    lengths.append(len(answer.split()))

# Create a dataframe so that the values can be inspected
lengths = pd.DataFrame(lengths, columns=['counts'])

In [10]:
# Remove questions and answers that are shorter than 2 words and longer than 20 words.
min_line_length = 2
max_line_length = 20

# Filter out the questions that are too short/long
short_questions_temp = []
short_answers_temp = []

i = 0
for question in clean_questions:
    if len(question.split()) >= min_line_length and len(question.split()) <= max_line_length:
        short_questions_temp.append(question)
        short_answers_temp.append(clean_answers[i])
    i += 1

# Filter out the answers that are too short/long
short_questions = []
short_answers = []

i = 0
for answer in short_answers_temp:
    if len(answer.split()) >= min_line_length and len(answer.split()) <= max_line_length:
        short_answers.append(answer)
        short_questions.append(short_questions_temp[i])
    i += 1

In [11]:
# Create a dictionary for the frequency of the vocabulary
vocab = {}
for question in short_questions:
    for word in question.split():
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1
            
for answer in short_answers:
    for word in answer.split():
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1

In [12]:
threshold = 7
count = 0
for k,v in vocab.items():
    if v >= threshold:
        count += 1

In [13]:

questions_vocab_to_int = {}

word_num = 0
for word, count in vocab.items():
    if count >= threshold:
        questions_vocab_to_int[word] = word_num
        word_num += 1
        
answers_vocab_to_int = {}

word_num = 0
for word, count in vocab.items():
    if count >= threshold:
        answers_vocab_to_int[word] = word_num
        word_num += 1

In [14]:
# Add the unique tokens to the vocabulary dictionaries.
codes = ['<PAD>','<EOS>','<UNK>','<GO>']

for code in codes:
    questions_vocab_to_int[code] = len(questions_vocab_to_int)+1
    
for code in codes:
    answers_vocab_to_int[code] = len(answers_vocab_to_int)+1

In [15]:
# Create dictionaries to map the unique integers to their respective words.
# i.e. an inverse dictionary for vocab_to_int.
questions_int_to_vocab = {v_i: v for v, v_i in questions_vocab_to_int.items()}
answers_int_to_vocab = {v_i: v for v, v_i in answers_vocab_to_int.items()}

In [16]:
# Add the end of sentence token to the end of every answer.
for i in range(len(short_answers)):
    short_answers[i] += ' <EOS>'

In [17]:
# Convert the text to integers. 
# Replace any words that are not in the respective vocabulary with <UNK> 
questions_int = []
for question in short_questions:
    ints = []
    for word in question.split():
        if word not in questions_vocab_to_int:
            ints.append(questions_vocab_to_int['<UNK>'])
        else:
            ints.append(questions_vocab_to_int[word])
    questions_int.append(ints)
    
answers_int = []
for answer in short_answers:
    ints = []
    for word in answer.split():
        if word not in answers_vocab_to_int:
            ints.append(answers_vocab_to_int['<UNK>'])
        else:
            ints.append(answers_vocab_to_int[word])
    answers_int.append(ints)

In [18]:
# Calculate what percentage of all words have been replaced with <UNK>
word_count = 0
unk_count = 0

for question in questions_int:
    for word in question:
        if word == questions_vocab_to_int["<UNK>"]:
            unk_count += 1
        word_count += 1
    
for answer in answers_int:
    for word in answer:
        if word == answers_vocab_to_int["<UNK>"]:
            unk_count += 1
        word_count += 1

In [19]:
sorted_questions = []
sorted_answers = []

for length in range(1, max_line_length+1):
    for i in enumerate(questions_int):
        if len(i[1]) == length:
            sorted_questions.append(questions_int[i[0]])
            sorted_answers.append(answers_int[i[0]])


In [20]:
def model_inputs():
    with tf.name_scope('model_inputs') as scope:
        inputs = tf.placeholder(tf.int32, [None, None], name='input_sequence')
        print(inputs.name)
        targets = tf.placeholder(tf.int32, [None, None], name='target_sequence')
        source_sequence_length = tf.placeholder(tf.int32,[None], name='source_sequence_length')
        target_sequence_length = tf.placeholder(tf.int32, [None], name='target_sequence_length')
        max_target_len = tf.reduce_max(target_sequence_length)    
        return inputs, targets, source_sequence_length, target_sequence_length, max_target_len

In [21]:
def hyperparam_inputs():
    with tf.name_scope('hyperparam_inputs') as scope:
        lr_rate = tf.placeholder(tf.float32, name='learning_rate')
        keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        return lr_rate, keep_prob

In [22]:
def process_decoder_input(target_data, target_vocab_to_int, batch_size):
    with tf.name_scope('process_decoder_input') as scope:
        after_slicing = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1],
                                         name='strided_slice')
        after_concat = tf.concat( [tf.fill([batch_size, 1], target_vocab_to_int['<GO>']), 
                                   after_slicing], 1,name='concatGo')
        return after_concat

In [23]:
def encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob, 
                   source_vocab_size, 
                   encoding_embedding_size,
                   isBiDirectional):
    with tf.name_scope('encoding_layer') as scope:
        embed = tf.contrib.layers.embed_sequence(rnn_inputs, 
                                                 vocab_size=source_vocab_size, 
                                                 embed_dim=encoding_embedding_size)
        stacked_cells_fw = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.LSTMCell(rnn_size), keep_prob) for _ in range(num_layers)])
        
        stacked_cells_bw = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.LSTMCell(rnn_size), keep_prob) for _ in range(num_layers)])
        
        
        if(False == isBiDirectional):
          outputs, state = tf.nn.dynamic_rnn(stacked_cells_fw, 
                                             embed, 
                                             dtype=tf.float32)
        else:
          outputs, state = tf.nn.bidirectional_dynamic_rnn(cell_fw = stacked_cells_fw,
                                                     cell_bw = stacked_cells_bw,
                                                     sequence_length = None,
                                                     inputs = embed, 
                                                     dtype=tf.float32)
        return outputs, state

In [24]:
def decoding_layer_train(initial_state, dec_cell, dec_embed_input, 
                         target_sequence_length, max_summary_length, 
                         output_layer, keep_prob):
   with tf.name_scope('decoding_layer_train') as scope:
        helper = tf.contrib.seq2seq.TrainingHelper(dec_embed_input, 
                                               target_sequence_length,name='TrainingHelper')
        decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, helper, 
                                                  initial_state, output_layer)

        outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, 
                                                      impute_finished=True, 
                                                      maximum_iterations=max_summary_length)
        return outputs

In [25]:
def decoding_layer_infer(initial_state, dec_cell, dec_embeddings, start_of_sequence_id,
                         end_of_sequence_id, max_target_sequence_length,
                         vocab_size, output_layer, batch_size, keep_prob):
   with tf.name_scope('decoding_layer_infer') as scope:
        helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(dec_embeddings, 
                                                      tf.fill([batch_size], start_of_sequence_id), 
                                                      end_of_sequence_id)   
        decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, helper, 
                                                  initial_state, output_layer)
        outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, 
                                                      impute_finished=True, 
                                                      maximum_iterations=max_target_sequence_length)
       
        return outputs

In [26]:
def decoding_layer(dec_input, enc_outputs, encoder_state,
                   source_sequence_length, target_sequence_length, 
                   max_target_sequence_length,
                   rnn_size, num_layers,
                   target_vocab_to_int, target_vocab_size,
                   batch_size, keep_prob, 
                   decoding_embedding_size,
                   withAttentionLayer):
    with tf.name_scope('decoding_layer') as scope:
        dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size+1, 
                                                        decoding_embedding_size],
                                                       name='dec_embeddings'))
        dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input,name='dec_embed_input')
        cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(rnn_size) 
                                             for _ in range(num_layers)])
        
        initial_state = encoder_state
        if(True == withAttentionLayer):
          attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size,
                                                          enc_outputs,
                                                          source_sequence_length,
                                                          name = "Bahdanau")
          cells = tf.contrib.seq2seq.AttentionWrapper(cells,
                                                     attn_mech,
                                                     rnn_size,
                                                     alignment_history=False)
          initial_state = cells.zero_state(batch_size, tf.float32)
        
    
    with tf.variable_scope("decode"):
        output_layer = tf.layers.Dense(target_vocab_size,name='output_layer')
        train_output = decoding_layer_train(initial_state, 
                                            cells, 
                                            dec_embed_input, 
                                            target_sequence_length, 
                                            max_target_sequence_length, 
                                            output_layer, 
                                            keep_prob)
    
    with tf.variable_scope("decode", reuse=True):
        infer_output = decoding_layer_infer(initial_state, 
                                            cells, 
                                            dec_embeddings, 
                                            target_vocab_to_int['<GO>'], 
                                            target_vocab_to_int['<EOS>'], 
                                            max_target_sequence_length, 
                                            target_vocab_size, 
                                            output_layer,
                                            batch_size,
                                            keep_prob)

    return (train_output, infer_output)

In [27]:
def mergeBiDirectionalOutput(enc_states, num_layers):
  fw_state = enc_states[0]
  bw_state = enc_states[1]
  lstm_c = []
  lstm_h = []
  
  for i in range(num_layers):
    lstm_c.append(tf.concat((fw_state[i].c,bw_state[i].c),1))
    lstm_h.append(tf.concat((fw_state[i].h,bw_state[i].h),1))
  
  listForLstmTupleObj = []
  for i in range(num_layers):
    listForLstmTupleObj.append(tf.contrib.rnn.LSTMStateTuple(c = lstm_c[i], h = lstm_h[i]))
 
  tup = tuple(listForLstmTupleObj)
  return tup
 

In [28]:
def seq2seq_model(input_data, target_data, 
                  keep_prob, batch_size,
                  source_sequence_length, target_sequence_length, 
                  max_target_sentence_length,
                  source_vocab_size, target_vocab_size,
                  enc_embedding_size, dec_embedding_size,
                  rnn_size, num_layers, 
                  target_vocab_to_int, 
                  isBiDirectional, withAttentionLayer):
    with tf.name_scope('seq2seq_model') as scope:
        enc_outputs, enc_states = encoding_layer(input_data, 
                                             rnn_size, 
                                             num_layers, 
                                             keep_prob, 
                                             source_vocab_size, 
                                             enc_embedding_size,
                                             isBiDirectional)
        cell_size = rnn_size
        
        if(True == isBiDirectional):
          enc_states = mergeBiDirectionalOutput(enc_states, num_layers)
          cell_size = rnn_size * 2
          
          #Concat output
          fw_cell_op = enc_outputs[0]
          bw_cell_op = enc_outputs[1]
          enc_outputs = tf.concat((fw_cell_op,bw_cell_op),axis=2)
          
        
        dec_input = process_decoder_input(target_data, 
                                      target_vocab_to_int, 
                                      batch_size)
        
        train_output, infer_output = decoding_layer(dec_input,
                                                    enc_outputs,
                                                    enc_states, 
                                                    source_sequence_length,
                                                    target_sequence_length, 
                                                    max_target_sentence_length,
                                                    cell_size,
                                                    num_layers,
                                                    target_vocab_to_int,
                                                    target_vocab_size,
                                                    batch_size,
                                                    keep_prob,
                                                    dec_embedding_size,
                                                    withAttentionLayer)
        
    return train_output, infer_output

In [29]:
# Set the Hyperparameters
epochs = 100
batch_size = 200
num_layers = 1 
rnn_size = 512
encoding_embedding_size = 512
decoding_embedding_size = 512
learning_rate = 0.005
learning_rate_decay = 0.9
min_learning_rate = 0.0001
keep_probability = 0.75
isBiDirectional = True
withAttentionLayer = False

In [30]:

source_vocab_to_int = questions_vocab_to_int
target_vocab_to_int = answers_vocab_to_int

tf.reset_default_graph()
train_graph = tf.Graph()
checkpoint = 'chatbot/model'

with train_graph.as_default():
    with tf.name_scope('train_graph.as_default') as scope:
        sess = tf.InteractiveSession()
        input_data, targets, source_sequence_length, target_sequence_length, max_target_sequence_length = model_inputs()
        lr, keep_prob = hyperparam_inputs()
        train_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),
                                                   targets,
                                                   keep_prob,
                                                   batch_size,
                                                   source_sequence_length,
                                                   target_sequence_length,
                                                   max_target_sequence_length,
                                                   len(source_vocab_to_int),
                                                   len(target_vocab_to_int),
                                                   encoding_embedding_size,
                                                   decoding_embedding_size,
                                                   rnn_size,
                                                   num_layers,
                                                   target_vocab_to_int,
                                                   isBiDirectional,
                                                   withAttentionLayer)
        sess.close()
        training_logits = tf.identity(train_logits.rnn_output, name='logits')
        inference_logits = tf.identity(inference_logits.sample_id, name='predictions')
        masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, 
                                 dtype=tf.float32, name='masks')

        with tf.name_scope("optimization"):
            cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

            optimizer = tf.train.AdamOptimizer(lr)

            gradients = optimizer.compute_gradients(cost)
            capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
            train_op = optimizer.apply_gradients(capped_gradients)
        

train_graph.as_default/model_inputs/input_sequence:0


In [31]:
def pad_sentence_batch(sentence_batch, vocab_to_int):
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [vocab_to_int['<PAD>']] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [32]:
def batch_data(questions, answers, batch_size):
    """Batch questions and answers together"""
    for batch_i in range(0, len(questions)//batch_size):
        start_i = batch_i * batch_size
        questions_batch = questions[start_i:start_i + batch_size]
        answers_batch = answers[start_i:start_i + batch_size]
        pad_questions_batch = np.array(pad_sentence_batch(questions_batch, questions_vocab_to_int))
        pad_answers_batch = np.array(pad_sentence_batch(answers_batch, answers_vocab_to_int))
        answer_batch_size = []
        for target in pad_answers_batch:
            answer_batch_size.append(len(target))
        question_batch_size = []
        for source in pad_questions_batch:
            question_batch_size.append(len(source))        
        yield pad_questions_batch, pad_answers_batch, question_batch_size, answer_batch_size

In [33]:
# Validate the training with 15% of the data
train_valid_split = int(len(sorted_questions)*0.15)

# Split the questions and answers into training and validating data
train_questions = sorted_questions[train_valid_split:]
train_answers = sorted_answers[train_valid_split:]

valid_questions = sorted_questions[:train_valid_split]
valid_answers = sorted_answers[:train_valid_split]




In [34]:
display_step = 100
stop_early = 0 
stop = 5
validation_check = ((len(train_questions))//batch_size//2)-1 
total_train_loss = 0
summary_valid_loss = [] 

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    for epoch_i in range(1, epochs+1):
      for batch_i, (source_batch, target_batch, sources_lengths, targets_lengths) in enumerate(
              batch_data(train_questions, train_answers, batch_size)):
          start_time = time.time()
          _, loss = sess.run(
                  [train_op, cost],
                  {input_data: source_batch,
                   targets: target_batch,
                   lr: learning_rate,
                   source_sequence_length: sources_lengths,
                   target_sequence_length: targets_lengths,
                   keep_prob: keep_probability})

          total_train_loss += loss
          end_time = time.time()
          batch_time = end_time - start_time

          if batch_i % display_step == 0:
              print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
                    .format(epoch_i,
                            epochs, 
                            batch_i, 
                            len(train_questions) // batch_size, 
                            total_train_loss / display_step, 
                            batch_time*display_step))
              total_train_loss = 0

          if batch_i % validation_check == 0 and batch_i > 0:
              total_valid_loss = 0
              start_time = time.time()
              for batch_ii, (questions_batch, answers_batch, question_lengths, answer_lengths) in \
                      enumerate(batch_data(valid_questions, valid_answers, batch_size)):
                  valid_loss = sess.run(cost, {input_data: questions_batch,
                                                  targets: answers_batch,
                                                       lr: learning_rate,
                                   source_sequence_length: question_lengths,
                                   target_sequence_length: answer_lengths,
                                                keep_prob: 1})
                  total_valid_loss += valid_loss
              end_time = time.time()
              batch_time = end_time - start_time
              avg_valid_loss = total_valid_loss / (len(valid_questions) / batch_size)
              print('Valid Loss: {:>6.3f}, Seconds: {:>5.2f}'.format(avg_valid_loss, batch_time))

              # Reduce learning rate, but not below its minimum value
              learning_rate *= learning_rate_decay
              if learning_rate < min_learning_rate:
                  learning_rate = min_learning_rate

              summary_valid_loss.append(avg_valid_loss)
              if avg_valid_loss <= min(summary_valid_loss):
                  print('New Record!') 
                  stop_early = 0
                  saver.save(sess, checkpoint)

              else:
                  print("No Improvement.")
                  stop_early += 1
                  if stop_early == stop:
                      break

      if stop_early == stop:
          print("Stopping Training.")
          break

Epoch   1/100 Batch    0/587 - Loss:  0.093, Seconds: 1650.45


KeyboardInterrupt: 

In [None]:
def question_to_seq(question, vocab_to_int):
    '''Prepare the question for the model'''
    
    question = clean_text(question)
    return [vocab_to_int.get(word, vocab_to_int['<UNK>']) for word in question.split()]

In [None]:
tf.reset_default_graph()
metagraph = "chatbot/model.meta" 
random = np.random.choice(len(short_questions))
input_question = short_questions[random]
batch_size  = 200
# Prepare the question
input_question = question_to_seq(input_question, questions_vocab_to_int)

# Pad the questions until it equals the max_line_length
input_question = input_question + [questions_vocab_to_int["<PAD>"]] * (max_line_length - len(input_question))

with tf.Session() as sess:
    saver = tf.train.import_meta_graph(metagraph)
    saver.restore(sess,checkpoint)
    graph = sess.graph
    logit =  graph.get_tensor_by_name('train_graph.as_default/predictions:0')
    input_source = graph.get_tensor_by_name('train_graph.as_default/model_inputs/input_sequence:0')
    input_len = graph.get_tensor_by_name('train_graph.as_default/model_inputs/source_sequence_length:0')
    output_len = graph.get_tensor_by_name('train_graph.as_default/model_inputs/target_sequence_length:0')
    learning_rate = graph.get_tensor_by_name('train_graph.as_default/hyperparam_inputs/keep_prob:0')
    
    output = sess.run(logit, {input_source: [input_question]*batch_size,
                                         input_len: [len(input_question)]*batch_size, 
                                         output_len: [len(input_question)*2]*batch_size,
                                         learning_rate: 1.0})
    translate_logits = output[0]
    
    pad_q = questions_vocab_to_int["<PAD>"]
    pad_a = answers_vocab_to_int["<PAD>"]

    print("\n translate_logits:",translate_logits.shape)
    print('Input')
    print('  Word Ids:      {}'.format([i for i in input_question]))
    print('  English Words: {}'.format([questions_int_to_vocab[i] for i in input_question]))

    print('\nPrediction')
    print('  Word Ids:      {}'.format([i for i in translate_logits]))
    print('  French Words: {}'.format(" ".join([answers_int_to_vocab[i] for i in translate_logits])))