In [1]:
# import libraries
import numpy as np
import re
import time
import tensorflow as tf

In [26]:
# import the datasets
# Each corpus file is read inside a `with` block so its handle is closed as
# soon as the text has been read, instead of staying open for the lifetime of
# the kernel. Undecodable bytes are skipped (errors="ignore") and the text is
# split into one string per line.
with open('cornell movie-dialogs corpus/movie_lines.txt', encoding='utf-8',
          errors="ignore") as lines_file:
    lines = lines_file.read().split("\n")
with open('cornell movie-dialogs corpus/movie_conversations.txt',
          encoding='utf-8', errors="ignore") as conversations_file:
    conversations = conversations_file.read().split("\n")

In [27]:
# Dictionary mapping each line id to the text of that line.
# Every record is split on the " +++$+++ " separator; only records that yield
# exactly five fields are kept (field 0 becomes the key, field 4 the value).
idToLine = {
    fields[0]: fields[4]
    for fields in (record.split(" +++$+++ ") for record in lines)
    if len(fields) == 5
}

In [28]:
# Build, for every conversation record, the ordered list of its line ids.
# The final element of `conversations` is not processed.
conversations_ids = []
for record in conversations[:-1]:
    # Take the last " +++$+++ "-separated field, drop its first and last
    # character (the enclosing square brackets), then remove quotes and spaces.
    id_field = record.split(" +++$+++ ")[-1]
    id_field = id_field[1:-1]
    id_field = id_field.replace("'", "").replace(" ", "")
    conversations_ids.append(id_field.split(','))

In [29]:
# Pair every line of a conversation with the line that follows it: the earlier
# line is stored as a question and the later line as its answer.
rawQuestions = []
rawAnswers = []

for line_ids in conversations_ids:
    for asked_id, replied_id in zip(line_ids, line_ids[1:]):
        rawQuestions.append(idToLine[asked_id])
        rawAnswers.append(idToLine[replied_id])

In [30]:
# clean the texts

# Lower-case contraction -> expansion table used by clean_text. It is built
# once at module level instead of on every call.
# NOTE(review): many expansions list two alternatives separated by " / " and
# some contain a capital "I"; both end up verbatim in the cleaned text (and so
# in the vocabulary). Confirm whether that is intended.
_CONTRACTIONS = {
    "ain't": "am not / are not",
    "aren't": "are not / am not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had / he would",
    "he'd've": "he would have",
    "he'll": "he shall / he will",
    "he'll've": "he shall have / he will have",
    "he's": "he has / he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how has / how is",
    "i'd": "I had / I would",
    "i'd've": "I would have",
    "i'll": "I shall / I will",
    "i'll've": "I shall have / I will have",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it had / it would",
    "it'd've": "it would have",
    "it'll": "it shall / it will",
    "it'll've": "it shall have / it will have",
    "it's": "it has / it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had / she would",
    "she'd've": "she would have",
    "she'll": "she shall / she will",
    "she'll've": "she shall have / she will have",
    "she's": "she has / she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as / so is",
    "that'd": "that would / that had",
    "that'd've": "that would have",
    "that's": "that has / that is",
    "there'd": "there had / there would",
    "there'd've": "there would have",
    "there's": "there has / there is",
    "they'd": "they had / they would",
    "they'd've": "they would have",
    "they'll": "they shall / they will",
    "they'll've": "they shall have / they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had / we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what shall / what will",
    "what'll've": "what shall have / what will have",
    "what're": "what are",
    "what's": "what has / what is",
    "what've": "what have",
    "when's": "when has / when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where has / where is",
    "where've": "where have",
    "who'll": "who shall / who will",
    "who'll've": "who shall have / who will have",
    "who's": "who has / who is",
    "who've": "who have",
    "why's": "why has / why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had / you would",
    "you'd've": "you would have",
    "you'll": "you shall / you will",
    "you'll've": "you shall have / you will have",
    "you're": "you are",
    "you've": "you have"
}

# One match per whitespace-delimited token; the whitespace between tokens is
# never matched and therefore survives the substitution untouched.
_TOKEN_PATTERN = re.compile(r"\S+")


def clean_text(text):
    """Lower-case `text` and expand the contractions it contains.

    A token is expanded only when the whole whitespace-delimited token is a
    key of the contraction table. The previous implementation called
    ``text.replace(word, ...)``, which also rewrote the same characters inside
    other tokens (for example the "he's" inside "she's").

    Args:
        text: the string to clean.

    Returns:
        The lower-cased string with every matching token replaced by its
        expansion. Whitespace between tokens is preserved as-is.
    """
    lowered = text.lower()
    # A function replacement is used so the expansion text is inserted
    # literally, without any backslash/group processing by `re`.
    return _TOKEN_PATTERN.sub(
        lambda match: _CONTRACTIONS.get(match.group(0), match.group(0)),
        lowered,
    )

In [31]:
# Run every raw question and every raw answer through clean_text.
questions = [clean_text(raw_question) for raw_question in rawQuestions]
answers = [clean_text(raw_answer) for raw_answer in rawAnswers]

In [33]:
# Keep only the (question, answer) pairs in which both sides contain between
# 2 and 25 whitespace-separated words (inclusive).

# First pass: drop the pairs whose question is out of range.
short_questions = []
short_answers = []
for position, candidate in enumerate(questions):
    word_total = len(candidate.split())
    if word_total < 2 or word_total > 25:
        continue
    short_questions.append(candidate)
    short_answers.append(answers[position])

# Second pass: from what is left, drop the pairs whose answer is out of range.
questions = []
answers = []
for position, candidate in enumerate(short_answers):
    word_total = len(candidate.split())
    if word_total < 2 or word_total > 25:
        continue
    answers.append(candidate)
    questions.append(short_questions[position])

In [34]:
# Show the first three questions and, on the next line, the first three answers.
print(questions[:3], answers[:3], sep="\n")

['can we make this quick?  roxanne korrine and andrew barrett are having an incredibly horrendous public break- up on the quad.  again.', 'well, i thought we had / we would start with pronunciation, if that has / that is okay with you.', 'not the hacking and gagging and spitting part.  please.']
['well, i thought we had / we would start with pronunciation, if that has / that is okay with you.', 'not the hacking and gagging and spitting part.  please.', "okay... then how 'bout we try out some french cuisine.  saturday?  night?"]


In [9]:
# Count how many times each whitespace-separated word occurs in the questions.
# Nothing is removed here; this cell only builds the frequency table.
word2count = {}
for sentence in questions:
    for token in sentence.split():
        word2count[token] = word2count.get(token, 0) + 1

In [10]:
# Add the word occurrences found in the answers to the same frequency table.
for sentence in answers:
    for token in sentence.split():
        word2count[token] = word2count.get(token, 0) + 1

In [11]:
# Dictionaries mapping the words of the questions and of the answers to a
# unique integer id. Both are derived from the same word2count table.

# A word is kept only when it occurs at least `threshold` times.
threshold = 20

# Ids are handed out in iteration order, starting at 0: the size of the
# dictionary just before an insertion is the next free id.
questionswords2int = {}
for word, occurrences in word2count.items():
    if occurrences < threshold:
        continue
    questionswords2int[word] = len(questionswords2int)

answerswords2int = {}
for word, occurrences in word2count.items():
    if occurrences < threshold:
        continue
    answerswords2int[word] = len(answerswords2int)

In [12]:
# adding the special tokens to these two dictionaries
# Each token receives the next free id, i.e. the current size of the
# dictionary. The previous `len(...) + 1` skipped one id, leaving an integer
# inside the id range that maps to no word at all.
# The membership test makes the cell safe to re-run: without it a second run
# would assign the same id to all four tokens.
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
for token in tokens:
    if token not in questionswords2int:
        questionswords2int[token] = len(questionswords2int)
    if token not in answerswords2int:
        answerswords2int[token] = len(answerswords2int)

In [13]:
# Invert answerswords2int so that an integer id can be mapped back to its word.
answersint2word = {}
for word, word_id in answerswords2int.items():
    answersint2word[word_id] = word

In [14]:
# add EOS token to the end of every answer (required for decoding layers in seq2seq model)
# The suffix check keeps the cell idempotent: re-running it does not append a
# second marker. clean_text lower-cases its input and none of its expansions
# contain "<EOS>", so a cleaned answer can only end with " <EOS>" if this cell
# already processed it.
for i in range(len(answers)):
    if not answers[i].endswith(' <EOS>'):
        answers[i] += ' <EOS>'

In [15]:
# Encode every question and every answer as a list of integer ids. A word that
# is missing from the corresponding dictionary is encoded with the <OUT> id.
questions_to_int = [
    [questionswords2int.get(word, questionswords2int["<OUT>"]) for word in question.split()]
    for question in questions
]

answers_to_int = [
    [answerswords2int.get(word, answerswords2int["<OUT>"]) for word in answer.split()]
    for answer in answers
]

In [16]:
# Order the encoded pairs by question length (1 to 25 ids), shortest first, so
# that questions of similar length end up next to each other.
sorted_questions = []
sorted_answers = []

# Positions of the questions whose length is in range, in their original order.
in_range_positions = [
    position
    for position, encoded_question in enumerate(questions_to_int)
    if 1 <= len(encoded_question) <= 25
]

# sorted() is stable, so questions of equal length keep their original order.
for position in sorted(in_range_positions, key=lambda pos: len(questions_to_int[pos])):
    sorted_questions.append(questions_to_int[position])
    sorted_answers.append(answers_to_int[position])

## TensorFlow 1: building the seq2seq model

In [17]:
# create placeholders for the inputs and targets of the model
def model_inputs():
    """Create the placeholders that are fed to the model.

    Returns:
        A 4-tuple ``(inputs, targets, lr, keep_prob)``:
        ``inputs`` and ``targets`` are int32 placeholders of shape
        ``[None, None]``; ``lr`` and ``keep_prob`` are float32 placeholders
        created without an explicit shape.
    """
    inputs = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='target')
    # Graph op names may not contain spaces, so the former 'learning rate'
    # name is spelled with an underscore.
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    # Fixed: this used to return the undefined name `tagerts`.
    return inputs, targets, lr, keep_prob

In [18]:
# Preprocessing the targets
def preprocess_targets(targets, word2int, batch_size):
    '''
    Prepend the <SOS> id to every row of the targets.

    A [batch_size, 1] tensor filled with word2int["<SOS>"] is concatenated,
    along axis 1, in front of the targets with their last column sliced off.
    '''
    sos_column = tf.fill([batch_size, 1], word2int["<SOS>"])
    # End index -1 on the second axis stops before each row's last column.
    targets_without_last = tf.strided_slice(targets, [0, 0], [batch_size, -1], [1, 1])
    return tf.concat([sos_column, targets_without_last], axis=1)

In [19]:
# create the encoder RNN layer
def encoder_rnn(rnn_inputs, rnn_size, num_layers, keep_prob, seq_length):
    """Build the stacked bidirectional LSTM encoder and return its state.

    Args:
        rnn_inputs: tensor passed as `inputs` to the bidirectional RNN.
        rnn_size: number of units of the LSTM cell.
        num_layers: number of times the cell is stacked in the MultiRNNCell.
        keep_prob: keep probability applied to the cell inputs by the
            dropout wrapper.
        seq_length: value passed as `sequence_length` to the bidirectional RNN.

    Returns:
        The state returned by tf.nn.bidirectional_dynamic_rnn; the outputs
        are discarded.
    """
    # define lstm layer and add dropout on its inputs
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm,
                                                 input_keep_prob=keep_prob)

    # take a list of RNNCells and wrap them into a single cell
    # Fixed: the module is tf.contrib.rnn (it was misspelled "runn").
    # NOTE(review): the same wrapped cell object is repeated for every layer
    # and used for both directions; confirm this is accepted by the installed
    # TensorFlow 1.x release.
    encoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)

    # pass through the bidirectional rnn to obtain the state
    _, encoder_state = tf.nn.bidirectional_dynamic_rnn(cell_fw=encoder_cell,
                                                       cell_bw=encoder_cell,
                                                       sequence_length=seq_length,
                                                       inputs=rnn_inputs,
                                                       dtype=tf.float32)

    return encoder_state

In [20]:
# decoding the training set
def decode_training_set(encoder_state, decoder_cell, decoder_embedded_input,
                        sequence_length, decoding_scope, output_function,
                        keep_prob, batch_size):
    """Run the attention decoder in training mode.

    The attention inputs are prepared from an all-zero
    [batch_size, 1, decoder_cell.output_size] tensor, the decoder is run on
    the embedded inputs starting from encoder_state[0], dropout is applied to
    its outputs with `keep_prob`, and the result is passed through
    `output_function`.

    Returns:
        Whatever `output_function` returns for the dropped-out decoder outputs.
    """
    num_units = decoder_cell.output_size

    # prepare the attention keys/values and the score/construct functions
    attention_states = tf.zeros([batch_size, 1, num_units])
    (attention_keys,
     attention_values,
     attention_score_function,
     attention_construct_function) = tf.contrib.seq2seq.prepare_attention(
        attention_states,
        attention_option='bahdanau',
        num_units=num_units)

    training_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_train(
        encoder_state[0],
        attention_keys,
        attention_values,
        attention_score_function,
        attention_construct_function,
        name="attn_dec_train")

    # only the decoder outputs are used; the two final states are ignored
    decoder_output, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(
        decoder_cell,
        training_decoder_function,
        decoder_embedded_input,
        sequence_length,
        scope=decoding_scope)

    return output_function(tf.nn.dropout(decoder_output, keep_prob))

In [21]:
# decoding the test/validation set
def decode_test_set(encoder_state, decoder_cell, decoder_embeddings_matrix, sos_id, eos_id, max_length, num_words,
                    sequence_length, decoding_scope, output_function, keep_prob, batch_size):
    """Run the attention decoder in inference mode and return its predictions.

    The attention inputs are prepared from an all-zero
    [batch_size, 1, decoder_cell.output_size] tensor. `sequence_length` and
    `keep_prob` are accepted but not used by this function.
    """
    num_units = decoder_cell.output_size

    attention_states = tf.zeros([batch_size, 1, num_units])
    (attention_keys,
     attention_values,
     attention_score_function,
     attention_construct_function) = tf.contrib.seq2seq.prepare_attention(
        attention_states,
        attention_option='bahdanau',
        num_units=num_units)

    test_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_inference(
        output_function,
        encoder_state[0],
        attention_keys,
        attention_values,
        attention_score_function,
        attention_construct_function,
        decoder_embeddings_matrix,
        sos_id,
        eos_id,
        max_length,
        num_words,
        name="attn_dec_inf")

    # only the predictions are returned; the two final states are ignored
    test_predictions, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(
        decoder_cell,
        test_decoder_function,
        scope=decoding_scope)

    return test_predictions

In [22]:
# creating the decoder rnn
def decoder_rnn(decoder_embedded_input, decoder_embeddings_matrix,
                encoder_state, num_words, sequence_length, rnn_size,
                num_layers, word2int, keep_prob, batch_size):
    """Build the decoder and return its training and test predictions.

    Args:
        decoder_embedded_input: embedded decoder inputs used for training.
        decoder_embeddings_matrix: embeddings matrix used at inference time.
        encoder_state: state returned by the encoder.
        num_words: number of units of the fully connected output layer.
        sequence_length: sequence length forwarded to both decoding functions;
            `sequence_length - 1` is used as the maximum inference length.
        rnn_size: number of units of the LSTM cell.
        num_layers: number of times the cell is stacked in the MultiRNNCell.
        word2int: dictionary providing the '<SOS>' and '<EOS>' ids.
        keep_prob: dropout keep probability.
        batch_size: batch size forwarded to both decoding functions.

    Returns:
        A 2-tuple ``(training_predictions, test_predictions)``.
    """
    with tf.variable_scope("decoding") as decoding_scope:
        # Fixed: BasicLSTMCell needs the number of units.
        lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
        lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob=keep_prob)

        # take a list of RNNCells and wrap them into a single cell
        # Fixed: the module is tf.contrib.rnn (it was misspelled "runn").
        decoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)

        # initializers of the output layer: truncated normal weights, zero biases
        weights = tf.truncated_normal_initializer(stddev=0.1)
        biases = tf.zeros_initializer()

        # Fixed: the keyword is `weights_initializer` (no trailing "s").
        output_function = lambda x: tf.contrib.layers.fully_connected(x,
                                                                      num_words,
                                                                      None,
                                                                      scope=decoding_scope,
                                                                      weights_initializer=weights,
                                                                      biases_initializer=biases)

        training_predictions = decode_training_set(encoder_state,
                                                   decoder_cell,
                                                   decoder_embedded_input,
                                                   sequence_length,
                                                   decoding_scope,
                                                   output_function,
                                                   keep_prob,
                                                   batch_size)

        # the inference decoder reuses the variables created for training
        decoding_scope.reuse_variables()
        # Fixed: the result is bound to `test_predictions` (it was misspelled,
        # so the return below raised NameError), and `sequence_length` is now
        # passed in the position decode_test_set expects it.
        test_predictions = decode_test_set(encoder_state,
                                           decoder_cell,
                                           decoder_embeddings_matrix,
                                           word2int['<SOS>'],
                                           word2int['<EOS>'],
                                           sequence_length - 1,
                                           num_words,
                                           sequence_length,
                                           decoding_scope,
                                           output_function,
                                           keep_prob,
                                           batch_size)

    return training_predictions, test_predictions

In [23]:
# building the seq2seq model
def seq2seq_model(inputs, targets, keep_prob, batch_size, seq_length, answers_num_words,
                  questions_num_words, encoder_embedding_size, decoder_embedding_size, rnn_size,
                  num_layers, questionswords2int):
    """Assemble the encoder and the decoder and return both prediction tensors.

    Args:
        inputs: integer ids fed to the encoder embedding.
        targets: integer ids of the targets.
        keep_prob: dropout keep probability.
        batch_size: batch size.
        seq_length: sequence length forwarded to the encoder and the decoder.
        answers_num_words: `answers_num_words + 1` is the vocabulary size of
            the encoder embedding.
        questions_num_words: `questions_num_words + 1` is the number of rows
            of the decoder embeddings matrix; the value itself is the number
            of decoder output words.
        encoder_embedding_size: embedding dimension of the encoder.
        decoder_embedding_size: embedding dimension of the decoder.
        rnn_size: number of units of the LSTM cells.
        num_layers: number of stacked cells.
        questionswords2int: dictionary providing the special token ids.

    Returns:
        A 2-tuple ``(training_predictions, test_predictions)``.
    """
    # NOTE(review): the encoder is sized with answers_num_words and the decoder
    # with questions_num_words, which looks swapped - confirm against the caller.
    encoder_embedded_input = tf.contrib.layers.embed_sequence(inputs, answers_num_words + 1,
                                                              encoder_embedding_size,
                                                              initializer=tf.random_uniform_initializer(0, 1))
    # Fixed: the result is bound to `encoder_state` (it was misspelled) and the
    # `seq_length` parameter is used instead of a notebook-level global.
    encoder_state = encoder_rnn(encoder_embedded_input, rnn_size, num_layers, keep_prob, seq_length)

    # preprocess the targets
    preprocessed_targets = preprocess_targets(targets, questionswords2int, batch_size)

    # initialize the embeddings matrix of the decoder with random numbers between 0 and 1
    # Fixed: `tf.Variabletf.random_uniform(...)` was missing the call parentheses.
    decoder_embeddings_matrix = tf.Variable(tf.random_uniform([questions_num_words + 1, decoder_embedding_size], 0, 1))

    # get the embedded inputs
    decoder_embedded_input = tf.nn.embedding_lookup(decoder_embeddings_matrix, preprocessed_targets)

    training_predictions, test_predictions = decoder_rnn(decoder_embedded_input, decoder_embeddings_matrix, encoder_state,
                                                         questions_num_words, seq_length, rnn_size, num_layers,
                                                         questionswords2int, keep_prob, batch_size)
    return training_predictions, test_predictions

## Training seq2seq model

In [24]:
# setting the hyperparameters

# training schedule
epochs = 10
batch_size = 64

# learning rate: starting value, multiplicative decay factor and lower bound
learning_rate = 0.01
learning_rate_decay = 0.9
min_learning_rate = 0.0001

# network size
rnn_size = 512
num_layers = 3
encoding_embedding_size = decoding_embedding_size = 512

# keep probability used for dropout during training (0.5 drops 50% of the units)
keep_probability = 0.5

In [25]:
# define session
# Start from an empty default graph so that re-running the notebook does not
# pile new ops on top of a previously built graph.
# NOTE(review): reset_default_graph, InteractiveSession and the tf.contrib
# calls used throughout this notebook are TensorFlow 1.x APIs; the kernel
# needs a 1.x installation.
tf.reset_default_graph()
# Fixed: the class is `InteractiveSession` (capital "I").
session = tf.InteractiveSession()

AttributeError: module 'tensorflow' has no attribute 'reset_default_graph'

In [None]:
# Create the model's placeholders by calling model_inputs() and unpack the
# four returned values into inputs, targets, lr and keep_prob.
inputs, targets, lr, keep_prob = model_inputs()

In [None]:
# setting the sequence length
# max_length is both the default value of the placeholder below and the width
# of the padded input built in the chat loop.
max_length = 25
# Placeholder with a default: evaluates to max_length unless another value is
# fed for it (the training loop feeds the width of each padded answer batch).
sequence_length = tf.placeholder_with_default(max_length, None, name = 'sequence_length')

In [None]:
# Shape of the inputs tensor, evaluated at run time; its first entry is used
# as the number of rows of the loss weights in the optimization cell.
input_shape = tf.shape(inputs)

In [None]:
# get the training and test predictions
# The input ids are reversed along their last axis before being handed to the
# model.
# Fixed: `decoding_embedding_size` was misspelled, which raised NameError.
training_predictions, test_predictions = seq2seq_model(tf.reverse(inputs, [-1]), targets, keep_prob, batch_size, sequence_length,
                                                       len(answerswords2int), len(questionswords2int), encoding_embedding_size,
                                                       decoding_embedding_size, rnn_size, num_layers, questionswords2int)

In [None]:
# setting up the loss error, the optimizer and gradient clipping
with tf.name_scope('optimization'):
    # Every position has weight 1 in the loss: the weights tensor is all ones
    # with shape [number of input rows, sequence_length].
    loss_error = tf.contrib.seq2seq.sequence_loss(training_predictions, targets, tf.ones([input_shape[0], sequence_length]))
    # Fixed: the optimizer now reads the `lr` placeholder. It used to be built
    # from the Python constant `learning_rate`, so the value fed for `lr` (and
    # therefore the learning-rate decay of the training loop) had no effect.
    optimizer = tf.train.AdamOptimizer(lr)
    gradients = optimizer.compute_gradients(loss_error)
    # Clip every gradient element to [-5, 5]; variables without a gradient are skipped.
    clipped_gradients = [(tf.clip_by_value(grad_tensor, -5., 5.), grad_variable) for grad_tensor, grad_variable in gradients if grad_tensor is not None]
    optimizer_gradient_clipping = optimizer.apply_gradients(clipped_gradients)

In [None]:
# padding the sequences of a batch with the <PAD> token so that they all have the same length
def apply_padding(batch_of_sequences, word2int):
    """Right-pad every sequence of a batch to the length of the longest one.

    Args:
        batch_of_sequences: list of lists of integer ids.
        word2int: dictionary providing the "<PAD>" id.

    Returns:
        A new list holding one new list per input sequence, each extended with
        the "<PAD>" id up to the longest length found in the batch. The input
        lists are not modified. An empty batch yields an empty list (it used
        to raise ValueError from max()).
    """
    pad_id = word2int["<PAD>"]
    # default=0 covers the empty batch, for which max() has nothing to compare.
    max_sequence_length = max((len(sequence) for sequence in batch_of_sequences), default=0)
    return [sequence + [pad_id] * (max_sequence_length - len(sequence)) for sequence in batch_of_sequences]

In [None]:
# split the data into batches of questions and answers
def split_into_batches(questions, answers, batch_size):
    """Yield padded (questions, answers) batches as pairs of NumPy arrays.

    Only full batches are produced: `len(questions) // batch_size` batches of
    `batch_size` consecutive items each. Questions are padded with the <PAD>
    id of questionswords2int, answers with the one of answerswords2int.
    """
    full_batches = len(questions) // batch_size
    for start_index in range(0, full_batches * batch_size, batch_size):
        stop_index = start_index + batch_size
        padded_questions = apply_padding(questions[start_index:stop_index], questionswords2int)
        padded_answers = apply_padding(answers[start_index:stop_index], answerswords2int)
        yield np.array(padded_questions), np.array(padded_answers)

In [None]:
# Split the sorted pairs into a validation set (the first 15%) and a training
# set (everything after it).
# NOTE(review): sorted_questions is ordered by question length, so the
# validation slice holds the shortest questions - confirm this is intended.
training_validation_split = int(len(sorted_questions)*0.15)

validation_questions, training_questions = (
    sorted_questions[:training_validation_split],
    sorted_questions[training_validation_split:],
)
validation_answers, training_answers = (
    sorted_answers[:training_validation_split],
    sorted_answers[training_validation_split:],
)

In [None]:
# training the model

# check the training loss every batch_index_check_training_loss batches and the
# validation loss half-way through the epoch and at its end
batch_index_check_training_loss = 100
# Clamped to at least 1: with few training batches the raw value can be 0 or
# negative, which would make the modulo test below fail or misbehave.
batch_index_check_validation_loss = max(1, (len(training_questions) // batch_size // 2) - 1)
total_training_loss_error = 0

# elements for early stopping
list_validation_loss_error = []
early_stopping_check = 0
early_stopping_stop = 100
checkpoint = "chatbot_weights.ckpt"

# initialize global variables
session.run(tf.global_variables_initializer())

# The saver is created once, instead of once per improved validation loss.
saver = tf.train.Saver()

for epoch in range(1, epochs+1):
    for batch_index, (padded_questions_in_batch, padded_answers_in_batch) in enumerate(split_into_batches(training_questions, training_answers, batch_size)):
        starting_time = time.time()
        _, batch_training_loss_error = session.run([optimizer_gradient_clipping, loss_error], {inputs: padded_questions_in_batch, targets: padded_answers_in_batch, lr: learning_rate, sequence_length: padded_answers_in_batch.shape[1], keep_prob: keep_probability})
        total_training_loss_error += batch_training_loss_error
        ending_time = time.time()
        batch_time = ending_time - starting_time

        if batch_index % batch_index_check_training_loss == 0:
            # Fixed: the template now has one field per argument. It used to
            # lack the field for the total number of batches, so every later
            # value shifted by one and `{:d}` received a float (ValueError).
            print("Epoch: {:>3}/{}, Batch:{:>4}/{}, Training Loss Error: {:>6.3f}, Training Time on 100 Batches: {:d} seconds".format(epoch, epochs, batch_index, len(training_questions)//batch_size, total_training_loss_error/batch_index_check_training_loss, int(batch_time*batch_index_check_training_loss)))
            total_training_loss_error = 0

        if batch_index % batch_index_check_validation_loss == 0 and batch_index > 0:
            total_validation_loss_error = 0
            starting_time = time.time()
            for batch_index_validation, (padded_questions_in_batch, padded_answers_in_batch) in enumerate(split_into_batches(validation_questions, validation_answers, batch_size)):
                # reminder: the optimizer is not run and dropout is disabled
                # (keep_prob = 1) during validation
                batch_validation_loss_error = session.run(loss_error, {inputs: padded_questions_in_batch, targets: padded_answers_in_batch, lr: learning_rate, sequence_length: padded_answers_in_batch.shape[1], keep_prob: 1})
                total_validation_loss_error += batch_validation_loss_error
            ending_time = time.time()
            batch_time = ending_time - starting_time
            average_validation_loss_error = total_validation_loss_error / (len(validation_questions) / batch_size)
            print("Validation Loss Error: {:>6.3f}, Batch Validation Time: {:d} seconds".format(average_validation_loss_error, int(batch_time)))

            # apply decay to the learning rate, never going below the minimum
            # NOTE(review): this rebinds the notebook-level `learning_rate`, so
            # re-running this cell continues from the decayed value.
            learning_rate *= learning_rate_decay
            if learning_rate < min_learning_rate:
                learning_rate = min_learning_rate

            # early stopping: save the weights whenever the validation loss is
            # the best seen so far, otherwise count the consecutive failures
            list_validation_loss_error.append(average_validation_loss_error)
            if average_validation_loss_error <= min(list_validation_loss_error):
                early_stopping_check = 0
                saver.save(session, checkpoint)
            else:
                early_stopping_check += 1
                if early_stopping_check == early_stopping_stop:
                    break

    # the break above only leaves the batch loop; leave the epoch loop as well
    if early_stopping_check == early_stopping_stop:
        break

## Testing seq2seq model

In [None]:
# loading the weights and running the session
# Path of the checkpoint written by the training loop.
checkpoint = './chatbot_weights.ckpt'
# A new interactive session is opened and bound to `session`.
# NOTE(review): the session created earlier in the notebook is not closed
# before this name is rebound - confirm whether it should be.
session = tf.InteractiveSession()
# Variables are initialized first and then restored from the checkpoint.
session.run(tf.global_variables_initializer())
saver = tf.train.Saver()
saver.restore(session, checkpoint)

In [None]:
# converting the questions from strings to lists of encoding integers
def convert_string2int(question, word2int):
    """Clean `question` with clean_text and encode it word by word.

    Each whitespace-separated word is replaced by its id in `word2int`; a word
    that is not in the dictionary is replaced by the id of '<OUT>'.
    """
    encoded = []
    for word in clean_text(question).split():
        encoded.append(word2int.get(word, word2int['<OUT>']))
    return encoded

In [None]:
# setting up the chat
# Reads questions from the user until 'Goodbye' is entered and prints the
# model's answer for each of them.
while True:
    question = input('You: ')
    if question == 'Goodbye':
        break
    question = convert_string2int(question, questionswords2int)
    # Keep at most max_length ids so the row assignment below cannot fail on
    # an over-long question.
    question = question[:max_length]
    # Fixed: pad using the length of this question; `len(questions)` measured
    # the whole training list instead.
    question = question + [questionswords2int["<PAD>"]] * (max_length - len(question))
    # The graph expects batch_size rows: the question goes in the first row and
    # the remaining rows are left at zero.
    fake_batch = np.zeros((batch_size, max_length))
    fake_batch[0] = question
    # Fixed: dropout is disabled at inference time (keep_prob 1.0 instead of 0.5).
    predicted_answer = session.run(test_predictions, {inputs: fake_batch, keep_prob: 1.0})[0]

    answer = ''
    for word_int in np.argmax(predicted_answer, 1):
        # Fixed: the inverse dictionary is named `answersint2word`. An id that
        # has no entry is treated like '<OUT>' instead of raising KeyError.
        word = answersint2word.get(int(word_int), '<OUT>')
        if word == 'i':
            # Fixed: leading space added so "I" is not glued to the previous word.
            token = ' I'
        elif word in ('<EOS>', '<OUT>'):
            token = '.'
        else:
            token = ' ' + word

        answer += token
        # the answer ends at the first end-of-sentence or unknown token
        if token == '.':
            break
    print('ChatBot: ' + answer)