### What is this notebook?
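This notebook loads movie dialogue (individual lines plus the conversations that group them), preprocesses it into question/answer pairs, and defines the building blocks of a chatbot trained on those pairs with a TensorFlow recurrent neural network.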

In [4]:
import ast
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import time

## Section 1.1: Preprocessing Functions

In [2]:
def load_in_lines(lines_path='data/dialogues/movie_lines.txt',
                  conv_path='data/dialogues/movie_conversations.txt'):
    '''
    Reads the raw movie-line file and the conversation file into lists of rows.
    INPUT
        lines_path: string. Path of the file holding one movie line per row.
        conv_path: string. Path of the file holding one conversation per row.
    OUTPUT
        lines: list of strings. Contents of lines_path split on newlines.
        conv_lines: list of strings. Contents of conv_path split on newlines.
    Because the text is split on the newline character, a file that ends with
    a newline produces a trailing empty string in the returned list.
    '''
    with open(lines_path, 'r') as f:
        lines = f.read().split('\n')
    with open(conv_path, 'r') as g:
        conv_lines = g.read().split('\n')
    return lines, conv_lines

In [4]:
def create_line_mapping(lines):
    '''
    Builds a lookup from line id to the text of that line.
    Every raw row is split on ' +++$+++ '. Rows that do not split into exactly
    five fields are ignored; for the rest, the first field (the line id) is the
    key and the fifth field (the spoken text) is the value.
    '''
    separator = ' +++$+++ '
    all_fields = (raw_row.split(separator) for raw_row in lines)
    return {fields[0]: fields[4] for fields in all_fields if len(fields) == 5}

In [5]:
def format_conv_lines(conv_lines):
    '''
    extracts just the line ids from the conv_lines list.
    returns a list of lists of strings. Each string is a lineid.
    Blank rows (such as the empty string left behind when a file ending in a
    newline is split on newlines) are skipped, so every non-blank row is
    parsed regardless of whether the input ends with an empty row.
    '''
    convs = []
    for conv in conv_lines:
        if not conv.strip():
            # Nothing to parse; literal_eval would fail on an empty string.
            continue
        # The last ' +++$+++ '-separated field is a Python list literal of ids.
        conv_line_list = conv.split(" +++$+++ ")[-1]
        convs.append(ast.literal_eval(conv_line_list))
    return convs

In [6]:
def create_questions_and_answers(convs, line_mapping):
    '''
    Turns conversations into parallel question and answer lists.
    Within each conversation every line is paired with the line that directly
    follows it: the earlier line's text goes into questions and the later
    line's text into answers. Line ids are resolved through line_mapping, so an
    id missing from it raises KeyError.
    '''
    questions, answers = [], []
    for conv in convs:
        # zip against the conversation shifted by one yields adjacent id pairs
        for asked_id, replied_id in zip(conv, conv[1:]):
            questions.append(line_mapping[asked_id])
            answers.append(line_mapping[replied_id])
    return questions, answers

In [7]:
def clean_text(text):
    '''
    Clean text by removing unnecessary characters and altering the format of words.
    The text is lower-cased, common contractions are expanded, and the listed
    punctuation characters are removed.
    INPUT
        text: string. A single line of dialogue.
    OUTPUT
        text: string. The cleaned line.
    '''
    # Applied in order. Order matters: "won't" and "can't" must be handled
    # before the generic "n't" rule, and "n't" before the "n'" -> "ng" rule.
    substitutions = [
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"it's", "it is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"how's", "how is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"n't", " not"),
        (r"n'", "ng"),
        (r"'bout", "about"),
        (r"'til", "until"),
        (r"[-()\"#/@;:<>{}`+=~|.!?,]", ""),
    ]

    text = text.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

In [8]:
def create_clean_questions_and_answers(questions, answers):
    '''
    Runs clean_text over every question and then over every answer.
    Returns two new lists (clean_questions, clean_answers) in the same order
    as the inputs.
    '''
    clean_questions = []
    for raw_question in questions:
        clean_questions.append(clean_text(raw_question))

    clean_answers = []
    for raw_answer in answers:
        clean_answers.append(clean_text(raw_answer))

    return clean_questions, clean_answers

In [26]:
def filter_long_and_short_sentences(questions, answers, min_threshold=2, max_threshold=20):
    '''
    Keeps only the question/answer pairs where BOTH sentences have a word count
    strictly greater than min_threshold and strictly less than max_threshold.
    With the defaults that means 3 to 19 whitespace-separated words.
    Returns the surviving questions and answers as two parallel lists.
    '''
    def _within_bounds(sentence):
        return min_threshold < len(sentence.split()) < max_threshold

    kept_pairs = [
        (question, answer)
        for question, answer in zip(questions, answers)
        if _within_bounds(question) and _within_bounds(answer)
    ]
    questions_filtered = [question for question, _ in kept_pairs]
    answers_filtered = [answer for _, answer in kept_pairs]

    return questions_filtered, answers_filtered

In [30]:
def create_word_frequency_dictionaries(sentence_list):
    '''
    Counts how often each whitespace-separated word occurs across all the
    sentences in sentence_list (either the questions or the answers).
    Returns a dictionary mapping word -> number of occurrences.
    '''
    word_frequencies = {}
    for sentence in sentence_list:
        for word in sentence.split():
            # unseen words start from 0 before being incremented
            word_frequencies[word] = word_frequencies.get(word, 0) + 1
    return word_frequencies

In [43]:
def filter_uncommon_words(vocab, threshold):
    '''
    filters out words from vocab dictionary (output of create_word_frequency_dictionaries)
    that occur fewer than threshold many times.
    INPUT
        vocab: dictionary. maps words to their occurrence counts.
        threshold: number. Minimum count (inclusive) a word needs to be kept.
    OUTPUT
        filtered_vocab: new dictionary holding only the words whose count is
            greater than or equal to threshold.
    '''
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    filtered_vocab = {key: val for key, val in vocab.items() if val >= threshold}
    return filtered_vocab

In [48]:
def create_unique_identifiers(vocab):
    '''
    creates dictionary mapping each word in vocab to a unique number
    INPUT
        vocab: dictionary whose keys are the words to number.
    OUTPUT
        vocab_identifiers: dictionary mapping each word to an integer in
            0 .. len(vocab) - 1.
    Ids are handed out in sorted word order so that the same vocabulary always
    produces the same mapping, independent of set/dict iteration order.
    '''
    vocab_identifiers = {
        word: unique_int for unique_int, word in enumerate(sorted(vocab.keys()))
    }
    return vocab_identifiers

In [54]:
def create_dictionary_tokens(vocab_identifiers):
    '''
    adds identification tokens to the vocab_identifiers dictionary
    INPUT
        vocab_identifiers: dictionary. maps words to unique integers.
    OUTPUT
        identifiers_with_tokens: the SAME dictionary object, now also holding
            '<PAD>', '<EOS>', '<UNK>' and '<GO>'.
    Each token receives its own id, starting one above the largest id already
    in the dictionary (or at 0 if the dictionary is empty). Tokens that are
    already present keep their id, so calling this twice is harmless.
    Note that the input dictionary is modified in place.
    '''
    codes = ['<PAD>','<EOS>','<UNK>','<GO>']

    # Derive the next free id from the argument itself rather than from any
    # module-level variable.
    next_id = max(vocab_identifiers.values()) + 1 if vocab_identifiers else 0
    for code in codes:
        if code not in vocab_identifiers:
            vocab_identifiers[code] = next_id
            next_id += 1

    # Rename vocab_identifiers now this has tokens. This is just for interpretability reasons.
    identifiers_with_tokens = vocab_identifiers
    return identifiers_with_tokens

In [61]:
def create_int_to_word_dict(identifiers_with_tokens):
    '''
    creates a dictionary mapping integers to words as opposed to words mapping to
    integers.
    INPUT
        identifiers_with_tokens: dictionary. maps words (and tokens) to integers.
    OUTPUT
        int_to_word_dict: dictionary. maps each integer back to its word. If
            several words share an integer, only one of them is kept.
    '''
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    int_to_word_dict = {idx : word for word, idx in identifiers_with_tokens.items()}
    return int_to_word_dict

In [75]:
def add_eos_token(lines):
    '''
    Marks the end of every sentence with the end-of-sentence token.
    lines is a list of strings (typically questions_filtered or
    answers_filtered, but any list of strings works). A new list is returned
    in which ' <EOS>' has been appended to each string.
    '''
    eos_suffix = ' <EOS>'
    lines_with_eos = []
    for sentence in lines:
        lines_with_eos.append(sentence + eos_suffix)
    return lines_with_eos

In [79]:
def convert_lines_to_ints(lines, word_identifiers):
    '''
    converts list of lines to a list of list of integers where each integer represents
    a word. The integer mapping is taken from word_identifiers. If a word isn't in
    word_identifiers, it is replaced with the string '<UNK>' (not with an integer id),
    which is what get_percent_unk counts as unknown.
    INPUT
        lines: list of strings. Each string is a movie line.
        word_identifiers: dictionary. maps words to unique identifiers.
    OUTPUT
        int_lines: list of lists. Each inner list holds one entry per word:
            the word's integer id, or the string '<UNK>'.
    '''
    int_lines = []
    for line in lines:
        # Build a real list: on Python 3 map() would return a one-shot iterator,
        # which has no len() and can only be traversed once.
        words_as_ints = [word_identifiers.get(word, '<UNK>') for word in line.split()]
        int_lines.append(words_as_ints)

    return int_lines

In [99]:
def get_percent_unk(int_lines):
    '''
    tells us the percent of total words in int_lines that are the '<UNK>' token.
    Prints two lines: the number of distinct entries (every '<UNK>' counts as one
    entry) and the percentage of all entries that are not integers.
    INPUT
        int_lines: list of list of integers and <UNK> tokens. Output of convert_lines_to_ints
        function.
    OUTPUT
        None. The figures are printed, not returned.
    '''
    words_flat = [word for sentence in int_lines for word in sentence]
    word_set = set(words_flat)
    with_unk = len(words_flat)
    without_unk = len([x for x in words_flat if isinstance(x, int)])
    if with_unk:
        percent_unk = (with_unk - float(without_unk)) / with_unk
    else:
        # No words at all: report 0% instead of dividing by zero.
        percent_unk = 0.0

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("{} unique words".format(len(word_set)))
    print("Percent of total words spoken that are unkown: {}%".format(round(100*percent_unk, 2)))

In [115]:
def sort_by_question_length(int_form_questions, int_form_answers):
    '''
    Sorts question and answer pairs by the lengths of the questions (shortest
    first). Pairs whose questions have the same length keep their original
    relative order.
    INPUT
        int_form_questions: list of lists of integers and <UNK> tokens, the questions.
        int_form_answers: list of lists of integers and <UNK> tokens, the answers.
            Must have at least as many entries as int_form_questions.
    OUTPUT
        int_questions_sorted: the questions, reordered by length.
        int_answers_sorted: the answers, reordered so that each one still lines
            up with its question.
    The inner lists are returned untouched. They are deliberately not routed
    through a numpy array: converting equal-length rows that mix integers and
    the '<UNK>' string would coerce every entry to a string.
    '''
    # sorted() is stable, so ties keep their input order.
    order = sorted(
        range(len(int_form_questions)),
        key=lambda position: len(int_form_questions[position])
    )
    int_questions_sorted = [int_form_questions[position] for position in order]
    int_answers_sorted = [int_form_answers[position] for position in order]
    return int_questions_sorted, int_answers_sorted

## Section 1.2: Modeling Functions

In [5]:
def create_model_inputs():
    '''
    Builds the four tensorflow placeholders the model is fed through.
    OUTPUT
        input_data: int32 placeholder named 'input'.
        targets: int32 placeholder named 'targets'.
        learning_rate: float32 placeholder named 'learning_rate'.
        keep_prob: float32 placeholder named 'keep_prob'.
    All four are created with shape=None, i.e. without a declared shape.
    '''
    def _unshaped_placeholder(dtype, name):
        return tf.placeholder(dtype, shape=None, name=name)

    input_data = _unshaped_placeholder(tf.int32, 'input')
    targets = _unshaped_placeholder(tf.int32, 'targets')
    learning_rate = _unshaped_placeholder(tf.float32, 'learning_rate')
    keep_prob = _unshaped_placeholder(tf.float32, 'keep_prob')

    return input_data, targets, learning_rate, keep_prob

In [None]:
def process_encoding_input(targets, word_identifiers, batch_size):
    '''
    Shifts the target sequences one step to the right: the last word id of
    every row is dropped and the id of '<GO>' is placed in front of every row.
    INPUT
        targets: tensorflow placeholder. Target variable; sliced along two
            dimensions, so it is treated as (batch, time).
        word_identifiers: dictionary. Maps each word to a unique integer; must
            contain the key '<GO>'.
        batch_size: integer. Number of samples to run through model at a time.
    OUTPUT
        formatted_input: tensorflow tensor. Rows that start with the word id
            (int) for <GO> followed by the target row minus its final entry.
    '''
    # Rows 0..batch_size, every column except the last one.
    targets_without_last = tf.strided_slice(
        targets, [0, 0], [batch_size, -1], [1, 1]
    )
    go_column = tf.fill([batch_size, 1], word_identifiers['<GO>'])
    formatted_input = tf.concat([go_column, targets_without_last], axis=1)
    return formatted_input

In [None]:
def create_encoding_layer(
    rnn_inputs, 
    lstm_unit_count, 
    num_layers, 
    drop_prob, 
    sequence_length
):
    '''
    Creates multilayer bidirectional Recurrent Neural Net encoding.
    INPUT
        rnn_inputs: tensorflow tensor. The inputs fed to the RNN.
        lstm_unit_count: integer. The number of lstm units.
        num_layers: integer. Number of layers.
        drop_prob: float between 0 and 1. Probability of dropping a hidden unit in training.
            Each cell's inputs are kept with probability 1 - drop_prob.
        sequence_length: vector of ints where each integer represents the length 
            of that sequence.
    OUTPUT
        states: tuple (forward final state, backward final state) as returned
            by tf.nn.bidirectional_dynamic_rnn.
    '''
    def _build_stacked_cell():
        # One fresh LSTM cell (wrapped in dropout) per layer, so that layers and
        # directions never share a cell object.
        layers = []
        for _ in range(num_layers):
            lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=lstm_unit_count)
            # dropout regularization
            layers.append(
                tf.contrib.rnn.DropoutWrapper(lstm_cell, input_keep_prob=(1 - drop_prob))
            )
        return tf.contrib.rnn.MultiRNNCell(layers)

    # Don't care about outputs because we are feeding this into a decoding layer
    _, states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=_build_stacked_cell(),
        cell_bw=_build_stacked_cell(),
        sequence_length=sequence_length,
        inputs=rnn_inputs,
        dtype=tf.float32
    )
    return states

In [None]:
def train_decoding_layer(
    encoder_state, 
    decoding_cell, 
    dec_embed_input, 
    sequence_length, 
    decoding_scope,
    output_fn, 
    drop_prob, 
    batch_size
):
    '''
    Builds the training-time attention decoder and returns its output after
    dropout and the output layer.
    INPUT
        encoder_state: indexable. Only encoder_state[0] is used, as the state
            handed to the training decoder function. Presumably the output of
            create_encoding_layer -- confirm against the caller.
        decoding_cell: RNN cell run by the decoder. Its output_size also sets
            the width of the attention tensors.
        dec_embed_input: decoder inputs passed to dynamic_rnn_decoder
            (presumably the embedded target sequences -- confirm).
        sequence_length: passed through to dynamic_rnn_decoder.
        decoding_scope: tensorflow variable scope passed to dynamic_rnn_decoder.
        output_fn: callable applied to the decoder output after dropout.
        drop_prob: float between 0 and 1. Dropout probability; the decoder
            output is kept with probability 1 - drop_prob.
        batch_size: integer. Used as the first dimension of the attention states.
    OUTPUT
        decoding_layer_output: result of output_fn on the dropped-out decoder output.
    '''
    seq2seq = tf.contrib.seq2seq
    attention_width = decoding_cell.output_size

    # NOTE(review): attention is prepared over an all-zero tensor rather than
    # over encoder outputs -- confirm this is intended.
    attention_states = tf.zeros([batch_size, 1, attention_width])
    att_keys, att_vals, att_score_fn, att_construct_fn = seq2seq.prepare_attention(
        attention_states,
        attention_option="bahdanau",
        num_units=attention_width
    )

    train_decoder_fn = seq2seq.attention_decoder_fn_train(
        encoder_state[0],
        att_keys,
        att_vals,
        att_score_fn,
        att_construct_fn,
        name="attn_dec_train"
    )
    train_pred, _, _ = seq2seq.dynamic_rnn_decoder(
        decoding_cell,
        train_decoder_fn,
        dec_embed_input,
        sequence_length,
        scope=decoding_scope
    )

    keep_probability = 1 - drop_prob
    train_pred_drop = tf.nn.dropout(train_pred, keep_probability)
    return output_fn(train_pred_drop)

In [None]:
def decoding_layer_infer(
    encoder_state, 
    dec_cell, 
    dec_embeddings, 
    seq_start_id, 
    seq_end_id,
    max_length,
    vocab_size, 
    decoding_scope, 
    output_fn,  
    batch_size):
    '''
    Builds the inference-time attention decoder and returns its logits.
    INPUT
        encoder_state: indexable. Only encoder_state[0] is used, as the state
            handed to the inference decoder function.
        dec_cell: RNN cell run by the decoder. Its output_size also sets the
            width of the attention tensors.
        dec_embeddings: embeddings handed to attention_decoder_fn_inference.
        seq_start_id: id handed over as the start-of-sequence id.
        seq_end_id: id handed over as the end-of-sequence id.
        max_length: handed over as the maximum decoding length.
        vocab_size: handed over as the number of decoder symbols.
        decoding_scope: tensorflow variable scope passed to dynamic_rnn_decoder.
        output_fn: output function handed to attention_decoder_fn_inference.
        batch_size: integer. Used as the first dimension of the attention states.
    OUTPUT
        infer_logits: first value returned by dynamic_rnn_decoder.
    '''
    seq2seq = tf.contrib.seq2seq
    attention_width = dec_cell.output_size

    # NOTE(review): attention is prepared over an all-zero tensor rather than
    # over encoder outputs -- confirm this is intended.
    attention_states = tf.zeros([batch_size, 1, attention_width])
    att_keys, att_vals, att_score_fn, att_construct_fn = seq2seq.prepare_attention(
        attention_states,
        attention_option="bahdanau",
        num_units=attention_width
    )

    # Positional order follows the attention_decoder_fn_inference signature.
    infer_decoder_fn = seq2seq.attention_decoder_fn_inference(
        output_fn,
        encoder_state[0],
        att_keys,
        att_vals,
        att_score_fn,
        att_construct_fn,
        dec_embeddings,
        seq_start_id,
        seq_end_id,
        max_length,
        vocab_size,
        name="attn_dec_inf"
    )
    infer_logits, _, _ = seq2seq.dynamic_rnn_decoder(
        dec_cell,
        infer_decoder_fn,
        scope=decoding_scope
    )

    return infer_logits

## Section 2.1: Data Preprocessing

In [None]:
# Read the raw movie-line and conversation files from data/dialogues/.
lines, conv_lines = load_in_lines()

In [10]:
# Map each line id to the text of that line.
line_mapping = create_line_mapping(lines)

In [11]:
# Parse each conversation row into its list of line ids.
convs = format_conv_lines(conv_lines)

In [12]:
# Pair every line with the line that follows it in the same conversation.
questions, answers = create_questions_and_answers(convs, line_mapping)

In [13]:
# Lower-case, expand contractions and strip punctuation (see clean_text).
clean_questions, clean_answers = create_clean_questions_and_answers(questions, answers)

In [27]:
# Keep only pairs where both sentences have more than 2 and fewer than 20 words
# (the function's default thresholds).
questions_filtered, answers_filtered = \
    filter_long_and_short_sentences(clean_questions, clean_answers)

In [34]:
# Count word occurrences. NOTE(review): only the answers are counted here, not
# the questions -- confirm that is intended.
vocab_freq = create_word_frequency_dictionaries(answers_filtered)

In [44]:
# Drop words that occur fewer than 10 times.
filtered_vocab = filter_uncommon_words(vocab_freq, 10)

In [49]:
# Give each remaining word a unique integer id.
word_identifiers = create_unique_identifiers(filtered_vocab)

In [67]:
# Add the <PAD>, <EOS>, <UNK> and <GO> tokens. create_dictionary_tokens modifies
# and returns the dictionary it is given, so word_identifiers and
# identifiers_with_tokens refer to the same object after this cell.
identifiers_with_tokens = create_dictionary_tokens(word_identifiers)

In [68]:
# Reverse lookup: integer id -> word.
int_to_word_dict = create_int_to_word_dict(identifiers_with_tokens)

In [76]:
# Append ' <EOS>' to every answer (questions are left without it).
answers_filtered_eos = add_eos_token(answers_filtered)

In [80]:
# Encode every sentence as a sequence of word ids; words missing from the
# dictionary are encoded as the string '<UNK>'.
int_form_questions = convert_lines_to_ints(questions_filtered, word_identifiers)
int_form_answers = convert_lines_to_ints(answers_filtered_eos, identifiers_with_tokens)

In [100]:
# Report how many distinct entries the encoded questions contain and what share
# of their words was encoded as '<UNK>'.
get_percent_unk(int_form_questions)

4227 unique words
Percent of total words spoken that are unkown: 6.9%


In [116]:
# Reorder the question/answer pairs by question length.
int_questions_sorted, int_answers_sorted = \
    sort_by_question_length(int_form_questions, int_form_answers)

## Section 2.2: Modeling

In [7]:
# Set the Hyperparameters
# NOTE(review): how most of these are consumed is not visible in this notebook;
# the remarks below are limited to what the functions defined above show.
epochs = 100
batch_size = 128  # matches the batch_size parameter of the input/decoding functions
rnn_size = 512  # presumably the lstm_unit_count for create_encoding_layer -- confirm
num_layers = 2  # matches the num_layers parameter of create_encoding_layer
encoding_embedding_size = 512
decoding_embedding_size = 512
learning_rate = 0.005
learning_rate_decay = 0.9
min_learning_rate = 0.0001
drop_prob = 0.25  # drop probability; the layer functions use 1 - drop_prob as keep probability

In [8]:
# Bind the learning-rate placeholder to `lr` so it does not overwrite the
# `learning_rate` hyperparameter (a float) set in the hyperparameter cell.
input_data, targets, lr, keep_prob = create_model_inputs()

In [None]:
# process_encoding_input needs the targets placeholder, the word -> id
# dictionary (which must contain '<GO>') and the batch size.
formatted_input = process_encoding_input(targets, word_identifiers, batch_size)

In [9]:
# TODO: create_encoding_layer requires rnn_inputs, lstm_unit_count, num_layers,
# drop_prob and sequence_length, but this call passes none of them, and no
# rnn_inputs or sequence_length value is defined anywhere above.
states = create_encoding_layer()

NameError: name 'create_encoding_layer' is not defined