In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import time
import collections
from Seq2Seq import seq2seq_model
tf.__version__

'1.0.1'

In [2]:
# Load the data
lines = open('movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')
conv_lines = open('movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

In [3]:
# Create a dictionary to map each line's id with its text
id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        id2line[_line[0]] = _line[4]
        
# Create a list of all of the conversations' lines' ids.
convs = [ ]
for line in conv_lines[: -1]:
    _line = line.split(' +++$+++ ')[-1][1: -1].replace("'","").replace(" ","")
    convs.append(_line.split(','))
    
# Sort the sentences into questions (inputs) and answers (targets)
questions = []
answers = []

for conv in convs:
    for i in range(len(conv)-1):
        questions.append(id2line[conv[i]])
        answers.append(id2line[conv[i + 1]])

In [4]:
def clean_text(text):
    # Clean text by removing unnecessary characters and altering the format of words.'''

    text = text.lower()
    
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    
    return text

clean_questions = [clean_text(question) for question in questions]
clean_answers = [clean_text(answer) for answer in answers]

In [5]:
# Remove questions and answers that are shorter than 2 words and longer than 20 words.
min_line_length = 2
max_line_length = 20

# Filter out the questions that are too short/long
short_questions_temp = []
short_answers_temp = []

i = 0
for question in clean_questions:
    if len(question.split()) >= min_line_length and len(question.split()) <= max_line_length:
        short_questions_temp.append(question)
        short_answers_temp.append(clean_answers[i])
    i += 1

# Filter out the answers that are too short/long
short_questions = []
short_answers = []

i = 0
for answer in short_answers_temp:
    if len(answer.split()) >= min_line_length and len(answer.split()) <= max_line_length:
        short_answers.append(answer)
        short_questions.append(short_questions_temp[i])
    i += 1

In [6]:
# Compare the number of lines we will use with the total number of lines.
print("# of questions:", len(short_questions))
print("# of answers:", len(short_answers))
print("% of data used: {}%".format(round(len(short_questions)/len(questions),4)*100))

# of questions: 138335
# of answers: 138335
% of data used: 62.419999999999995%


In [7]:
# Create a dictionary for the frequency of the vocabulary
def count_freq(vocab, sentences):
    for sentence in sentences:
        for word in sentence.split():
            vocab[word] += 1
    return

vocab = collections.defaultdict(int)
count_freq(vocab, short_questions)
count_freq(vocab, short_answers)

In [8]:
# Remove rare words from the vocabulary.
# We will aim to replace fewer than 5% of words with <UNK>
# You will see this ratio soon.
threshold = 10
count = 0
for k,v in vocab.items():
    if v >= threshold:
        count += 1

print("Size of total vocab:", len(vocab))
print("Size of vocab we will use:", count)

Size of total vocab: 45618
Size of vocab we will use: 8092


In [9]:
# In case we want to use a different vocabulary sizes for the source and target text, 
# we can set different threshold values.
# Nonetheless, we will create dictionaries to provide a unique integer for each word.
questions_vocab_to_int = {}
answers_vocab_to_int = {}

def normal_tokenize(threshold, vocab, vocab_to_int):
    start_number = 0
    for word, count in vocab.items():
        if count >= threshold:
            vocab_to_int[word] = start_number
            start_number += 1
            
def add_special_tokens(vocab_to_int, start_number):
    vocab_to_int['<PAD>'] = start_number
    vocab_to_int['<EOS>'] = start_number + 1
    vocab_to_int['<UNK>'] = start_number + 2
    vocab_to_int['<GO>'] = start_number + 3

normal_tokenize(threshold, vocab, questions_vocab_to_int)
normal_tokenize(threshold, vocab, answers_vocab_to_int)

add_special_tokens(questions_vocab_to_int, len(questions_vocab_to_int))
add_special_tokens(answers_vocab_to_int, len(answers_vocab_to_int))

In [73]:
# Create dictionaries to map the unique integers to their respective words.
# i.e. an inverse dictionary for vocab_to_int.
questions_int_to_vocab = {v_i: v for v, v_i in questions_vocab_to_int.items()}
answers_int_to_vocab = {v_i: v for v, v_i in answers_vocab_to_int.items()}

# Add the end of sentence token to the end of every answer.
for i in range(len(short_answers)):
    short_answers[i] += ' <EOS>'

print (short_answers[111])

no i ' m not <EOS> <EOS>


In [11]:
# Convert the text to integers. Replace any words that are not in the respective vocabulary with <UNK> 
questions_int = []
for question in short_questions:
    ints = []
    for word in question.split():
        if word not in questions_vocab_to_int:
            ints.append(questions_vocab_to_int['<UNK>'])
        else:
            ints.append(questions_vocab_to_int[word])
    questions_int.append(ints)
    
answers_int = []
for answer in short_answers:
    ints = []
    for word in answer.split():
        if word not in answers_vocab_to_int:
            ints.append(answers_vocab_to_int['<UNK>'])
        else:
            ints.append(answers_vocab_to_int[word])
    answers_int.append(ints)

In [12]:
# Calculate what percentage of all words have been replaced with <UNK>
def count_unk(sentence_int, unk_int):
    word_count = 0
    unk_count = 0
    for sentence in sentence_int:
        for w in sentence:
            if w == unk_int:
                unk_count += 1
            word_count += 1
    return word_count, unk_count

question_word_count, question_unk_count = count_unk(questions_int, questions_vocab_to_int['<UNK>'])
answer_word_count, answer_unk_count = count_unk(answers_int, answers_vocab_to_int['<UNK>'])
word_count = question_word_count + answer_word_count
unk_count = question_unk_count + answer_unk_count

unk_ratio = unk_count / word_count

print('total number of words: ', word_count)
print('Number of times <UNK> is used: ', unk_count)
print('ratio of words that are <UNK>: ', '{}'.format(unk_ratio))

total number of words:  2334533
Number of times <UNK> is used:  92436
ratio of words that are <UNK>:  0.03959507104847094


In [13]:
# Sort questions and answers by the length of questions
# (This will reduce the amount of padding during training, which should speed up training and help to reduce the loss
sorted_questions = []
sorted_answers = []

for length in range(1, max_line_length + 1):
    for i in enumerate(questions_int):
        if len(i[1]) == length:
            sorted_questions.append(questions_int[i[0]])
            sorted_answers.append(answers_int[i[0]])

# Train_test split
train_test_split = int(len(sorted_questions) * 0.15)

train_questions = sorted_questions[train_test_split:]
train_answers = sorted_answers[train_test_split:]

test_questions = sorted_questions[:train_test_split]
test_answers = sorted_answers[:train_test_split]

print(len(train_questions))
print(len(test_questions))

117585
20750


In [14]:
def model_inputs():
    # Create palceholders for inputs to the model
    input_data = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    return input_data, targets, lr, keep_prob

def pad_sentence_batch(sentence_batch, vocab_to_int):
    # Pad sentences with <PAD> 
    max_sentence_len = 0
    for sentence in sentence_batch:
        max_sentence_len = max(max_sentence_len, len(sentence))
    return [sentence + ([vocab_to_int['<PAD>']] * (max_sentence_len - len(sentence))) for sentence in sentence_batch]

def batch_data(questions, answers, batch_size):
    # Iterator to batch questions and answers
    for batch_i in range(0, len(questions) // batch_size):
        start_i = batch_i * batch_size
        questions_batch = questions[start_i : start_i + batch_size]
        answers_batch = answers[start_i : start_i + batch_size]
        pad_questions_batch = np.array(pad_sentence_batch(questions_batch, questions_vocab_to_int))
        pad_answers_batch = np.array(pad_sentence_batch(answers_batch, answers_vocab_to_int))
        yield pad_questions_batch, pad_answers_batch

In [15]:
# Set the Hyperparameters
epochs = 50
batch_size = 64
rnn_size = 256
num_layers = 3
encoding_embedding_size = 256
decoding_embedding_size = 256
learning_rate = 0.1
learning_rate_decay = 0.9
min_learning_rate = 0.01
keep_probability = 0.9

In [16]:
# Reset the graph to ensure that it is ready for training
tf.reset_default_graph()
sess = tf.InteractiveSession()
    
# Load the model inputs    
input_data, targets, lr, keep_prob = model_inputs()
# Sequence length will be the max line length for each batch
sequence_length = tf.placeholder_with_default(max_line_length, None, name = 'sequence_length')
# Find the shape of the input data for sequence_loss
input_shape = tf.shape(input_data)

# Create the training and inference logits
train_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]), targets, keep_prob, batch_size, sequence_length, 
                                               len(answers_vocab_to_int), len(questions_vocab_to_int), encoding_embedding_size, 
                                               decoding_embedding_size, rnn_size, num_layers, questions_vocab_to_int)

# Create a tensor for the inference logits, needed if loading a checkpoint version of the model
tf.identity(inference_logits, 'logits')

with tf.name_scope("optimization"):
    seq_loss = tf.contrib.seq2seq.sequence_loss(train_logits, targets, tf.ones([input_shape[0], sequence_length]))
    optimizer = tf.train.AdamOptimizer(learning_rate)
    gradients = optimizer.compute_gradients(seq_loss)
    # Gradient Clipping
    capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

In [18]:
# If you're loading a pre-train model (ref_logits), DO NOT RUN THE BLCOK BELOW!!! 
# Run this block and you'll load the ref_logits 
#checkpoint = "./ckpts/best_model.ckpt" 
#saver = tf.train.Saver() 
#saver.restore(sess, checkpoint)

In [17]:
# DO NOT RUN THIS BLOCK IF YOU'RE LOADING A MODEL!! See the instruction in the previous block

display_step = 300 # Check training loss after every 300 batches
validation_check = ((len(train_questions)) // batch_size // 2) - 1
total_train_loss = 0 # Record the training loss for each display step
summary_valid_loss = [] # Record the validation loss for saving improvements in the model
itr = 1 # Cur iteration

checkpoint = "./ckpts/best_model.ckpt" 
sess.run(tf.global_variables_initializer())

for epoch_i in range(1, epochs + 1):
    for batch_i, (questions_batch, answers_batch) in enumerate(batch_data(train_questions, train_answers, batch_size)):
        start_time = time.time()
        _, loss = sess.run([train_op, seq_loss],{input_data: questions_batch, targets: answers_batch, lr: learning_rate,
                                                 sequence_length: answers_batch.shape[1], keep_prob: keep_probability})

        total_train_loss += loss
        end_time = time.time()
        batch_time = end_time - start_time

        if batch_i % display_step == 0:
            print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'.format(epoch_i, epochs, batch_i, 
                          len(train_questions) // batch_size, total_train_loss / display_step, batch_time*display_step))
            total_train_loss = 0

        if batch_i % validation_check == 0 and batch_i > 0:
            total_valid_loss = 0
            start_time = time.time()
            for batch_ii, (questions_batch, answers_batch) in enumerate(batch_data(test_questions, test_answers, batch_size)):
                valid_loss = sess.run(seq_loss, {input_data: questions_batch, targets: answers_batch, lr: learning_rate,
                                                 sequence_length: answers_batch.shape[1], keep_prob: 1})
                total_valid_loss += valid_loss
            end_time = time.time()
            batch_time = end_time - start_time
            avg_valid_loss = total_valid_loss / (len(test_questions) / batch_size)
            print('Valid Loss: {:>6.3f}, Seconds: {:>5.2f}'.format(avg_valid_loss, batch_time))
            
            # Reduce learning rate, but not below its minimum value
            learning_rate *= learning_rate_decay
            if learning_rate < min_learning_rate:
                learning_rate = min_learning_rate
            
            if itr > 1:
                summary_valid_loss.append(avg_valid_loss)
            if itr > 1 and avg_valid_loss <= min(summary_valid_loss): 
                print('New Record!') 
                #saver = tf.train.Saver() 
                #saver.save(sess, checkpoint)
            
            elif itr == 1:
                print ("Not Fully Initiated")
            
            else:
                print("No Improvement.")
            
            saver = tf.train.Saver() 
            saver.save(sess, checkpoint)
            itr += 1

Epoch   1/50 Batch    0/1837 - Loss:  0.030, Seconds: 323.34
Epoch   1/50 Batch  300/1837 - Loss: 64.806, Seconds: 36.35
Epoch   1/50 Batch  600/1837 - Loss: 91.210, Seconds: 37.59
Epoch   1/50 Batch  900/1837 - Loss: 93.497, Seconds: 39.42
Valid Loss: 46.235, Seconds: 11.92
Not Fully Initiated
Epoch   1/50 Batch 1200/1837 - Loss: 76.653, Seconds: 41.92
Epoch   1/50 Batch 1500/1837 - Loss: 93.026, Seconds: 43.87
Epoch   1/50 Batch 1800/1837 - Loss: 94.422, Seconds: 50.25
Valid Loss: 242.490, Seconds: 11.88
New Record!
Epoch   2/50 Batch    0/1837 - Loss: 30.001, Seconds: 33.71
Epoch   2/50 Batch  300/1837 - Loss: 103.049, Seconds: 37.46
Epoch   2/50 Batch  600/1837 - Loss: 78.963, Seconds: 37.98
Epoch   2/50 Batch  900/1837 - Loss: 80.814, Seconds: 39.45
Valid Loss: 112.021, Seconds: 11.87
New Record!
Epoch   2/50 Batch 1200/1837 - Loss: 78.490, Seconds: 41.30
Epoch   2/50 Batch 1500/1837 - Loss: 76.052, Seconds: 44.14
Epoch   2/50 Batch 1800/1837 - Loss: 79.187, Seconds: 49.42
Valid L

In [87]:
def question_to_seq(question, vocab_to_int):
    #Prepare the question for the model
    question = clean_text(question)
    return [vocab_to_int.get(word, vocab_to_int['<UNK>']) for word in question.split()]

# Create your own input question
input_question = 'I like you'

# Use a question from the data as your input
random = np.random.choice(len(short_questions))
input_question = short_questions[random]

# Prepare input question
input_question = input_question
input_question = question_to_seq(input_question, questions_vocab_to_int)
input_question.append(questions_vocab_to_int['<EOS>'])

# Pad the questions until it equals the max_line_length
input_question = input_question + [questions_vocab_to_int["<PAD>"]] * (max_line_length - len(input_question))
# Add empty questions so the the input_data is the correct shape
batch_shell = np.zeros((batch_size, max_line_length))
# Set the first question to be out input question
batch_shell[0] = input_question    
    
# Run the model with the input question
answer_logits = sess.run(inference_logits, {input_data: batch_shell, keep_prob: 1.0})[0]

print (batch_shell[0])
# Remove the padding from the Question and Answer
pad_q = questions_vocab_to_int["<PAD>"]
pad_a = answers_vocab_to_int["<PAD>"]
print('Question')
print('  Input Words: {}'.format([questions_int_to_vocab[i] for i in input_question if i != pad_q]))
print()
print('Answer')
print('  Response Words: {}'.format([answers_int_to_vocab[i] for i in np.argmax(answer_logits, 1) if i != pad_a]))

[  116.  5882.  6824.  1269.  6459.   944.  1925.  3701.  6712.  8093.
  8092.  8092.  8092.  8092.  8092.  8092.  8092.  8092.  8092.  8092.]
Question
  Input Words: ['that', 'was', 'so', 'good', 'i', 'am', 'gonna', 'have', 'another', '<EOS>']

Answer
  Response Words: ['you', 'is', 'a']
