In [33]:
import numpy as np
import re
import tensorflow as tf
import time

DATA PREPROCESSING

In [7]:
# Read both corpus files completely and split them into one record per line.
# Undecodable bytes are dropped (errors = 'ignore'). The context managers close
# the file handles as soon as the text has been read.
with open('movie_lines.txt', encoding = 'utf-8', errors = 'ignore') as lines_file:
    lines = lines_file.read().split('\n')
with open('movie_conversations.txt', encoding = 'utf-8', errors = 'ignore') as conversations_file:
    conversations = conversations_file.read().split('\n')

In [8]:
# Map field 0 of every record to its field 4. Records that do not split into
# exactly five ' +++$+++ '-separated fields are ignored; if a key occurs more
# than once, the last record wins.
split_records = (record.split(' +++$+++ ') for record in lines)
id2line = {fields[0]: fields[4] for fields in split_records if len(fields) == 5}


In [9]:
# Build the list of conversations, each one a list of line ids.
# For every record except the last element of `conversations`: take the final
# ' +++$+++ ' field, drop its first and last character, remove quotes and
# spaces, and split what is left on commas.
conversations_ids = []
for record in conversations[:-1]:
    id_field = record.split(' +++$+++ ')[-1]
    stripped = id_field[1:-1]
    for unwanted in ("'", " "):
        stripped = stripped.replace(unwanted, "")
    conversations_ids.append(stripped.split(','))

In [10]:
# Pair every line of a conversation with the line that follows it: the first
# becomes a question, the second its answer. Raises KeyError if an id is not
# present in id2line.
questions = []
answers = []

for line_ids in conversations_ids:
    for asked_id, replied_id in zip(line_ids, line_ids[1:]):
        questions.append(id2line[asked_id])
        answers.append(id2line[replied_id])


In [11]:
# Text cleaning
def clean_text(text):
    """Lower-case ``text``, expand common contractions and strip punctuation.

    Args:
        text (str): Raw utterance.

    Returns:
        str: The lower-cased text with the contractions listed below expanded
        and every one of the characters ``- ( ) " # / @ ; : < > = + { } [ ] . ? ,``
        removed. Whitespace is not normalised, so expansions that insert a
        leading space may leave consecutive spaces.
    """
    # Applied in this order: specific contractions first, then the generic
    # apostrophe suffixes, then the two irregular negations.
    substitutions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"it's", " it is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
    )

    text = text.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Both square brackets are escaped so they are members of the character
    # class. Unescaped, the class ended at the first ']' and the pattern only
    # matched a punctuation mark that was directly followed by a literal ']'.
    text = re.sub(r"[-()\"#/@;:<>=+{}\[\].?,]", "", text)

    return text

In [13]:
# Normalise every question with clean_text, keeping the original order.
clean_questions = [clean_text(raw_question) for raw_question in questions]

In [15]:

# Normalise every answer with clean_text, keeping the original order.
clean_answers = list(map(clean_text, answers))


In [16]:
# Start the word-frequency table from the whitespace-separated words of the
# cleaned questions.
word2count = {}
for sentence in clean_questions:
    for term in sentence.split():
        word2count[term] = word2count.get(term, 0) + 1


In [17]:
# Add the words of the cleaned answers to the existing frequency table.
# This updates word2count in place, so running the cell again adds the answer
# counts a second time.
for sentence in clean_answers:
    for term in sentence.split():
        try:
            word2count[term] += 1
        except KeyError:
            word2count[term] = 1


In [18]:
# Give every word that occurs more than `threshold` times a consecutive integer
# id, starting at 0, in the iteration order of word2count.
threshold = 20
questionsword2int = {
    word: index
    for index, word in enumerate(
        word for word, count in word2count.items() if count > threshold
    )
}
# Number of ids handed out (the next free id).
word_number = len(questionsword2int)


In [19]:
# Same id assignment for the answer vocabulary: words seen more than
# `threshold` times get ids 0, 1, 2, ... in the iteration order of word2count.
answersword2int = {}
for word, count in word2count.items():
    if count <= threshold:
        continue
    # The dict grows by one entry per kept word, so its size is the next id.
    answersword2int[word] = len(answersword2int)
# Number of ids handed out (the next free id).
word_number = len(answersword2int)


In [20]:
# Special tokens appended to the question vocabulary. Each token receives
# len(questionsword2int) + 1 at the moment it is inserted.
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
for token in tokens:
    # Skip tokens that are already registered: on a re-run the dict no longer
    # grows, so an unguarded assignment would give all four tokens the same id.
    if token not in questionsword2int:
        questionsword2int[token] = len(questionsword2int) + 1

In [21]:
# Append the special tokens to the answer vocabulary. Each token receives
# len(answersword2int) + 1 at the moment it is inserted.
for token in tokens:
    # Skip tokens that are already registered: on a re-run the dict no longer
    # grows, so an unguarded assignment would give all four tokens the same id.
    if token not in answersword2int:
        answersword2int[token] = len(answersword2int) + 1

In [22]:
# Inverse lookup of answersword2int: integer id -> word.
answersint2word = dict(zip(answersword2int.values(), answersword2int.keys()))

In [23]:
# Terminate every cleaned answer with the end-of-sentence marker.
for i, answer in enumerate(clean_answers):
    # Only append when the marker is missing, so re-running this cell does not
    # stack several ' <EOS>' suffixes on the same answer.
    if not answer.endswith(' <EOS>'):
        clean_answers[i] = answer + ' <EOS>'


In [29]:
# Encode every cleaned question as a list of integer ids; words missing from
# questionsword2int are replaced by the id of '<OUT>'.
questions_to_int = [
    [
        questionsword2int[word] if word in questionsword2int else questionsword2int['<OUT>']
        for word in sentence.split()
    ]
    for sentence in clean_questions
]


In [30]:
# Encode every cleaned answer as a list of integer ids; words missing from
# answersword2int are replaced by the id of '<OUT>'.
answers_to_int = []
for sentence in clean_answers:
    encoded = []
    for word in sentence.split():
        lookup_key = word if word in answersword2int else '<OUT>'
        encoded.append(answersword2int[lookup_key])
    answers_to_int.append(encoded)



In [31]:
# Order the encoded pairs by question length, shortest first. Only questions
# with 1 to 25 ids are kept; each answer follows its question regardless of the
# answer's own length. Pairs of equal question length keep their original order.
sorted_clean_questions = []
sorted_clean_answers = []

for target_length in range(1, 25 + 1):
    for index, encoded_question in enumerate(questions_to_int):
        if len(encoded_question) != target_length:
            continue
        sorted_clean_questions.append(encoded_question)
        sorted_clean_answers.append(answers_to_int[index])

MODEL DESIGN


In [34]:
def model_input():
    """Create the placeholders that are fed into the graph at run time.

    Returns:
        tuple: ``(inputs, targets, lr, keep_prob)``. ``inputs`` and ``targets``
        are int32 placeholders of shape ``[None, None]``. ``lr`` and
        ``keep_prob`` are float32 placeholders created without a shape.
    """
    inputs = tf.placeholder(tf.int32, [None, None], name = 'input')
    targets = tf.placeholder(tf.int32, [None, None], name = 'target')
    # The learning rate and the keep probability are fractional values, so an
    # integer placeholder cannot carry them; keep_prob is also passed on to the
    # dropout ops in this file.
    lr = tf.placeholder(tf.float32, name = 'learning_rate')
    keep_prob = tf.placeholder(tf.float32, name = 'keep_prob')

    return inputs, targets, lr, keep_prob

In [35]:
# Target preprocessing
def preprocess_targets(targets, word2int, batch_size):
    """Prepend a column of '<SOS>' ids to ``targets`` and drop its last column.

    Args:
        targets: Tensor of target ids; sliced along its first two axes.
        word2int (dict): Vocabulary mapping; must contain the key '<SOS>'.
        batch_size (int): Number of rows to fill and to slice.

    Returns:
        The concatenation, along axis 1, of a ``[batch_size, 1]`` tensor filled
        with ``word2int['<SOS>']`` and ``targets`` without its last column.
    """
    sos_column = tf.fill([batch_size, 1], word2int['<SOS>'])
    # End index -1 on the second axis stops before the last column.
    without_last_column = tf.strided_slice(targets, [0, 0], [batch_size, -1], [1, 1])

    return tf.concat([sos_column, without_last_column], 1)


In [36]:
# Encoder RNN layer
def encoder_rnn_layer(rnn_inputs, rnn_size, num_layers, keep_prob, sequence_length):
    """Run a bidirectional, multi-layer LSTM encoder and return its final state.

    Args:
        rnn_inputs: Inputs passed to ``tf.nn.bidirectional_dynamic_rnn``.
        rnn_size: Number of units given to ``BasicLSTMCell``.
        num_layers: How many times the dropout-wrapped cell is repeated in the
            ``MultiRNNCell``.
        keep_prob: Input keep probability of the dropout wrapper.
        sequence_length: Forwarded as ``sequence_length`` to the dynamic RNN.

    Returns:
        The second element of the pair returned by
        ``tf.nn.bidirectional_dynamic_rnn``.
    """
    dropout_cell = tf.contrib.rnn.DropoutWrapper(
        tf.contrib.rnn.BasicLSTMCell(rnn_size),
        input_keep_prob = keep_prob,
    )
    # The same cell object is repeated for every layer and used for both the
    # forward and the backward direction.
    stacked_cell = tf.contrib.rnn.MultiRNNCell([dropout_cell] * num_layers)
    _outputs, final_state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw = stacked_cell,
        cell_bw = stacked_cell,
        sequence_length = sequence_length,
        inputs = rnn_inputs,
        dtype = tf.float32,
    )

    return final_state

In [37]:
# Decoding of the training set

def decode_training_set(encoder_state, decoder_cell, decoder_embedded_input, sequence_length, decoding_scope, output_function, keep_prob, batch_size):
    """Decode the training inputs with a Bahdanau-attention decoder.

    Args:
        encoder_state: Encoder state; only ``encoder_state[0]`` is used.
        decoder_cell: RNN cell driving the decoder; its ``output_size`` sizes
            the attention states and the attention units.
        decoder_embedded_input: Inputs handed to ``dynamic_rnn_decoder``.
        sequence_length: Sequence lengths handed to ``dynamic_rnn_decoder``.
        decoding_scope: Variable scope passed as ``scope``.
        output_function: Callable applied to the dropped-out decoder output.
        keep_prob: Keep probability for ``tf.nn.dropout``.
        batch_size: First dimension of the zero-initialised attention states.

    Returns:
        ``output_function`` applied to the decoder output after dropout.
    """
    attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
    keys, values, score_fn, construct_fn = tf.contrib.seq2seq.prepare_attention(
        attention_states,
        attention_option = 'bahdanau',
        num_units = decoder_cell.output_size,
    )
    train_decoder_fn = tf.contrib.seq2seq.attention_decoder_fn_train(
        encoder_state[0],
        keys,
        values,
        score_fn,
        construct_fn,
        name = "attn_dec_train",
    )

    # Only the first of the three values returned by the decoder is needed.
    decoder_output, _final_state, _final_context = tf.contrib.seq2seq.dynamic_rnn_decoder(
        decoder_cell,
        train_decoder_fn,
        decoder_embedded_input,
        sequence_length,
        scope = decoding_scope,
    )

    return output_function(tf.nn.dropout(decoder_output, keep_prob))


In [38]:
  #decoding test/validation
def decode_test_set(encoder_state, decoder_cell, decoder_embedded_matrix, sos_id, eos_id, maximum_length, num_words, sequence_length, decoding_scope, output_function, keep_prob, batch_size):
    """Decode at test/validation time with a Bahdanau-attention inference decoder.

    Args:
        encoder_state: Encoder state; only ``encoder_state[0]`` is used.
        decoder_cell: RNN cell driving the decoder; its ``output_size`` sizes
            the attention states and the attention units.
        decoder_embedded_matrix: Embedding matrix passed to the inference
            decoder function.
        sos_id: Start-of-sequence id passed to the inference decoder function.
        eos_id: End-of-sequence id passed to the inference decoder function.
        maximum_length: Maximum length passed to the inference decoder function.
        num_words: Vocabulary size passed to the inference decoder function.
        sequence_length: Accepted but not used by this function.
        decoding_scope: Variable scope passed as ``scope``.
        output_function: Output function passed to the inference decoder function.
        keep_prob: Accepted but not used by this function.
        batch_size: First dimension of the zero-initialised attention states.

    Returns:
        The first of the three values returned by ``dynamic_rnn_decoder``.
    """
    attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
    keys, values, score_fn, construct_fn = tf.contrib.seq2seq.prepare_attention(
        attention_states,
        attention_option = 'bahdanau',
        num_units = decoder_cell.output_size,
    )
    inference_decoder_fn = tf.contrib.seq2seq.attention_decoder_fn_inference(
        output_function,
        encoder_state[0],
        keys,
        values,
        score_fn,
        construct_fn,
        decoder_embedded_matrix,
        sos_id,
        eos_id,
        maximum_length,
        num_words,
        name = "attn_dec_inf",
    )

    # Unlike decode_training_set, the predictions are returned without dropout.
    predictions, _final_state, _final_context = tf.contrib.seq2seq.dynamic_rnn_decoder(
        decoder_cell,
        inference_decoder_fn,
        scope = decoding_scope,
    )

    return predictions


In [40]:
# Decoder RNN
def decoder_rnn(decoder_embedded_input, decoder_embedded_matrix, encoder_state, num_words, sequence_length, rnn_size, num_layers, word2int, keep_prob, batch_size):
    """Build the stacked, dropout-wrapped LSTM decoder cell in the "decoding" scope.

    Only ``rnn_size``, ``num_layers`` and ``keep_prob`` are read by the code
    below; the remaining parameters are accepted but not used yet.

    NOTE(review): this looks like work in progress. The function returns from
    inside the variable scope and the placeholder string below says more
    return values are planned; confirm the intended final return value.

    Returns:
        tuple: ``(decoder_cell, lstm, lstm_dropout)``: the ``MultiRNNCell``, the
        bare ``BasicLSTMCell`` and its ``DropoutWrapper``.
    """

    with tf.variable_scope("decoding") as decoding_scope:
        lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
        lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
        # The same dropout-wrapped cell object is repeated num_layers times.
        decoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)
        # The bare string below is a no-op expression statement left as a reminder.
        """ more return values will be added """
        return decoder_cell, lstm, lstm_dropout