### THIS MODEL CREATES A CHATBOT FOR MOVIE CONVERSATIONS

###### DOWNLOAD AND CLEAN THE DATA

In [29]:
import pandas as pd
import numpy as np
import csv
import nltk
import re
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\venka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#The logic behind this execution is that we will convert all the movie lines by thier id into a cosumable pandas df
#The second part is enriching the conversation with details

In [3]:
#Function to convert a file into 
def read_csv_into_ds(file_name):
    result = []
    with open(file_name,encoding="utf8") as csvfile:
        readCSV = csv.reader(csvfile, delimiter='\t')
        for line in readCSV:
            result.append(line)
    return result
    
movie_lines = 'dialog-data/movie_lines.tsv'
movie_conversations = 'dialog-data/movie_conversations.tsv'
movie_characters = 'dialog-data/movie_characters_metadata.tsv'
movie_titles = 'dialog-data/movie_titles_metadata.tsv'

lines_ds = (read_csv_into_ds(movie_lines))
lines_dict = {}
for line in lines_ds:
    if(len(line)==5):
        lines_dict[line[0]] = [line[2],line[3],line[4]]

conv_ds = (read_csv_into_ds(movie_conversations))
conv_df = pd.DataFrame(conv_ds)

chars_ds = (read_csv_into_ds(movie_characters))
chars_df = pd.DataFrame(chars_ds).iloc[:, : 6]

titles_ds = (read_csv_into_ds(movie_titles))
titles_dict = {}
for title in titles_ds:
    if len(title) > 1:
        titles_dict[title[0]] = [title[1]]
    
print(list(lines_dict.items())[:5])
print(conv_df.head(5))
print(chars_df.head(5))
print(list(titles_dict.items())[:5])
#Loading the movie lines data set

[('L1045', ['m0', 'BIANCA', 'They do not!']), ('L1044', ['m0', 'CAMERON', 'They do to!']), ('L985', ['m0', 'BIANCA', 'I hope so.']), ('L984', ['m0', 'CAMERON', 'She okay?']), ('L925', ['m0', 'BIANCA', "Let's go."])]
    0   1   2                              3
0  u0  u2  m0  ['L194' 'L195' 'L196' 'L197']
1  u0  u2  m0                ['L198' 'L199']
2  u0  u2  m0  ['L200' 'L201' 'L202' 'L203']
3  u0  u2  m0         ['L204' 'L205' 'L206']
4  u0  u2  m0                ['L207' 'L208']
    0         1   2                           3  4  5
0  u0    BIANCA  m0  10 things i hate about you  f  4
1  u1     BRUCE  m0  10 things i hate about you  ?  ?
2  u2   CAMERON  m0  10 things i hate about you  m  3
3  u3  CHASTITY  m0  10 things i hate about you  ?  ?
4  u4      JOEY  m0  10 things i hate about you  m  6
[('m0', ['10 things i hate about you']), ('m1', ['1492: conquest of paradise']), ('m2', ['15 minutes']), ('m3', ['2001: a space odyssey']), ('m4', ['48 hrs.'])]


In [4]:
#Enrich the Conversation data and pull in list of conversations
dialog_records = []
for record in conv_df.values:
    dialog_record = []        
    dialoges = [x[1:-1] for x in record[3][1:-1].split(' ')]
    for dialog in dialoges:
        if dialog in lines_dict.keys() and record[2] in titles_dict.keys():
            dia_details = lines_dict[dialog]
            dialog_record.append((titles_dict[record[2]][0], dia_details[1], dia_details[2]))
    if(len(dialog_record)!=0):        
        dialog_records.append(dialog_record)
print("Total Conversations:" ,len(dialog_records))
dialog_records[:1]

Total Conversations: 79808


[[('10 things i hate about you',
   'BIANCA',
   'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.'),
  ('10 things i hate about you',
   'CAMERON',
   "Well I thought we'd start with pronunciation if that's okay with you."),
  ('10 things i hate about you',
   'BIANCA',
   'Not the hacking and gagging and spitting part.  Please.'),
  ('10 things i hate about you',
   'CAMERON',
   "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?")]]

In [33]:
#Pick only the conversation data to make sequences

dialog_sequences = list(map(lambda x:[re.sub('[^A-Za-z0-9\s]+', '', y[2]).lower().replace(y[1].lower(), '<UNK>') for y in x] ,dialog_records))
dialog_sequences[:10]

[['can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again',
  'well i thought wed start with pronunciation if thats okay with you',
  'not the hacking and gagging and spitting part  please',
  'okay then how bout we try out some french cuisine  saturday  night'],
 ['youre asking me out  thats so cute whats your name again', 'forget it'],
 ['no no its my fault  we didnt have a proper introduction ',
  '<UNK>',
  'the thing is cameron  im at the mercy of a particularly hideous breed of loser  my sister  i cant date until she does',
  'seems like she could get a date easy enough'],
 ['why',
  'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something',
  'thats a shame'],
 ['gosh if only we could find kat a boyfriend', 'let me see what i can do'],
 ['cesc ma tete this is my head',
  'right  see  youre ready for the quiz',
  'i dont want to kno

##### CLEAN THE DATA AND TOKENIZE EVERY WORD IN THE DIAGLOUGES

In [35]:
word_counter = {}

for conv in dialog_sequences:
    for seq in conv:
        for word in seq.lower().split(' '):#word_tokenize(seq.lower()):
            if word in word_counter.keys():
                word_counter[word] = word_counter[word] + 1
            else:
                word_counter[word] = 1
print("Total Number of Words:",len(word_counter.keys()))
print("Top 20 Frequent Words")
#The first 3 are assigned to 1.START 2. END 3. PAD 
word_int = dict(list(zip(word_counter.keys(),range(4,len(word_counter.keys())+3))))
sorted(word_counter.items(), key=lambda x: x[1],reverse=True)[:20]

#TODO might have to remove the '' character sometimes

Total Number of Words: 62389
Top 20 Frequent Words


[('you', 119431),
 ('', 117300),
 ('i', 95026),
 ('the', 90589),
 ('to', 74725),
 ('a', 65163),
 ('it', 43736),
 ('and', 41320),
 ('of', 35982),
 ('that', 32368),
 ('in', 31250),
 ('what', 29757),
 ('me', 29550),
 ('is', 26814),
 ('dont', 22898),
 ('this', 22556),
 ('for', 21568),
 ('do', 21266),
 ('im', 20949),
 ('know', 20104)]

In [36]:
#TODO Figure out why there are NULL values
def check_word(word):
    if word in word_int.keys():
        return word_int[word]
    
words_in_seq = list(map(lambda x: list(map(lambda y: [check_word(word) for word in y.lower().split(' ')], x)),dialog_sequences))
words_in_seq[:1]

[[[4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   9,
   26],
  [27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 32, 37],
  [38, 24, 39, 12, 40, 12, 41, 42, 9, 43],
  [36, 44, 45, 46, 5, 47, 48, 49, 50, 51, 9, 52, 9, 53]]]

###### PREPARE TRAINING AND TESTING SETS

In [37]:
# convert 3d data into 2d
encoder_inputs = []
decoder_inputs = []
decoder_outputs = []

for conversation in words_in_seq:
    for id in range(0,len(conversation)-2):
        # Adding START for decoder inputs and Appending END for decoder outputs
        # Pad every sentence to 5 or 10 words
        seq_len = 10
        def pad(input_list, pad_len):
            return input_list + [3 for x in range(0,pad_len-len(input_list))]
        e_input = list(filter(None.__ne__, conversation[id]))
        e_input = pad(e_input,seq_len)
        d_input = list(filter(None.__ne__, conversation[id+1]))
        d_input = [1] + pad(d_input,seq_len-1)
        d_output = list(filter(None.__ne__, conversation[id+1]))#.append(2)
        d_output = pad(d_output,seq_len-1)[:seq_len-1] + [2]
        encoder_inputs.append(e_input[:seq_len])
        decoder_inputs.append(d_input[:seq_len])
        decoder_outputs.append(d_output)

In [39]:
print(encoder_inputs[:5])
print(decoder_inputs[:5])
print(decoder_outputs[:5])

[[4, 5, 6, 7, 8, 9, 10, 11, 12, 13], [27, 28, 29, 30, 31, 32, 33, 34, 35, 36], [64, 64, 65, 66, 67, 9, 5, 68, 69, 70], [73, 3, 3, 3, 3, 3, 3, 3, 3, 3], [97, 3, 3, 3, 3, 3, 3, 3, 3, 3]]
[[1, 27, 28, 29, 30, 31, 32, 33, 34, 35], [1, 38, 24, 39, 12, 40, 12, 41, 42, 9], [1, 73, 3, 3, 3, 3, 3, 3, 3, 3], [1, 24, 74, 75, 76, 9, 77, 78, 24, 79], [1, 98, 99, 9, 89, 100, 101, 102, 103, 104]]
[[27, 28, 29, 30, 31, 32, 33, 34, 35, 2], [38, 24, 39, 12, 40, 12, 41, 42, 9, 2], [73, 3, 3, 3, 3, 3, 3, 3, 3, 2], [24, 74, 75, 76, 9, 77, 78, 24, 79, 2], [98, 99, 9, 89, 100, 101, 102, 103, 104, 2]]


###### BUILD THE NETWORK

In [40]:
def model_inputs():
    inputs = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None])
    learning_rate = tf.placeholder(tf.float32)
    keep_probability = tf.placeholder(tf.float32, name='keep_prob')
    target_sequence_length = tf.placeholder(tf.int32, [None], name='target_sequence_length')
    max_target_len = tf.placeholder(tf.int32, [None], name='max_target_len')
    max_target_len = tf.reduce_max(target_sequence_length)
    source = tf.placeholder(tf.int32, [None], name='source_sequence_length')
    return inputs, targets, learning_rate, keep_probability, target_sequence_length, max_target_len, source

###### ENCODER

In [42]:
def encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob, 
                   source_sequence_length, source_vocab_size, 
                   encoding_embedding_size):
    # Encoder embedding
    enc_embed_input = tf.contrib.layers.embed_sequence(rnn_inputs, source_vocab_size,  )
    
        # RNN cell
    def make_cell(rnn_size):
        lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size,
                                           initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return tf.nn.rnn_cell.DropoutWrapper(lstm_cell, output_keep_prob=keep_prob)

    lstm_cell = tf.contrib.rnn.MultiRNNCell([make_cell(rnn_size) for _ in range(num_layers)])
    
    return tf.nn.dynamic_rnn(lstm_cell, enc_embed_input, sequence_length=source_sequence_length, dtype=tf.float32)

###### NETWORK LAYER

In [43]:
def seq2seq_model(input_data, target_data, keep_prob, batch_size,
                  source_sequence_length, target_sequence_length,
                  max_target_sentence_length,
                  source_vocab_size, target_vocab_size,
                  enc_embedding_size, dec_embedding_size,
                  rnn_size, num_layers, target_vocab_to_int):
    pass