In [1]:
import pandas as pd
import numpy as np
import re
import time
from operator import itemgetter
from copy import deepcopy
from keras.models import Model
from keras.layers import Input, LSTM, Dense



Using TensorFlow backend.


### Pre-Processing

In [2]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''

    # lower the words
    text = text.lower()
    # Extend some abbreviations to original form
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    
    # Remove all the symbols 
    text = re.sub(r"[-()\"#/@;:<>{}`+*=~|.!?,]", "", text)
    
    # Convert more than one space between two words to one space
    text = re.sub(' +',' ',text)
    return text

def line_sort(movie):
    new_frame = deepcopy(movie)
    for ind, i in enumerate(movie):
        line_seq = int(re.search('L(.*)', i[0]).group(1))
        
        # strip the sentence
        new_frame[ind][4] = new_frame[ind][4].strip()
        new_frame[ind][0] = line_seq

    #  Sort each sentence based on line ID in ascend order

    new_frame = sorted(new_frame, key=itemgetter(0))
    return new_frame

# Slice many stories based on the continuous line ID. 
def scenario_generator(a_movie):
    story = []
    people = set()
    
    sorted_a_movie = line_sort(a_movie)
    a_sentence = []
    for ind, am in enumerate(sorted_a_movie):
        if (ind + 1 < len(sorted_a_movie)) and (sorted_a_movie[ind+1][0]) == (am[0] + 1):
            people.add(am[1])
            
# we assume that in a story, there are two characters having conversation. 
            if len(people) < 3:
                a_sentence.append(am[4]) 
            else:
                story.append(a_sentence)
                a_sentence = []
                a_sentence.append(am[4])
                people = set()
            
        else:
            a_sentence.append(am[4])
            
            story.append(a_sentence)
            a_sentence = []
            people = set()

    return story


In [3]:

dialogue = []
error_less = []
error_greater = []

with open('movie_lines.tsv') as f:
    lines = f.read().split('\n')[:-1]
    
dialogue = []
for row in lines:
    row_li = row.split('\t')
    modify_row = ' '.join(row_li[4:])
    modify_row = re.sub(' +',' ',modify_row)
    modify_row = modify_row.replace(" \' ","'")
    if len(row_li) > 5:

        dialogue.append(row_li[:4] + [modify_row])
    else:
        dialogue.append(row_li[:4] + [modify_row])
        
movie_dict = dict()

for i in dialogue:
    if i[2] not in movie_dict:
        movie_dict[i[2]] = []
    movie_dict[i[2]].append(i)
    
convs = {}

for k, v in movie_dict.items():
    convs[k] = scenario_generator(v)
    
# Sort the sentences into questions (inputs) and answers (targets)
questions = []
answers = []
for s in convs.values():
    for conv in list(s):
        for i in range(len(conv)-1):
            questions.append(conv[i])
            answers.append(conv[i+1])
            

# Clean the data
clean_questions = []
for question in questions:
    clean_questions.append(clean_text(question))
clean_questions =  clean_questions 
clean_answers = []    
for answer in answers:
    clean_answers.append(clean_text(answer))
clean_answers =  clean_answers   

# Find the length of sentences
lengths = []
for question in clean_questions:
    lengths.append(len(question.split()))
for answer in clean_answers:
    lengths.append(len(answer.split()))

# Create a dataframe so that the values can be inspected
lengths = pd.DataFrame(lengths, columns=['counts'])

# Remove questions and answers that are shorter than 2 words and longer than 20 words.
min_line_length = 2
max_line_length = 20

# Filter out the questions that are too short/long
short_questions_temp = []
short_answers_temp = []

i = 0
for question in clean_questions:
    if len(question.split()) >= min_line_length and len(question.split()) <= max_line_length:
        short_questions_temp.append(question)
        short_answers_temp.append(clean_answers[i])
    i += 1

# Filter out the answers that are too short/long
short_questions = []
short_answers = []

i = 0
for answer in short_answers_temp:
    if len(answer.split()) >= min_line_length and len(answer.split()) <= max_line_length:
        short_answers.append(answer)
        short_questions.append(short_questions_temp[i])
    i += 1
    
# Create a dictionary for the frequency of the vocabulary
vocab = {}
for question in short_questions:
    for word in question.split():
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1
            
for answer in short_answers:
    for word in answer.split():
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1
            
# Remove rare words from the vocabulary.
# We will aim to replace fewer than 5% of words with <UNK>
# You will see this ratio soon.
threshold = 10
count = 0
for k,v in vocab.items():
    if v >= threshold:
        count += 1
        
# In case we want to use a different vocabulary sizes for the source and target text, 
# we can set different threshold values.
# Nonetheless, we will create dictionaries to provide a unique integer for each word.
questions_vocab_to_int = {}

word_num = 0
for word, count in vocab.items():
    if count >= threshold:
        questions_vocab_to_int[word] = word_num
        word_num += 1
        
answers_vocab_to_int = {}

word_num = 0
for word, count in vocab.items():
    if count >= threshold:
        answers_vocab_to_int[word] = word_num
        word_num += 1
        
# Add <EOS>, <Start>, and <UNK> to vocab and modify the sentences by them

codes = ['<EOS>','<UNK>','<Start>']

for code in codes:
    questions_vocab_to_int[code] = len(questions_vocab_to_int)+1
    
for code in codes:
    answers_vocab_to_int[code] = len(answers_vocab_to_int)+1
    
# Create dictionaries to map the unique integers to their respective words.
# i.e. an inverse dictionary for vocab_to_int.
questions_int_to_vocab = {v_i: v for v, v_i in questions_vocab_to_int.items()}
answers_int_to_vocab = {v_i: v for v, v_i in answers_vocab_to_int.items()}

modified_short_questions = []

for i in range(len(short_answers)):
    short_answers[i] += ' <EOS>'
    
# Add the end of sentence token to the end of every answer.
for i in range(len(short_questions)):
    short_questions[i] += ' <EOS>'
    
# Convert the text to integers. 
# Replace any words that are not in the respective vocabulary with <UNK> 
questions_int = []
for question in short_questions:
    ints = []
    for word in question.split():
        if word not in questions_vocab_to_int:
            ints.append(questions_vocab_to_int['<UNK>'])
        else:
            ints.append(questions_vocab_to_int[word])
    questions_int.append(ints)
    
answers_int = []
for answer in short_answers:
    ints = []
    for word in answer.split():
        if word not in answers_vocab_to_int:
            ints.append(answers_vocab_to_int['<UNK>'])
        else:
            ints.append(answers_vocab_to_int[word])
    answers_int.append(ints)
    
# Sort questions and answers by the length of questions.
# This will reduce the amount of padding during training
# Which should speed up training and help to reduce the loss

sorted_questions = []
sorted_answers = []

for length in range(1, max_line_length+1):
    for i in enumerate(questions_int):
        if len(i[1]) == length:
            sorted_questions.append(questions_int[i[0]])
            sorted_answers.append(answers_int[i[0]])
            


## Model Construction

In [4]:
batch_size = 128  # Batch size for training.
epochs = 5  # Number of epochs to train for.
latent_dim = 128  # Latent dimensionality of the encoding space.
num_samples = 10000  # Number of samples to train on.
# Path to the data txt file on disk.

# Vectorize the data.
input_texts = short_questions
target_texts = short_answers

input_characters = sorted(list(set(questions_int_to_vocab.values())))
target_characters = sorted(list(set(answers_int_to_vocab.values())))


num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt.split()) for txt in input_texts[:num_samples]])
max_decoder_seq_length = max([len(txt.split()) for txt in target_texts[:num_samples]])

print('Number of samples:', len(input_texts[:num_samples]))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

input_token_index = questions_vocab_to_int
target_token_index = answers_vocab_to_int



encoder_input_data = np.zeros(
    (len(input_texts[:num_samples]), max_encoder_seq_length, num_encoder_tokens),
    dtype= np.float32)
decoder_input_data = np.zeros(
    (len(input_texts[:num_samples]), max_decoder_seq_length, num_decoder_tokens),
    dtype= np.float32)
decoder_target_data = np.zeros(
    (len(input_texts[:num_samples]), max_decoder_seq_length, num_decoder_tokens),
    dtype= np.float32)



for i, (input_text, target_text) in enumerate(zip(input_texts[:num_samples], target_texts[:num_samples])):
    for t, char in enumerate(input_text.split()):
        try:
            encoder_input_data[i, t, input_token_index[char.lower()]] = 1.
        except KeyError:
            encoder_input_data[i, t, input_token_index['<UNK>']] = 1.

    for t, char in enumerate(target_text.split()):
#         print(t)
        # decoder_target_data is ahead of decoder_input_data by one timestep
        try:
            decoder_input_data[i, t, target_token_index[char.lower()]] = 1.
        except KeyError:
            decoder_input_data[i, t, target_token_index['<UNK>']] = 1.
        
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            try:
                decoder_target_data[i, t - 1, target_token_index[char]] = 1.
            except KeyError:
                decoder_target_data[i, t - 1, target_token_index['<UNK>']] = 1.
        
        



encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the 
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
print(model.summary())

# model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# model.fit(train_questions, train_answers, batch_size=128, epochs=10)
model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

Number of samples: 10000
Number of unique input tokens: 8739
Number of unique output tokens: 8739
Max sequence length for inputs: 21
Max sequence length for outputs: 21
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 8739)   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 8739)   0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 128), (None, 4540416     input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None,

In [5]:
# from keras.utils import plot_model
# plot_model(model, to_file='model.png')

## Load the model from .h5 checkpoint
#### Fit the model(if loaded, you don't have to run the fitting)

In [6]:
load_ = True

In [7]:
from keras.models import load_model
if load_ == True:
    model = load_model('chatModel.h5')
    
else:
    model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=1000,
          validation_split=0.2)

## Decode Result

In [8]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    decoder_length = 25
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    #target_seq[0, 0, target_token_index['<Start>']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)
        
        #print(output_tokens)
        
        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char
        decoded_sentence += ' '

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '<EOS>' or
           len(decoded_sentence) > decoder_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

## Run chating

In [None]:
def get_input_embedding(string):
    wordNum = 0
    stringVec = np.zeros([1,21,num_encoder_tokens])
    for word in string.split():
        if not word in ['<UNK>','<EOS>']:
            spl_word = clean_text(word).split()
            for word in spl_word:
                try:
                    idx = questions_vocab_to_int[word.lower()]
                    wordVec = np.zeros(num_encoder_tokens)
                    wordVec[idx] = 1
                    stringVec[0,wordNum] = wordVec
                except KeyError:
                    continue
            wordNum += 1
        if wordNum>=21:
            break
    
    return  stringVec

while True:
    print('==========================')
    input_seq = input("Human:")

    decoded_sentence = decode_sequence(get_input_embedding(input_seq))

    print('Chatbot:', decoded_sentence)

Human:Hi
Chatbot: dumping carlo v'ger merrick 
Human:How are you
Chatbot: hair fits fee compound doug 
Human:What's my name
Chatbot: control puppies lady's union 
Human:whta's your name
Chatbot: channing ripped conner umm 
