# Seq2Seq Chatbot
## 2018.11.20
Homework Instruction:

In this homework you will be creating a Chatbot using a sequence-to-sequence model. You are allowed to work in
groups of up to 2 students. This homework is of an open format; all we will be providing you with is the data. It is up
to you to pre-process the data, build the seq-2-seq model with keras, and train the model. You will be submitting your
code and write-up containing the three sections deﬁned below.

<img src="https://cdn.technologyadvice.com/wp-content/uploads/2018/02/friendly-chatbot-700x408.jpg" width=500 height=500>

# Setup:
## Prepare data for neural network
We'll begin by importing the needed models and load the data.

In [2]:
import numpy as np
import pandas as pd
import re
import math
import pickle
import h5py

from keras.models import Sequential
from keras.models import Model, load_model
from keras.layers import LSTM, Embedding 
from keras.layers import Input
from keras.layers import Dense, Flatten, Reshape
from keras import optimizers

# hyperparameter
mxlen = 20 # Max length for a sequence of tokens
batch_size = 128  # Batch size for training.
epochs = 10  # Number of epochs to train for.
char_dim= 50 # Embedding size
latent_dim = 50  # Latent dimensionality of the encoding space.

# dictionary
word2id = {} # Count all word library
id2word = {} # Reverse word2id
input_word2id = {} # Count input word library
output_word2id = {} # Count output word library
input_id2word = {} # Reverse input_word2id
output_id2word = {} # Reverse output_word2id

data_path = "data/movie_lines.tsv"
conversation_path = "data/movie_conversations.txt"

In [3]:
# 1) Remove bad symbols and tokenization
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')

def text_prepare(text):
    """
        text: a string
        
        return: modified string tokens 
                [tok1, tok2 , ...] which is a single sentence from one character
    """
    tok = ["<START>"] # add START token to represent sentence start
    text = text.lower() # lowercase text
    text = re.sub(REPLACE_BY_SPACE_RE, ' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = re.sub(BAD_SYMBOLS_RE, '', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    tok += (text.split()+["<EOS>"]) # add EOS token to represent sentence end
    
    return tok

In [4]:
# 2) Dictionary of all words from train corpus with their counts.
#    Dictionary of all words with its ids
def count_words(line_dict, word2id, id2word):
    """
    count:
    { tok1: count1, tok2: count2, ...}
    word2id:
    { tok1: id1, tok2: id2, ...}
    id2word:
    { id1: tok1, id2: tok2, ...}
    
    """
    count = {}
    
    # Special Tokens
    word2id["<START>"] = 0
    word2id["<EOS>"] = 1
    word2id["<UNK>"] = 2
    index = 3
    
    for toks in line_dict.values():
        for word in toks:
            # Count the words
            if not word in count:
                count[word] = 1
            else:
                count[word] += 1
            # Make dictionary
            if not word in word2id:
                word2id[word] = index
                index += 1
    
    # Count the words that appears only once.
    scarce_words_counts = [x[0] for x in sorted(count.items(), key = lambda x: x[1], reverse=True) if x[1] == 1]
    
    # Remove scarce words in word2id dictionary and reindex all words
    for word in scarce_words_counts:
        del word2id[word]
    
    # Arrange word2id and id2word
    word2id = {key: i for i, key in enumerate(word2id.keys())}
    id2word = {i:symbol for symbol, i in word2id.items()}
    
    return scarce_words_counts, word2id, id2word

In [8]:
def load_text(data_path, word2id, id2word):
    """
        Load the movie_lines.tsv file which contains the data. 
        The ﬁle has ﬁve tab separated columns containing the following ﬁelds:
        1. lineID
        2. characterID (who uttered this phrase)
        3. movieID
        4. character name
        5. text of the utterance
        
        Here we only extract lineID and utterance
        
        line_dict = {lineID1: utterance1,
                     lineID2: utterance2,
                     lineID3: utterance3,
                     ...}
    """
    file1 = open(data_path)
    line_dict = {}
    
    i = 0
    
    for line in file1:
        cols = line.rstrip().split("\t")
        line_dict[cols[0].replace('"','')] = text_prepare(cols[-1])
        
        i += 1
        # 400 movies
        if i == 308298:
            break
    
    scarce_words_counts, word2id, id2word = count_words(line_dict, word2id, id2word)
    
    return line_dict, scarce_words_counts, word2id, id2word

In [14]:
def load_conversations(conversation_path, line_dict, scarce_words_counts, prev_sent=2):
    """
        Load movie_conversations.txt which has the conversation lists
        all_convs = [converation 1: [('', sent1, sent2),
                                     (sent1, sent2, sent3),
                                     (sent2, sent3, sent4),
                                     ...]]
        num_data: number of all data
        num_conv: number of all conversation pairs
    
    """
    
    file2 = open(conversation_path, 'r').read().split('\n')[:-1]
    all_convs = []
    num_data, num_conv = 0, 0
    for i,conv in enumerate(file2):
        convs = []
        DELETE = re.compile('[/(){}\[\]\|@,;\']')
        conv = re.sub(DELETE, '', conv.split(' +++$+++ ')[-1]).split()
        
        con_a_1 = []
        for i in range(len(conv)-1):
            con_a_2 = [line_dict[conv[i]]
            con_b = line_dict[conv[i+1]]
            convs.append((con_a_1+con_a_2, con_b) if prev_sent==2 else (con_a_2, con_b))
            num_data += 1
            con_a_1 = con_a_2
            
        num_conv += 1
        all_convs.append(convs)
    
    return all_convs, num_data, num_conv

In [17]:
%%time
line_dict, scarce_words_counts, word2id, id2word = load_text(data_path, word2id, id2word)
all_convs, num_data, num_conv = load_conversations(line_dict, conversation_path, 2)

CPU times: user 5.5 s, sys: 173 ms, total: 5.67 s
Wall time: 5.74 s


In [18]:
all_convs[0][1]

(['<START>',
  'well',
  'i',
  'thought',
  'wed',
  'start',
  'with',
  'pronunciation',
  'if',
  'thats',
  'okay',
  'with',
  'you',
  '<EOS>'],
 ['<START>',
  'not',
  'the',
  'hacking',
  'and',
  'gagging',
  'and',
  'spitting',
  'part',
  'please',
  '<EOS>'])

### 2. Replace and restrict Word length
In the following steps, we will replace the word that only appears once with <UNK> token and restrict sentence to mxlen length.


In [8]:
def modify(all_toks_new, scarce_words_counts, mxlen):
    """
    all_toks_new: (each with same length mxlen)
     [
        movie 0:[ line 0: [id1, id2, ...],
                  line 1: [id1, id2, ...],
                  ... ]
        movie 1:[ line 0: [id1, id2, ...],
                  line 1: [id1, id2, ...],
                  ... ]
        movie 2:[ line 0: [id1, id2, ...],
                  line 1: [id1, id2, ...],
                  ... ]
        ...

     ]
     
    scarce_words_counts: A list with words that only appear once
    [ token1, token2, token3, ...]
    
    """
    # Replace the word with <UNK> that appears only once.
    # for movie in all_toks_new:
    for i in range(len(all_toks_new)):
        if i % 100 == 0:
            print("Iteration (per 100 movies): ",int(i/100))
        for toks in all_toks_new[i]:
            for j in range(len(toks)):
                if toks[j] in scarce_words_counts:
                    toks[j] = "<UNK>"
    
    # Cut the sentence to mxlen
    for movie in all_toks_new:
        for i in range(len(movie)):
            movie[i] = np.array(movie[i][:mxlen])
    
    return all_toks_new
    

In [10]:
%%time
############################################
# Default: Skip, load the data directly
all_toks_new = modify(all_toks_new, scarce_words_counts, mxlen)
file1=open("all_toks_new.bin","wb")
file2=open("word2id.bin","wb")
pickle.dump(all_toks_new,file1)
pickle.dump(word2id,file2)
file1.close()
file2.close()
############################################

Iteration (per 100 movies):  0
Iteration (per 100 movies):  1
Iteration (per 100 movies):  2
CPU times: user 7min 35s, sys: 2.74 s, total: 7min 38s
Wall time: 8min 17s


In [9]:
# Load the processed data to save time
file1=open("all_toks_new.bin","rb")
file2=open("word2id.bin","rb")
all_toks_new=pickle.load(file1)
word2id=pickle.load(file2)

### 3. Make encoding and decoding data
1) Turn the data into two main data forms: <br>
   **input_tokens, output_tokens**<br>
2) Then we will have to prepare two word dictionary **input_word2id, output_word2id** to turn word into ids (bag-of-word) <br>
3) Calculate their length as **num_encoder_tokens, num_decoder_tokens**

### First part: 
Turn into input and output tokens

In [10]:
def separate_conv(ids, toks):
    """
    Separate the sequence of characters and their words if they utter continuously without waiting for the other to speak
    For example:
    ids = [2, 0, 2, 0, 2, 0, 0 ,2]
    toks = [tok1, tok2, tok3, tok4, tok5, tok6, tok7, tok8]
    sep_toks = [[tok1, tok2, tok3, tok4, tok5, tok6], [tok7, tok8]]
    
    """
    sep_toks = []
    for i in range(len(ids)):
        if i == 0:
            temp = ids[i]
            idx = i
        else:
            if temp == ids[i]:
                sep_toks.append(toks[idx:i])
                idx = i
            temp = ids[i]
        
        if i == (len(ids)-1):
            sep_toks.append(toks[idx:len(ids)])
    
    return sep_toks            

In [11]:
def make_data(all_lineids, all_ids, all_toks_new):
    """
    Transform our original data with all dialogues all_toks_new into training data (input_tokens, output_tokens)
    
    A movie can be seen as an entity with sequential characters' conversations.
    We deem a conversation end when two line ids are not consecutive.
    for example, if a lineid sequence is [242, 241, 237, 236, 235]
    we can make it into two conversations: [242, 241], [237, 236, 235]
    
    After specifying the conversations, we can then prepare the training data as follows:
    for two conversations: [242, 241], [237, 236, 235] and corresponding token sequence is [toks1, toks2], [toks3, toks4, toks5]
    we make input_tokens as [toks2], [toks4, toks5]
            output_tokens as [toks1], [toks3, toks4]
    
    
    Then we combine all tokens input-output pairs of every conversation in every movie.
    so we will have
    input_tokens = [toks2, toks4, toks5, toks7, ...]
    output_tokens = [toks1, toks3, toks4, toks6, ...]
    
    Finally we wish to have our target output tokens to be almost same as output_tokens with each data ahead by one timestep.
    output_target_tokens = [toks1, toks3, toks4, toks6, ...]
    
    """
    input_tokens = []
    output_tokens = []
    
    N = len(all_lineids) #number of movies

    for i in range(N):
        #For a single movie
        movie = all_lineids[i]
        for j in range(len(movie)):
            if j == 0:
                temp = movie[j]
                idx = j
            else:
                if (temp-movie[j]) is not 1:
                    sep_toks = separate_conv(all_ids[i][idx:j], all_toks_new[i][idx:j])
                    for toks in sep_toks:
                        input_tokens += toks[1:]
                        output_tokens += toks[:-1]

                    idx = j
                temp = movie[j]

            #Last Sequence
            if j == len(movie)-1:
                sep_toks = separate_conv(all_ids[i][idx:len(movie)], all_toks_new[i][idx:len(movie)])
                for toks in sep_toks:
                    input_tokens += toks[1:]
                    output_tokens += toks[:-1]
            
    return input_tokens, output_tokens

In [12]:
input_tokens, output_tokens = make_data(all_lineids, all_ids, all_toks_new)
input_tokens = np.asarray(input_tokens)
output_tokens = np.asarray(output_tokens)

In [13]:
print(input_tokens[23444])
print(output_tokens[23444])

['<START>' 'tell' 'you' 'what' 'lets' 'ditch' 'the' 'limo' 'let' 'me'
 'drive' 'you' 'up' 'to' 'that' 'red' 'carpet' 'in' 'my' 'beat']
['<START>' 'the' 'hell' 'you' 'will' 'harry' 'york' '<EOS>']


### Second part: 
Turn into encoder_input_data, decoder_input_data, decoder_target_data

In [14]:
print(len(input_tokens))

79590


In [15]:
input_tokens = input_tokens[:30000]
output_tokens = output_tokens[:30000]

In [36]:
# Initialize parameters
all_input_words = set()
all_output_words = set()

# Calculate input words and output words as a sorted list
for toks in input_tokens:
    for tok in toks:
        if tok not in all_input_words:
            all_input_words.add(tok)
for toks in output_tokens:
    for tok in toks:
        if tok not in all_output_words:
            all_output_words.add(tok)
all_input_words = sorted(list(all_input_words))
all_output_words = sorted(list(all_output_words))

# Make input and output libraries
num_encoder_tokens = len(all_input_words)
num_decoder_tokens = len(all_output_words)
input_word2id = dict([(word, i) for i, word in enumerate(all_input_words)])
output_word2id = dict([(word, i) for i, word in enumerate(all_output_words)])
input_id2word = dict((i, tok) for tok, i in input_word2id.items())
output_id2word = dict((i, tok) for tok, i in output_word2id.items())

# Make encoder_input_data, decoder_input_data, decoder_target_data
encoder_input_data = np.zeros((len(input_tokens), mxlen), dtype='float32')
decoder_input_data = np.zeros((len(output_tokens), mxlen), dtype='float32')
decoder_target_data = np.zeros((len(output_tokens), mxlen, num_decoder_tokens), dtype='float32')

for i, (input_text, output_text) in enumerate(zip(input_tokens, output_tokens)):
    for t, word in enumerate(input_text):
        encoder_input_data[i, t] = input_word2id[word]
    for t, word in enumerate(output_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t] = output_word2id[word]
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, output_word2id[word]] = 1.

# -----------------------------------
# Model:
## Define a basic LSTM-based Seq2Seq model using keras

Instructions:

You will be implementing the sequence-to-sequence model described in class where the model makes predictions
using the left context and the dialogue context. More information model can be found in these lecture slides, this
paper, or the reading.

In [17]:
# Encoder
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,), name="Encoder_input")

encoder_embedding = Embedding(input_dim=num_encoder_tokens, 
                              output_dim=char_dim, name="Encoder_Embedding")
encoder_e = encoder_embedding(encoder_inputs)

encoder_lstm = LSTM(latent_dim, return_state=True, name="Encoder_lstm")
encoder_outputs, state_h, state_c = encoder_lstm(encoder_e)
#We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]



# decoder, using `encoder_states` as initial state.
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_inputs = Input(shape=(None,), name="Decoder_input")

decoder_embedding = Embedding(input_dim=num_decoder_tokens, 
                              output_dim=char_dim, name="Decoder_Embedding")
decoder_e = decoder_embedding(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name="Decoder_lstm")
decoder_outputs, _, _ = decoder_lstm(decoder_e, initial_state=encoder_states)

decoder_dense = Dense(num_decoder_tokens, activation='softmax', name="Dense_layer")
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
my_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [18]:
# Run training
optimizer = optimizers.RMSprop(lr=0.002, rho=0.9, epsilon=None, decay=0.0)
my_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])
print(my_model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Encoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Decoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Encoder_Embedding (Embedding)   (None, None, 50)     667500      Encoder_input[0][0]              
__________________________________________________________________________________________________
Decoder_Embedding (Embedding)   (None, None, 50)     673050      Decoder_input[0][0]              
__________________________________________________________________________________________________
Encoder_ls

In [None]:
my_model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

############################################
#################Optional###################
############################################
# Save model
my_model.save('s2s.h5')
############################################
#################Optional###################
############################################

Train on 33250 samples, validate on 1750 samples
Epoch 1/10


# Next: Inference mode (sampling)
Here we do the sampling to retrieve initial decoder state. <br>

1) encode input and retrieve initial decoder state <br>
2) run one step of decoder with this initial state and a "start of sequence" token as target. Output will be the next target token <br>
3) Repeat with the current target token and current states

In [47]:
# Define sampling encoder models
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Encoder_input (InputLayer)   (None, None)              0         
_________________________________________________________________
Encoder_Embedding (Embedding (None, None, 50)          468800    
_________________________________________________________________
Encoder_lstm (LSTM)          [(None, 50), (None, 50),  20200     
Total params: 489,000
Trainable params: 489,000
Non-trainable params: 0
_________________________________________________________________


In [48]:
# Define sampling decoder models
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_e2 = decoder_embedding(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(decoder_e2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)

# Next: Predict decoder sequence
At the last stage, we could predict the input sequence by putting in our predefined model. <br>

In [60]:
e_model = load_model("encoder.h5")
d_model = load_model("decoder.h5")

def decode_sequence(input_seq,e_model,d_model):
    
    # Encode the input as state vectors.
    states_value = e_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = output_word2id['<START>']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        outputs, h, c = d_model.predict(
            [target_seq] + states_value)
        
        # Sample a token
        sampled_token_index = np.argmax(outputs[0, -1, :])
        sampled_tok = output_id2word[sampled_token_index]
        decoded_sentence += ' '+sampled_tok

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_tok == '<EOS>'):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]
    
    decoded_sentence = decoded_sentence.strip('<EOS>')
    return decoded_sentence



# -----------------------------------
# Testing
#### Here we can do the final prediction

In [61]:
def tokenize_seq(data):
    token_data = [text_prepare(text) for text in data]
    encoder_data = np.zeros((len(token_data), mxlen), dtype='float32')

    for i, input_text in enumerate(token_data):
        for t, word in enumerate(input_text):
            if word in input_word2id:
                encoder_data[i, t] = input_word2id[word]
            else:
                encoder_data[i, t] = 2
    return encoder_data

In [62]:
# Test input data
data = ["my name is david, what is my name?",
        "my name is john, what is my name?",
        "are you a leader or a follower?",
        "are you a follower or a leader?",
        "what is moral?",
        "what is immoral?",
        "what is altruism?",
        "ok ... so what is the deﬁnition of morality?",
        "tell me the deﬁnition of morality , i am quite upset now!"]

encoder_data = tokenize_seq(data)
for seq_index in range(len(encoder_data)):
    input_seq = encoder_data[seq_index:seq_index+1]
    decoded_sentence = decode_sequence(input_seq,e_model,d_model)
    print('-')
    print('Input sentence:', data[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: my name is david, what is my name?
Decoded sentence:  oh no i dont know it 
-
Input sentence: my name is john, what is my name?
Decoded sentence:  <UNK> 
-
Input sentence: are you a leader or a follower?
Decoded sentence:  what do you want me to say 
-
Input sentence: are you a follower or a leader?
Decoded sentence:  hey what do you mean 
-
Input sentence: what is moral?
Decoded sentence:  <UNK> <UNK> <UNK> 
-
Input sentence: what is immoral?
Decoded sentence:  the <UNK> 
-
Input sentence: what is altruism?
Decoded sentence:  the <UNK> 
-
Input sentence: ok ... so what is the deﬁnition of morality?
Decoded sentence:  the world is the only thing that would never have live to the same 
-
Input sentence: tell me the deﬁnition of morality , i am quite upset now!
Decoded sentence:  what the fuck are you doing here 


In [63]:
# Randomly test data in training set
for seq_index in np.random.permutation(len(encoder_input_data))[:100]:
    input_seq = encoder_input_data[seq_index:seq_index+1]
    decoded_sentence = decode_sequence(input_seq,e_model,d_model)
    print('-')
    print('Input sentence:', " ".join(input_tokens[seq_index][1:-1].tolist()))
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: i dont know what if it breaks
Decoded sentence:  what are you talking about 
-
Input sentence: i wouldnt know anything about that
Decoded sentence:  i know that 
-
Input sentence: and lydia telling natalie the truth makes you a victim in what way
Decoded sentence:  a lot of <UNK> 
-
Input sentence: no not crazy
Decoded sentence:  i dont know 
-
Input sentence: sort of um
Decoded sentence:  what do you mean 
-
Input sentence: after i get outta this <UNK> gonna live forever
Decoded sentence:  i dont know 
-
Input sentence: i told her she doesnt feel she can do that something about her father not letting her go
Decoded sentence:  well i know that you have to go 
-
Input sentence: i didnt mother did
Decoded sentence:  yeah 
-
Input sentence: im sorry cole
Decoded sentence:  you dont understand 
-
Input sentence: were not dropping it now thats the reason isnt it im a dirty degenerate arent i im not
Decoded sentence:  whats the matter of this 
-
Input sentence: when do you 

-
Input sentence: you cant help them right now theyre being cocooned just like the others
Decoded sentence:  oh god oh god please you dont know how to get this just go 
-
Input sentence: you dont crap out of specrecon and get another shot without <UNK> from someone up in flag country
Decoded sentence:  how long 
-
Input sentence: will scraps be able to sit with us dad
Decoded sentence:  well i think he was the only way 
-
Input sentence: we did not leave together
Decoded sentence:  well i think he was the only way 
-
Input sentence: the maze you mean the air ducts
Decoded sentence:  yeah 
-
Input sentence: yes
Decoded sentence:  i dont know 
-
Input sentence: this is maybe worse than you and i are used to what im talking about im talking about
Decoded sentence:  <UNK> 
-
Input sentence: you mean to tell me that there is no one who holds a special place in your heart
Decoded sentence:  oh yes 
-
Input sentence: good i called you at work today they said you were home sick
Decoded sentenc