In [1]:
!python -V

Python 3.8.10


## Data

In [248]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

#### Read In

In [19]:
movie_lines_features = ["LineID", "Character", "Movie", "Name", "Line"]
movie_lines = pd.read_csv("cornell movie-dialogs corpus/movie_lines.txt", 
                          sep = "\+\+\+\$\+\+\+", 
                          engine = "python", 
                          index_col = False, names = movie_lines_features)

# Using only the required columns, namely, "LineID" and "Line"
movie_lines = movie_lines[["LineID", "Line"]]

# Strip the space from "LineID" for further usage and change the datatype of "Line"
movie_lines["LineID"] = movie_lines["LineID"].apply(str.strip)

In [20]:
print(movie_lines.shape)
movie_lines.head()

(304713, 2)


Unnamed: 0,LineID,Line
0,L1045,They do not!
1,L1044,They do to!
2,L985,I hope so.
3,L984,She okay?
4,L925,Let's go.


In [88]:
movie_conversations_features = ["Character1", "Character2", "Movie", "Conversation"]
movie_conversations = pd.read_csv("cornell movie-dialogs corpus/movie_conversations.txt", 
                                  sep = "\+\+\+\$\+\+\+", 
                                  engine = "python", 
                                  index_col = False, 
                                  names = movie_conversations_features)


In [89]:
print(movie_conversations.shape)
movie_conversations.head()

(83097, 4)


Unnamed: 0,Character1,Character2,Movie,Conversation
0,u0,u2,m0,"['L194', 'L195', 'L196', 'L197']"
1,u0,u2,m0,"['L198', 'L199']"
2,u0,u2,m0,"['L200', 'L201', 'L202', 'L203']"
3,u0,u2,m0,"['L204', 'L205', 'L206']"
4,u0,u2,m0,"['L207', 'L208']"


#### Create utterance and response pairs

In [94]:
# remove unneeded chars from Converstation column
def remove_chars(conv_str):
    # input: " ['L194', 'L195', 'L196', 'L197']"
    # output: 'L194,L195,L196,L197'
    chars_to_remove = [' ', "'", '[', ']']
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    return re.sub(rx, "",  conv_str)


# generate pairs of consecutive lines out of conversations
def generate_pairs(conv_list):
    # input: ['L194', 'L195', 'L196', 'L197'])
    # output: [['L194', 'L195'], ['L195', 'L196'], ['L196', 'L197']]
    
    return [[conv_list[i], conv_list[i+1]] for i in range(len(conv_list)-1)]

In [95]:
# get pairs of lines
movie_conversations['conv_pairs'] = movie_conversations["Conversation"].apply(lambda x: generate_pairs(remove_chars(x).split(',')))

# expand conv_pairs column
pairs = movie_conversations['conv_pairs'].apply(pd.Series, 1).stack()
pairs.index = pairs.index.droplevel(-1)
pair_df = pairs.apply(lambda x: pd.Series(x))
pair_df.columns = ['L1','L2']

movie_conversations = movie_conversations.join(pair_df)

In [121]:
print(movie_conversations.shape)
movie_conversations.head()

(221616, 7)


Unnamed: 0,Character1,Character2,Movie,Conversation,conv_pairs,L1,L2
0,u0,u2,m0,"['L194', 'L195', 'L196', 'L197']","[[L194, L195], [L195, L196], [L196, L197]]",L194,L195
0,u0,u2,m0,"['L194', 'L195', 'L196', 'L197']","[[L194, L195], [L195, L196], [L196, L197]]",L195,L196
0,u0,u2,m0,"['L194', 'L195', 'L196', 'L197']","[[L194, L195], [L195, L196], [L196, L197]]",L196,L197
1,u0,u2,m0,"['L198', 'L199']","[[L198, L199]]",L198,L199
2,u0,u2,m0,"['L200', 'L201', 'L202', 'L203']","[[L200, L201], [L201, L202], [L202, L203]]",L200,L201


In [130]:
conv_df = movie_conversations.merge(movie_lines, left_on='L1', right_on='LineID') \
                   .drop('LineID', axis=1) \
                   .rename(columns={'Line':'line_1'}) \
                   .merge(movie_lines, left_on='L2', right_on='LineID') \
                   .drop('LineID', axis=1) \
                   .rename(columns={'Line':'line_2'})

(221616, 9)


In [131]:
print(conv_df.shape)

# take a look at data
pd.set_option('display.max_colwidth', -1)
conv_df.sample()

(221616, 9)


  pd.set_option('display.max_colwidth', -1)


Unnamed: 0,Character1,Character2,Movie,Conversation,conv_pairs,L1,L2,line_1,line_2
33167,u1357,u1364,m90,"['L284619', 'L284620', 'L284621', 'L284622', 'L284623', 'L284624', 'L284625', 'L284626', 'L284627', 'L284628', 'L284629', 'L284630', 'L284631', 'L284632', 'L284633']","[[L284619, L284620], [L284620, L284621], [L284621, L284622], [L284622, L284623], [L284623, L284624], [L284624, L284625], [L284625, L284626], [L284626, L284627], [L284627, L284628], [L284628, L284629], [L284629, L284630], [L284630, L284631], [L284631, L284632], [L284632, L284633]]",L284630,L284631,"Hello, Max, Hildy Johnson. Was there an old lady --?","Butch! I'd put my arm in fire for you -- up to here! Now, you can't double-cross me!... She does? All right -- put her on. I'll talk to her... Hello! Oh, hello, Madam... Now listen, you ten-cent glamour girl, you can't keep Butch away from his duty... What's that? You say that again and I'll come over there and knock your eye out! Hello? I'll kill 'em! I'll kill both of 'em! Duffy! Mousing around with some big blonde Annie on my time! That's co-operation! Duffy!!"


#### Data Preprocessing

In [206]:
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords', quiet=True)
stopwords = stopwords.words('english')

# Remove all english stopwords
def remove_stopwords(text):
    text = ' '.join([word for word in text.split() if word not in (stopwords)])
    return text

def clean_text(string):
    string = string.strip().lower()
    # remove remove unwanted characters (restrict chars to a closed set)
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`:]", " ", string)
    # separate “‘s”, "'ve", r"n't", "'re", "'d", '"'ll" from words to help the tokenizer.
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    # Punctuation will be separated from text to help the tokenizer.
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # Normalize spaces to a single space.
    string = re.sub(r"\s{2,}", " ", string)
    
    # If the token is a number, we substitute it with the “NUM” token
    string = re.sub(r"[0-9]+", "NUM", string)
    
    # remove stopwords
    # this would remove some esential words like 'I', etc. which is necessary to generate response
    # string = remove_stopwords(string)
    
    return string

In [358]:
def add_start_end_token(string):
    # add start and end to indicate the start and end of each sentence.
    string = "start " + string + " end"
    return string

In [207]:
string = "Well I assure you, Sir, I have no desire to create difficulties. 45"
string

'Well I assure you, Sir, I have no desire to create difficulties. 45'

In [208]:
clean_text(string)

'well i assure you , sir , i have no desire to create difficulties NUM'

#### Train, Validation, Test

In [359]:
x = conv_df['line_1'].astype(str).apply(lambda x: clean_text(x))
y = conv_df['line_2'].astype(str).apply(lambda x: clean_text(x)).apply(lambda x: add_start_end_token(x))

x = np.asarray(x)
y = np.asarray(y)

len(x), len(y)

(221616, 221616)

In [360]:
x[:5]

array(['can we make this quick? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad again ',
       "well , i thought we 'd start with pronunciation , if that 's okay with you ",
       'not the hacking and gagging and spitting part please ',
       "you 're asking me out that 's so cute what 's your name again?",
       "no , no , it 's my fault we did n't have a proper introduction "],
      dtype=object)

In [361]:
y[:5]

array(["start well , i thought we 'd start with pronunciation , if that 's okay with you  end",
       'start not the hacking and gagging and spitting part please  end',
       "start okay then how 'bout we try out some french cuisine saturday? night? end",
       'start forget it  end', 'start cameron  end'], dtype=object)

In [419]:
len(x)

221616

In [424]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random

def format_input(x, y, MAX_NB_WORDS=100000, VALIDATION_SPLIT=0.3, MAX_SEQUENCE_LENGTH=1000, sample_ratio=1):
    # format our text samples and labels into tensors that can be fed into a neural network
    # Input:
    #     x: numpy array of utterances
    #     y: numpy array of responses
    # Output:
    #     x_train, y_train, x_val, y_val
    sample_size = int(len(x) * sample_ratio)
    x = x[:sample_size]
    y = y[:sample_size]
    texts = np.concatenate((x,y))
    
    # initialize a tokenizer
    tokenizer = Tokenizer(nb_words=MAX_NB_WORDS,
                          oov_token='<OOV>')

    # Updates the internal word-index dictionary (tokenizer.word_index) in the order of word frequency (lower integer means more frequent)
    # 0 is reserved for padding. 
    tokenizer.fit_on_texts(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    print('word_index is like', dict(list(word_index.items())[:5]))

    
    # Transforms each texts into a sequence of integers 
    # by replacing each token into the index in tokenizer.word_index
    x_vec = tokenizer.texts_to_sequences(x)
    y_vec = tokenizer.texts_to_sequences(y)
    
    sample_ind = random.randint(0, len(x))

    print('len(x_vec), len(y_vec): ', len(x_vec), len(y_vec))
    print(f'len(x[{sample_ind}]), len(x_vec[{sample_ind}]): ', len(x[sample_ind].split(' ')), len(x_vec[sample_ind]))
    print(f'x[{sample_ind}] is like: \n\t', x[sample_ind])
    print(f'x_vec[{sample_ind}] is like: \n\t', x_vec[sample_ind])
    print(f'y[{sample_ind}] is like: \n\t', y[sample_ind])
    print(f'y_vec[{sample_ind}] is like: \n\t', y_vec[sample_ind])
    
    
    
    # pad as deep learning methods using sequential data require fixed length
    x_padded = pad_sequences(x_vec, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    y_padded = pad_sequences(y_vec, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

    print('len(x_padded), len(y_padded): ', len(x_padded), len(y_padded))
    print(f'len(x_padded[{sample_ind}]): ', len(x_padded[sample_ind]))
    print(f'x_padded[{sample_ind}] is like: \n\t',  x_padded[sample_ind])
    print(f'y_padded[{sample_ind}] is like: \n\t',  y_padded[sample_ind])


    nb_validation_samples = int(VALIDATION_SPLIT * x.shape[0])

    x_train = x_padded[:-nb_validation_samples]
    y_train = y_padded[:-nb_validation_samples]
    x_val = x_padded[-nb_validation_samples:]
    y_val = y_padded[-nb_validation_samples:]

    print('len(x_train), len(y_train), len(x_val), len(y_val): ', len(x_train), len(y_train), len(x_val), len(y_val))
    
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_val = np.array(x_val)
    y_val = np.array(y_val)

    return tokenizer, x_train, y_train, x_val, y_val

In [439]:
MAX_NB_WORDS = 100000
# MAX_SEQUENCE_LENGTH = max([len(text) for text in texts])
MAX_SEQUENCE_LENGTH = 32
tokenizer, x_train, y_train, x_val, y_val = format_input(x, y, MAX_NB_WORDS=MAX_NB_WORDS, VALIDATION_SPLIT=0.3, 
                                                         MAX_SEQUENCE_LENGTH=MAX_SEQUENCE_LENGTH,
                                                        sample_ratio=0.5)

word_index = tokenizer.word_index
index_word = tokenizer.index_word
vocab_size = len(word_index) + 1
print(vocab_size)

Found 35274 unique tokens.
word_index is like {'<OOV>': 1, 'start': 2, 'end': 3, 'you': 4, 'i': 5}
len(x_vec), len(y_vec):  110808 110808
len(x[40538]), len(x_vec[40538]):  7 7
x[40538] is like: 
	 where 's sarah? where are the boys?
x_vec[40538] is like: 
	 [84, 9, 2037, 84, 36, 6, 534]
y[40538] is like: 
	 start sit down , jake  end
y_vec[40538] is like: 
	 [2, 404, 118, 848, 3]
len(x_padded), len(y_padded):  110808 110808
len(x_padded[40538]):  32
x_padded[40538] is like: 
	 [  84    9 2037   84   36    6  534    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]
y_padded[40538] is like: 
	 [  2 404 118 848   3   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
len(x_train), len(y_train), len(x_val), len(y_val):  77566 77566 33242 33242


## Transfer Embedding from pre-trained GloVe

In [430]:
from tensorflow.keras.layers import Dense, Embedding, LSTM, Input, Concatenate, TimeDistributed
from tensorflow.keras.models import Sequential, load_model, model_from_json
from tensorflow.keras import optimizers

In [280]:
import gensim.downloader as api
import json

# check available pre-trained models
info = api.info()

for model_name, model_data in sorted(info['models'].items()):
    print(
        '%s (%d records): %s' % (
            model_name,
            model_data.get('num_records', -1),
            model_data['description'][:60] + '...',
        )
    )

__testing_word2vec-matrix-synopsis (-1 records): [THIS IS ONLY FOR TESTING] Word vecrors of the movie matrix....
conceptnet-numberbatch-17-06-300 (1917247 records): ConceptNet Numberbatch consists of state-of-the-art semantic...
fasttext-wiki-news-subwords-300 (999999 records): 1 million word vectors trained on Wikipedia 2017, UMBC webba...
glove-twitter-100 (1193514 records): Pre-trained vectors based on  2B tweets, 27B tokens, 1.2M vo...
glove-twitter-200 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M voc...
glove-twitter-25 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M voc...
glove-twitter-50 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M voc...
glove-wiki-gigaword-100 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword 5.6B ...
glove-wiki-gigaword-200 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B...
glove-wiki-gigaword-300 (400000 records): P

In [283]:
# get detailed info of a pre-trained model
pret_model_name = 'glove-wiki-gigaword-100'
detailed_info = api.info(pret_model_name)
print(json.dumps(detailed_info, indent=4))

glove_model = api.load(pret_model_name)

{
    "num_records": 400000,
    "file_size": 134300434,
    "base_dataset": "Wikipedia 2014 + Gigaword 5 (6B tokens, uncased)",
    "reader_code": "https://github.com/RaRe-Technologies/gensim-data/releases/download/glove-wiki-gigaword-100/__init__.py",
    "license": "http://opendatacommons.org/licenses/pddl/",
    "parameters": {
        "dimension": 100
    },
    "description": "Pre-trained vectors based on Wikipedia 2014 + Gigaword 5.6B tokens, 400K vocab, uncased (https://nlp.stanford.edu/projects/glove/).",
    "preprocessing": "Converted to w2v format with `python -m gensim.scripts.glove2word2vec -i <fname> -o glove-wiki-gigaword-100.txt`.",
    "read_more": [
        "https://nlp.stanford.edu/projects/glove/",
        "https://nlp.stanford.edu/pubs/glove.pdf"
    ],
    "checksum": "40ec481866001177b8cd4cb0df92924f",
    "file_name": "glove-wiki-gigaword-100.gz",
    "parts": 1
}

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





In [444]:
def get_embedding_matrix(word_index, embeddings_index, EMBEDDING_DIM):
    # form embedding metrix for all words in vocabulary
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        if word in embeddings_index:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embeddings_index[word]
    return embedding_matrix 

In [445]:
embeddings_index = glove_model
EMBEDDING_DIM = 100
embedding_matrix = get_embedding_matrix(word_index, embeddings_index, EMBEDDING_DIM)


len(word_index), embedding_matrix.shape

(35274, (35275, 100))

## Sequence-to-sequence Model

In [408]:
# download the attention layer from https://github.com/thushv89/attention_keras/blob/master/src/layers/attention.py
# and copy it in a different file called attention.py. 
# This attention is an implementation of ‘Bahdanau Attention’ .
from attention import AttentionLayer
from tensorflow.keras.utils import plot_model
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping

In [446]:
K.clear_session() 
latent_dim = 100
# Encoder
encoder_inputs = Input(shape=(MAX_SEQUENCE_LENGTH,)) 
# enc_emb_layer = Embedding(vocab_size, latent_dim, trainable=True)
# use pre-trained word embeddings
enc_emb_layer = Embedding(vocab_size,
                            latent_dim,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
enc_emb = enc_emb_layer(encoder_inputs)
#LSTM 1 
encoder_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True) 
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)
#LSTM 2 
encoder_lstm2 = LSTM(latent_dim, return_sequences=True, return_state=True) 
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)
#LSTM 3 
encoder_lstm3 = LSTM(latent_dim, return_state=True, return_sequences=True) 
encoder_outputs, state_h, state_c = encoder_lstm3(encoder_output2)
# Decoder
decoder_inputs = Input(shape=(None,)) 
# dec_emb_layer = Embedding(vocab_size, latent_dim, trainable=True)
# use pre-trained word embeddings
dec_emb_layer = Embedding(vocab_size,
                            latent_dim,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
dec_emb = dec_emb_layer(decoder_inputs)
#LSTM using encoder_states as initial state
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True) 
decoder_outputs, decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb, initial_state=[state_h, state_c])
#Attention Layer
attn_layer = AttentionLayer(name='attention_layer') 
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])
# Concat attention output and decoder LSTM output 
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])
#Dense layer
decoder_dense = TimeDistributed(Dense(vocab_size, activation='softmax')) 
decoder_outputs = decoder_dense(decoder_concat_input)
# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 32)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 32, 100)      3527500     input_1[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 32, 100), (N 80400       embedding[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
______________________________________________________________________________________________

#### Train

In [None]:
# Run training
batch_size = 32
epochs = 1

model.compile(optimizer='rmsprop',
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

history = model.fit([x_train, y_train[:,:-1]], y_train.reshape(y_train.shape[0], y_train.shape[1],1)[:,1:], 
                    epochs=epochs, 
                    callbacks=[es],
                    batch_size=batch_size,
                    validation_data = ([x_val, y_val[:,:-1]], y_val.reshape(y_val.shape[0], y_val.shape[1], 1)[:,1:]))




In [428]:
# save model
model_json = model.to_json()
with open("chatbot_model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("chatbot_model_weight.h5")
print("Saved model to disk")

Saved model to disk


In [431]:
# loading the model architecture and asigning the weights
json_file = open('chatbot_model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model_loaded = model_from_json(loaded_model_json, custom_objects={'AttentionLayer': AttentionLayer})
# load weights into new model
model_loaded.load_weights("chatbot_model_weight.h5")

#### Inference

In [432]:
latent_dim=500
# encoder inference
encoder_inputs = model_loaded.input[0]  #loading encoder_inputs
encoder_outputs, state_h, state_c = model_loaded.layers[6].output #loading encoder_outputs
#print(encoder_outputs.shape)
encoder_model = Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])
# decoder inference
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_hidden_state_input = Input(shape=(32,latent_dim))
# Get the embeddings of the decoder sequence
decoder_inputs = model_loaded.layers[3].output
#print(decoder_inputs.shape)
dec_emb_layer = model_loaded.layers[5]
dec_emb2= dec_emb_layer(decoder_inputs)
# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_lstm = model_loaded.layers[7]
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])
#attention inference
attn_layer = model_loaded.layers[8]
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
concate = model_loaded.layers[9]
decoder_inf_concat = concate([decoder_outputs2, attn_out_inf])
# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_dense = model_loaded.layers[10]
decoder_outputs2 = decoder_dense(decoder_inf_concat)
# Final decoder model
decoder_model = Model([decoder_inputs] + [decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c],
                      [decoder_outputs2] + [state_h2, state_c2])

In [433]:
def decode_sequence(word_index, index_word, input_seq, MAX_SEQUENCE_LENGTH):
    # Encode the input as state vectors.
    e_out, e_h, e_c = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Chose the 'start' word as the first word of the target sequence
    target_seq[0, 0] = word_index['start']
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])
        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        # skip pad token
        if sampled_token_index == 0:
            break
        else:
            sampled_token = index_word[sampled_token_index]
        if sampled_token != 'end':
            decoded_sentence += ' '+sampled_token
        # Exit condition: either hit max length or find stop word.
        if sampled_token == 'end' or len(decoded_sentence.split()) >= (MAX_SEQUENCE_LENGTH - 1):
            stop_condition = True
        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index
        # Update internal states
        e_h, e_c = h, c
    return decoded_sentence

def seq2summary(word_index, index_word, input_seq):
    newString=''
    for i in input_seq:
        if((i != 0 and i != word_index['start']) and i != word_index['end']):
            newString = newString + index_word[i] + ' '
    return newString

def seq2text(index_word, input_seq):
    newString = ''
    for i in input_seq:
        # 0 represents pading token
        if(i != 0):
            newString = newString + index_word[i] + ' '
    return newString

In [435]:
index_word = tokenizer.index_word
for i in range(10):  
    print("Line_1:", seq2text(index_word, x_val[i]))
    print("Original Line_2:", seq2summary(word_index, index_word, y_val[i]))
    print("Predicted Line_2:", decode_sequence(word_index, index_word, x_val[i].reshape(1, 32), MAX_SEQUENCE_LENGTH))
    print("\n")

Line_1: partnership 
Original Line_2: well you got to admit we come a long way 
Predicted Line_2:  i do n't know


Line_1: you okay 
Original Line_2: yeah but i was n't there for a second 
Predicted Line_2:  i do n't know


Line_1: yeah but i was n't there for a second 
Original Line_2: you did pick a real strange time to go and be brave all on your own 
Predicted Line_2:  i do n't know


Line_1: okay reggie start bustin' my chops tell me how great you were with that chick 
Original Line_2: hey jack real men do n't have to go in for that macho bullshit but i was fantastic 
Predicted Line_2:  i do n't know


Line_1: a minute cates i 've been waitin' three years for that i do n't think it 's fair man what about the merit system you were gonnna give me a few thousand 
Original Line_2: there 's nothin' to talk about 
Predicted Line_2:  i do n't know


Line_1: it 's your money it 'll be here in six months when you get out 
Original Line_2: and you 're tellin' me you do n't want any of this 

In [None]:
## TO-DO
# clean code
# run in colab to use gpu

## Evaluation

In [None]:
# bleu score
