# Sequence to Sequence and Attention

## Build Character Training Set

In [1]:
from nlpia.loaders import get_data
df = get_data('moviedialog')
# Parallel lists: the raw statements and the replies paired with them.
input_texts = []
target_texts = []
# Sets keep only the UNIQUE characters seen in the input and target texts;
# those characters become the vocabularies for the one-hot matrices.
input_vocabulary, output_vocabulary = set(), set()
# Tab marks the start of a reply and newline marks its end.
start_token, stop_token = '\t', '\n'
# max_training_samples: intended cap on the number of lines used for training.
# NOTE(review): nothing visible in this notebook reads this value — confirm
# whether the training set was meant to be truncated to it.
max_training_samples = min(len(df) - 1, 25000)

In [2]:
# Peek at the first three rows to check the statement/reply columns.
df.head(3)

Unnamed: 0,statement,reply
0,you're asking me out. that's so cute. what's y...,forget it.
1,"no, no, it's my fault we didn't have a proper ...",cameron.
2,"gosh, if only we could find kat a boyfriend...",let me see what i can do.


In [4]:
# Walk the statement/reply pairs once, collecting the texts and every
# distinct character they contain.
for input_text, target_text in zip(df.statement, df.reply):
    # Wrap each reply in start/stop markers so reply boundaries are explicit.
    target_text = ''.join((start_token, target_text, stop_token))
    input_texts.append(input_text)
    target_texts.append(target_text)
    # set.update adds each character of the string; repeats are ignored.
    input_vocabulary.update(input_text)
    output_vocabulary.update(target_text)

##  Character Model Parameters

In [5]:
# Turn each character set into a sorted list so every character has a
# fixed position.
input_vocabulary = sorted(input_vocabulary)
output_vocabulary = sorted(output_vocabulary)

# Number of distinct characters on each side (the one-hot vector widths).
input_vocab_size, output_vocab_size = len(input_vocabulary), len(output_vocabulary)

# Longest statement and longest reply, measured in characters.
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

# Forward lookups: character -> position in the sorted vocabulary.
input_token_index = {char: i for i, char in enumerate(input_vocabulary)}
target_token_index = {char: i for i, char in enumerate(output_vocabulary)}

# Reverse lookups: position -> character.
reverse_input_char_index = dict(enumerate(input_vocabulary))
reverse_target_char_index = dict(enumerate(output_vocabulary))

## Construct Character Encoder-Decoder Training Set

In [6]:
import numpy as np

# Every tensor is shaped (num_samples, max_len_sequence, num_unique_tokens_in_vocab).
num_samples = len(input_texts)
encoder_shape = (num_samples, max_encoder_seq_length, input_vocab_size)
decoder_shape = (num_samples, max_decoder_seq_length, output_vocab_size)

encoder_input_data = np.zeros(encoder_shape, dtype='float32')
decoder_input_data = np.zeros(decoder_shape, dtype='float32')
decoder_target_data = np.zeros(decoder_shape, dtype='float32')


for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    # One-hot encode the statement; positions past its end stay all-zero.
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
    for t, char in enumerate(target_text):
        token_id = target_token_index[char]
        decoder_input_data[i, t, token_id] = 1.0
        # The target is the decoder input shifted one step earlier, so the
        # first character (the start token) never appears as a target.
        if t >= 1:
            decoder_target_data[i, t - 1, token_id] = 1.0


## Construct and Train Character Sequence Encoder-Decoder Network
We have converted the preprocessed corpus into input and target samples, created index lookup dictionaries, and converted the samples into one-hot tensors. The next cell builds the encoder-decoder network and trains it on those tensors.

In [8]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

# Training hyperparameters.
batch_size = 64
epochs = 100
# Size of the LSTM layer used by both the encoder and the decoder.
num_neurons = 256

# Encoder: takes one-hot character sequences; the time dimension is left
# open (None) so sequences of any length are accepted.
encoder_inputs = Input(shape=(None, input_vocab_size))
# return_state=True makes the layer return its final hidden and cell states
# in addition to its output.
encoder = LSTM(num_neurons, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# Only the two state tensors are handed to the decoder; encoder_outputs is
# not used below.
encoder_states = [state_h, state_c]


# Decoder: starts from the encoder's final states and, because of
# return_sequences=True, produces an output at every time step.
decoder_inputs = Input(shape=(None, output_vocab_size))
decoder_lstm = LSTM(num_neurons, return_sequences=True, return_state=True)
# The states returned by the decoder are discarded during training.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# Softmax over the output vocabulary, applied to the decoder's outputs.
decoder_dense = Dense(output_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
# The training model maps (encoder input, decoder input) to the decoder's
# softmax outputs.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
# validation_split=0.1 holds out 10% of the samples for validation.
model.fit(
    [encoder_input_data, decoder_input_data], 
    decoder_target_data, 
    batch_size=batch_size, 
    epochs=epochs,
    validation_split=0.1)



Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 57915 samples, validate on 6436 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100

KeyboardInterrupt: 

## Construct Response Generator Model

In [None]:
# Inference-time encoder: maps an input sequence to its final
# [hidden state, cell state] pair (the "thought vector").
encoder_model = Model(encoder_inputs, encoder_states)
# Placeholders through which the caller supplies the decoder's initial state
# on every generation step.
thought_input = [Input(shape=(num_neurons,)), Input(shape=(num_neurons,))]
# Reuse the already-built decoder_lstm and decoder_dense layers, so the
# inference model shares their weights with the training model.
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=thought_input)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
# Model takes the plural keywords `inputs=` / `outputs=`. The decoder model
# returns its updated states as well, so they can be fed back in on the
# next step.
decoder_model = Model(
    inputs=[decoder_inputs] + thought_input,
    outputs=[decoder_outputs] + decoder_states)

In [None]:
# Show the tensors the inference decoder takes: the decoder input plus the two state placeholders.
print([decoder_inputs] + thought_input)

In [None]:
# Show the tensors the inference decoder returns: the dense-layer output plus the two updated states.
print([decoder_outputs] + decoder_states)

## Build a character-based translator

In [None]:
def decode_sequence(input_seq):
    '''
    Generate a reply, one character at a time, for an encoded input text.

    Input: the one-hot encoding of the input text, as accepted by
        encoder_model.predict (the caller in this notebook passes an array
        of shape (1, max_encoder_seq_length, input_vocab_size)).
    Output: the generated series of characters as a string. Generation stops
        after stop_token is produced (it is included in the result) or once
        the result is longer than max_decoder_seq_length.
    '''
    # The thought vector (the encoder's final states) is the decoder's
    # initial state.
    thought = list(encoder_model.predict(input_seq))

    # Seed the decoder with the start token: during training every decoder
    # input sequence begins with start_token, so generation must too.
    target_seq = np.zeros((1, 1, output_vocab_size))
    target_seq[0, 0, target_token_index[start_token]] = 1.

    stop_condition = False
    generated_sequence = ''

    while not stop_condition:
        # Pass the last generated token and the latest state to the decoder
        # to predict the next character.
        output_tokens, h, c = decoder_model.predict([target_seq] + thought)

        # Greedy decoding: keep the most probable character.
        generated_token_idx = np.argmax(output_tokens[0, -1, :])
        generated_char = reverse_target_char_index[generated_token_idx]
        generated_sequence += generated_char
        if (generated_char == stop_token or len(generated_sequence) > max_decoder_seq_length):
            stop_condition = True

        # Use the character just generated as the input to the next
        # generation step, and carry the decoder's states forward.
        target_seq = np.zeros((1, 1, output_vocab_size))
        target_seq[0, 0, generated_token_idx] = 1.
        thought = [h, c]

    return generated_sequence
    

In [None]:
def response(input_text):
    """Print the chatbot's reply to ``input_text``.

    The text is one-hot encoded, character by character, into an array of
    shape (1, max_encoder_seq_length, input_vocab_size) and passed to
    decode_sequence; the decoded reply is printed, nothing is returned.

    Characters that are not in input_token_index are dropped, and text
    longer than max_encoder_seq_length is truncated, so arbitrary user input
    cannot raise KeyError or IndexError while encoding.
    """
    # Keep only characters the input vocabulary can represent.
    known_chars = [char for char in input_text if char in input_token_index]
    input_seq = np.zeros((1, max_encoder_seq_length, input_vocab_size), dtype='float32')
    # The slice keeps the encoding inside the array's time dimension.
    for t, char in enumerate(known_chars[:max_encoder_seq_length]):
        input_seq[0, t, input_token_index[char]] = 1.
    decoded_sentence = decode_sequence(input_seq)
    print("Bot Reply (Decoded sentence): ", decoded_sentence)
    