In [110]:
from __future__ import print_function

import os
import pandas as pd
import numpy as np

from keras.layers import Input, Embedding, LSTM, TimeDistributed, Dense
from keras.models import Model, load_model

from katakana import encoding

INPUT_LENGTH = 20
OUTPUT_LENGTH = 15


# Data pre-processing

In [111]:
data = pd.read_csv('../data/joined_titles.csv', header=None)
data = data.sample(frac=1, random_state=0)

data_input = [s.decode('utf-8').lower() for s in data[0]]
data_output = [s.decode('utf-8') for s in data[1]]
print(data_input[0], data_output[0])
print(data_input[5], data_output[5])

data_size = len(data)

# We will use the first 0-60th %-tile (60%) of data for the training
training_input  = data_input[data_size*0/100:data_size*60/100]
training_output = data_output[data_size*0/100:data_size*60/100]

# We will use the first 60-70th %-tile (10%) of data for the training
validation_input = data_input[data_size*60/100:data_size*70/100]
validation_output = data_output[data_size*60/100:data_size*70/100]

print('training size', len(training_input))
print('validation size', len(validation_input))

dorogobuzh ドロゴブージ
brian cowen ブライアン・カウエン
training size 64356
validation size 10726


### Transform data into Numpy arrays

We transform the sequences of characters into sequences of integer IDs. This will be done by using pre-written functions in `encoding` module. 
- First, `encoding.build_characters_encoding` will build encoding/decoding dictionary from the data. 
- Then, `encoding.transform` will transform the data into numpy array.

Check [the previous notebook](./Writing Katakana using Sequence-to-Sequence in Keras) for the details about the transformation.

In [113]:
input_encoding, input_decoding, input_dict_size = encoding.build_characters_encoding(data_input)
output_encoding, output_decoding, output_dict_size = encoding.build_characters_encoding(data_output)

print('English character dict size:', input_dict_size)
print('Katakana character dict size:', output_dict_size)

encoded_training_input = encoding.transform(
    input_encoding, training_input, vector_size=INPUT_LENGTH)
encoded_training_output = encoding.transform(
    output_encoding, training_output, vector_size=OUTPUT_LENGTH)

print('encoded_training_input', encoded_training_input.shape)
print('encoded_training_output', encoded_training_output.shape)

encoded_validation_input = encoding.transform(
    input_encoding, validation_input, vector_size=INPUT_LENGTH)
encoded_validation_output = encoding.transform(
    output_encoding, validation_output, vector_size=OUTPUT_LENGTH)

print('encoded_validation_input', encoded_validation_input.shape)
print('encoded_validation_output', encoded_validation_output.shape)

English character dict size: 54
Katakana character dict size: 89
encoded_training_input (64356, 20)
encoded_training_output (64356, 15)
encoded_validation_input (10726, 20)
encoded_validation_output (10726, 15)


# Sequence-to-Sequence in Keras

In [125]:
encoder_input = Input(shape=(INPUT_LENGTH,))
decoder_input = Input(shape=(OUTPUT_LENGTH,))

### Encoder / Decoder

In [126]:
from keras.layers import SimpleRNN

encoder = Embedding(input_dict_size, 64, input_length=INPUT_LENGTH, mask_zero=True)(encoder_input)
encoder = LSTM(64, return_sequences=True, unroll=True)(encoder)
encoder_last = encoder[:,-1,:]

print('encoder', encoder)
print('encoder_last', encoder_last)

decoder = Embedding(output_dict_size, 64, input_length=OUTPUT_LENGTH, mask_zero=True)(decoder_input)
decoder = LSTM(64, return_sequences=True, unroll=True)(decoder, initial_state=[encoder_last, encoder_last])

print('decoder', decoder)

encoder Tensor("lstm_86/transpose_2:0", shape=(?, 20, 64), dtype=float32)
encoder_last Tensor("strided_slice_40:0", shape=(?, 64), dtype=float32)
decoder Tensor("lstm_87/transpose_2:0", shape=(?, 15, 64), dtype=float32)


### Attention Mechanism

Reference: 
[Effective Approaches to Attention-based Neural Machine Translation](https://arxiv.org/pdf/1508.04025.pdf)'s 
**Global Attention** with **Dot**-based scoring function (Section 3, 3.1)


In [138]:
from keras.layers import Activation, dot, concatenate

scores = dot([decoder, encoder], axes=[2, 2])
attention = Activation('softmax')(scores)
print('attention', attention)

context = dot([attention, encoder], axes=[2,1])
print('context', context)

#
decoder_combined_context = concatenate([context, decoder])
decoder_combined_context = TimeDistributed(Dense(64, activation="linear"))(decoder_combined_context)
print('decoder_combined_context', decoder_combined_context)

decoder_output = TimeDistributed(Dense(output_dict_size, activation="softmax"))(decoder_combined_context)
print('decoder_output', decoder_output)


attention Tensor("activation_3/div:0", shape=(?, 15, 20), dtype=float32)
context Tensor("dot_71/MatMul:0", shape=(?, 15, 64), dtype=float32)
decoder_combined_context Tensor("time_distributed_13/Reshape_1:0", shape=(?, 15, 64), dtype=float32)
decoder_output Tensor("time_distributed_14/Reshape_1:0", shape=(?, 15, 89), dtype=float32)


In [139]:
model = Model(inputs=[encoder_input, decoder_input], outputs=[decoder_output])
model.compile(optimizer='adam', loss='binary_crossentropy')

## Training the model

In [141]:
training_encoder_input = encoded_training_input
training_decoder_input = np.zeros_like(encoded_training_output)
training_decoder_input[:, 1:] = encoded_training_output[:,:-1]
training_decoder_input[:, 0] = encoding.CHAR_CODE_START
training_decoder_output = np.eye(output_dict_size)[encoded_training_output.astype('int')]

validation_encoder_input = encoded_validation_input
validation_decoder_input = np.zeros_like(encoded_validation_output)
validation_decoder_input[:, 1:] = encoded_validation_output[:,:-1]
validation_decoder_input[:, 0] = encoding.CHAR_CODE_START
validation_decoder_output = np.eye(output_dict_size)[encoded_validation_output.astype('int')]

In [142]:
if os.path.isfile('model.h5'):
    model = load_model('model.h5')
else:
    model.fit(x=[training_encoder_input, training_decoder_input], y=[training_decoder_output],
          validation_data=([validation_encoder_input, validation_decoder_input], [validation_decoder_output]),
          verbose=2, batch_size=64, epochs=10)

model.fit(x=[training_encoder_input, training_decoder_input], y=[training_decoder_output],
          validation_data=([validation_encoder_input, validation_decoder_input], [validation_decoder_output]),
          verbose=2, batch_size=64, epochs=5)

model.save('model.h5')

Train on 64356 samples, validate on 10726 samples
Epoch 1/10
153s - loss: 0.0456 - val_loss: 0.0397
Epoch 2/10
106s - loss: 0.0349 - val_loss: 0.0292
Epoch 3/10
113s - loss: 0.0232 - val_loss: 0.0197
Epoch 4/10
127s - loss: 0.0183 - val_loss: 0.0173
Epoch 5/10
117s - loss: 0.0166 - val_loss: 0.0161
Epoch 6/10
117s - loss: 0.0155 - val_loss: 0.0153
Epoch 7/10
104s - loss: 0.0148 - val_loss: 0.0148
Epoch 8/10
132s - loss: 0.0142 - val_loss: 0.0142
Epoch 9/10
119s - loss: 0.0138 - val_loss: 0.0140
Epoch 10/10
119s - loss: 0.0134 - val_loss: 0.0137
Train on 64356 samples, validate on 10726 samples
Epoch 1/5
108s - loss: 0.0131 - val_loss: 0.0134
Epoch 2/5
107s - loss: 0.0128 - val_loss: 0.0133
Epoch 3/5
110s - loss: 0.0126 - val_loss: 0.0131
Epoch 4/5
106s - loss: 0.0124 - val_loss: 0.0130
Epoch 5/5
108s - loss: 0.0123 - val_loss: 0.0128


  str(node.arguments) + '. They will not be included '
