In [1]:
from __future__ import print_function

import os
import pandas as pd
import numpy as np

from keras.layers import Input, Embedding, LSTM, TimeDistributed, Dense
from keras.models import Model, load_model

Using TensorFlow backend.


# Explore and transform the data

In [2]:
data = pd.read_csv('./data/joined_titles.csv', header=None)
data = data.sample(frac=1, random_state=0)

print(len(data))
print(data[0:5])

107261
                         0              1
11206           Dorogobuzh         ドロゴブージ
80376         Gail Hopkins      ゲイル・ホプキンス
38108              Novatek          ノヴァテク
29960      Gyula Cseszneky     チェスネキー・ジュラ
22295  Occhieppo Superiore  オッキエッポ・スペリオーレ


In [3]:
data_input = [s.decode('utf-8').lower() for s in data[0]]
data_output = [s.decode('utf-8') for s in data[1]]

print(data_input[0:3])
print(data_output[0:3])

[u'dorogobuzh', u'gail hopkins', u'novatek']
[u'\u30c9\u30ed\u30b4\u30d6\u30fc\u30b8', u'\u30b2\u30a4\u30eb\u30fb\u30db\u30d7\u30ad\u30f3\u30b9', u'\u30ce\u30f4\u30a1\u30c6\u30af']


In [4]:
data_size = len(data)

# We will use the first 0-60th %-tile (60%) of data for the training
training_input  = data_input[data_size*0/100:data_size*60/100]
training_output = data_output[data_size*0/100:data_size*60/100]

# We will use the first 60-70th %-tile (10%) of data for the training
validation_input = data_input[data_size*60/100:data_size*70/100]
validation_output = data_output[data_size*60/100:data_size*70/100]

print(len(training_input))
print(len(validation_input))

42904
10726


### Encoding character input

We will create a character dictionary and encode the title from a string (a sequence of character) into a sequence of IDs. We will also create the reverse dictionary that will be used for getting the result later.

Note that in practice, we must not build the dictionary from all data (`data_input` and `data_output`), but only use the training set (`training_input` and `training_output`). We also have to handle out-of-dictionary characters. However, for now, I will skip that part.

Note:
- We will use 0 for padding and 1 for 'START'. So, `count` starts from 2. 
- This is to take advantage of `mask_zero=True` feature for Embedding Layer in Keras

In [5]:
START_CHAR_CODE = 1

def encode_characters(titles):
    count = 2
    encoding = {}
    decoding = {1: 'START'}
    for c in set([c for title in titles for c in title]):
        encoding[c] = count
        decoding[count] = c
        count += 1
    return encoding, decoding, count


input_encoding, input_decoding, input_dict_size = encode_characters(data_input)
output_encoding, output_decoding, output_dict_size = encode_characters(data_output)


print('English character dict size:', input_dict_size)
print('Katakana character dict size:', output_dict_size)

print(input_encoding.items()[0:5])
print(input_decoding.items()[0:5])

English character dict size: 54
Katakana character dict size: 89
[(u'j', 35), (u'u', 44), (u' ', 5), (u'z', 50), (u's', 42)]
[(1, 'START'), (2, u'\xea'), (3, u'\u017c'), (4, u'\u0175'), (5, u' ')]


###  Transforming the titles

In [6]:
def transform(encoding, data, vector_size):
    transformed_data = np.zeros(shape=(len(data), vector_size))
    for i in range(len(data)):
        for j in range(min(len(data[i]), vector_size)):
            transformed_data[i][j] = encoding[data[i][j]]
    return transformed_data

INPUT_LENGTH = 20
OUTPUT_LENGTH = 20

encoded_training_input = transform(input_encoding, training_input, vector_size=INPUT_LENGTH)
encoded_training_output = transform(output_encoding, training_output, vector_size=OUTPUT_LENGTH)
encoded_validation_input = transform(input_encoding, validation_input, vector_size=INPUT_LENGTH)
encoded_validation_output = transform(output_encoding, validation_output, vector_size=OUTPUT_LENGTH)

print('input', encoded_training_input)
print('output', encoded_training_output)

input [[ 29.  38.  43. ...,   0.   0.   0.]
 [ 30.  25.  32. ...,   0.   0.   0.]
 [ 39.  38.  47. ...,   0.   0.   0.]
 ..., 
 [ 34.  25.  45. ...,   0.   0.   0.]
 [ 42.  32.  37. ...,   0.   0.   0.]
 [ 35.  38.  42. ...,   0.   0.   0.]]
output [[ 57.  66.  29. ...,   0.   0.   0.]
 [ 72.  25.  21. ...,   0.   0.   0.]
 [ 79.  45.  47. ...,   0.   0.   0.]
 ..., 
 [ 50.  19.  53. ...,   0.   0.   0.]
 [  8.  21.  36. ...,   0.   0.   0.]
 [ 30.  20.   8. ...,   0.   0.   0.]]


# Sequence-to-Sequence in Keras

In [7]:
encoder_input = Input(shape=(INPUT_LENGTH,))
decoder_input = Input(shape=(OUTPUT_LENGTH,))

### Encoder

First, we will use [Embedding layer](https://keras.io/layers/embeddings/) to transform input char-id sequence into dense vectors.  

The input vectors will be passed to a [Recurrent layer](https://keras.io/layers/recurrent/) (we use LSTM) that will transform the vectors of each input character to a single output vector.

In [8]:
# Encoder
encoder = Embedding(input_dict_size, 64, input_length=INPUT_LENGTH, mask_zero=True)(encoder_input)
encoder = LSTM(64)(encoder)

print(encoder.get_shape())

(?, 64)


### Decoder

Our decoder generate Katakana sequence (as a softmax prediction) on characrter at the time. Every generated output at decoding step will be passed back as an input of the decoder to generate the next output.

Similar to the encoder, the input will be passed to an Embedding layer to transform the input into dense vectors and pass them to LSTM.

We will use the encoder's output to initialize decoder state (`initial_state`).

The final layer will be (time distributed) Dense layer that will produce the softmax prediction.

In [9]:
decoder = Embedding(output_dict_size, 64, input_length=OUTPUT_LENGTH, mask_zero=True)(decoder_input)

decoder = LSTM(64, return_sequences=True)(decoder, initial_state=[encoder, encoder])
decoder = TimeDistributed(Dense(output_dict_size, activation="softmax"))(decoder)

print(decoder.get_shape())

(?, 20, 89)


In [10]:
model = Model(inputs=[encoder_input, decoder_input], outputs=[decoder])
model.compile(optimizer='adam', loss='binary_crossentropy')

In [11]:
# Encoder Input
training_encoder_input = encoded_training_input

# Decoder Input (need padding py START_CHAR_CODE)
training_decoder_input = np.zeros_like(encoded_training_output)
training_decoder_input[:, 1:] = encoded_training_output[:,:-1]
training_decoder_input[:, 0] = START_CHAR_CODE

# Decoder Output (one-hot encode)
training_decoder_output = np.eye(output_dict_size)[encoded_training_output.astype('int')]


print('encoder input', training_encoder_input[:1])
print('decoder input', training_decoder_input[:1])
print('decoder output', training_decoder_output[:1].argmax(axis=2))
print('decoder output (one-hot)', training_decoder_output[:1])

encoder input [[ 29.  38.  43.  38.  30.  38.  27.  44.  50.  33.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.]]
decoder input [[  1.  57.  66.  29.  81.  46.  30.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.]]
decoder output [[57 66 29 81 46 30  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
decoder output (one-hot) [[[ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  ..., 
  [ 1.  0.  0. ...,  0.  0.  0.]
  [ 1.  0.  0. ...,  0.  0.  0.]
  [ 1.  0.  0. ...,  0.  0.  0.]]]


In [12]:
validation_encoder_input = encoded_validation_input
validation_decoder_input = np.zeros_like(encoded_validation_output)
validation_decoder_input[:, 1:] = encoded_validation_output[:,:-1]
validation_decoder_input[:, 0] = START_CHAR_CODE
validation_decoder_output = np.eye(output_dict_size)[encoded_validation_output.astype('int')]

## Training the model

In [15]:
if os.path.isfile('model.h5'):
    model = load_model('model.h5')
else:
    model.fit(x=[training_encoder_input, training_decoder_input], y=[training_decoder_output],
          validation_data=([validation_encoder_input, validation_decoder_input], [validation_decoder_output]),
          verbose=2,
          batch_size=64,
          epochs=60)
    
model.save('model.h5')

Train on 42904 samples, validate on 10726 samples
Epoch 1/60
105s - loss: 0.0483 - val_loss: 0.0441
Epoch 2/60
99s - loss: 0.0422 - val_loss: 0.0410
Epoch 3/60
100s - loss: 0.0399 - val_loss: 0.0396
Epoch 4/60
102s - loss: 0.0385 - val_loss: 0.0380
Epoch 5/60
100s - loss: 0.0369 - val_loss: 0.0364
Epoch 6/60
97s - loss: 0.0352 - val_loss: 0.0347
Epoch 7/60
98s - loss: 0.0335 - val_loss: 0.0331
Epoch 8/60
98s - loss: 0.0319 - val_loss: 0.0316
Epoch 9/60
104s - loss: 0.0304 - val_loss: 0.0300
Epoch 10/60
100s - loss: 0.0289 - val_loss: 0.0288
Epoch 11/60
99s - loss: 0.0278 - val_loss: 0.0277
Epoch 12/60
99s - loss: 0.0268 - val_loss: 0.0269
Epoch 13/60
102s - loss: 0.0259 - val_loss: 0.0263
Epoch 14/60
104s - loss: 0.0252 - val_loss: 0.0257
Epoch 15/60
103s - loss: 0.0246 - val_loss: 0.0251
Epoch 16/60
101s - loss: 0.0240 - val_loss: 0.0247
Epoch 17/60
98s - loss: 0.0236 - val_loss: 0.0242
Epoch 18/60
98s - loss: 0.0231 - val_loss: 0.0238
Epoch 19/60
99s - loss: 0.0227 - val_loss: 0.0236

### Testing the model

During the testing or after deploy the model, to generate the output we will use "greedy" generating approach, which is generating one output at a time by maximize softmax score and feed the output back as the next decoder input character. 

We won't use [beam-search decoding](https://www.quora.com/Why-is-beam-search-required-in-sequence-to-sequence-transduction-using-recurrent-neural-networks)

In [54]:
def generate(text):
    encoder_input = transform(input_encoding, [text.lower()], 20)
    decoder_input = np.zeros(shape=(len(encoder_input), OUTPUT_LENGTH))
    decoder_input[:,0] = START_CHAR_CODE
    for i in range(1, OUTPUT_LENGTH):
        output = model.predict([encoder_input, decoder_input]).argmax(axis=2)
        decoder_input[:,i] = output[:,i]
    return decoder_input[:,1:]

def decode(decoding, sequence):
    text = ''
    for i in sequence:
        if i == 0:
            break
        text += output_decoding[i]
    return text

def to_katakana(text):
    decoder_output = generate(text)
    return decode(output_decoding, decoder_output[0])

If the model is trained correctly, typical names should be translate correctly.

In [44]:
common_american_names = ['James', 'John', 'Robert', 'Mary', 'Patricia', 'Linda']
for name in common_american_names:
    print(name, to_katakana(name))

James ジェームズ
John ジョン
Robert ロベルト
Mary マリー
Patricia パトリシア
Linda リンダ


Because we train the model with mostly people and places names, some English words may not be written correctly.

In [55]:
print(to_katakana('computer'))
print(to_katakana('taxi'))

コンプーター
タクシ
