In [1]:
from __future__ import print_function

import sys
import os
import pandas as pd
import numpy as np

import torch
from torch.nn import Module, Embedding, LSTM, Linear, CrossEntropyLoss, NLLLoss
from torch.optim import Adam
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

sys.path.append(os.path.abspath(os.path.join('..')))
import katakana.encoding as encoding

# Data Pre-Processing

We use the joined titles dataset, the same one used in the [TensorFlow version](./Writing%20Katakana%20using%20Sequence-to-Sequence%20in%20TensorFlow.ipynb) of this notebook.

In [2]:
# Load the title pairs (no header row) and shuffle them with a fixed seed so
# that the train/validation split below is the same on every run.
data = pd.read_csv('../data/joined_titles.csv', header=None)
data = data.sample(frac=1, random_state=0)

# Column 0 becomes the model input and column 1 the expected output;
# both are lower-cased before encoding.
data_input = [s.lower() for s in data[0]]
data_output = [s.lower() for s in data[1]]
print(data_input[0:3])
print(data_output[0:3])

data_size = len(data)
training_split_index = int(data_size*60/100)
validation_split_index = int(data_size*70/100)

# Rows in the 0-60th percentile (60% of the shuffled data) are the training set.
training_input  = data_input[:training_split_index]
training_output = data_output[:training_split_index]

# Rows in the 60-70th percentile (the next 10%) are the validation set.
# The remaining 30% is not referenced by the cells shown in this notebook.
validation_input = data_input[training_split_index:validation_split_index]
validation_output = data_output[training_split_index:validation_split_index]

print('training size', len(training_input))
print('validation size', len(validation_input))

['dorogobuzh', 'gail hopkins', 'novatek']
['ドロゴブージ', 'ゲイル・ホプキンス', 'ノヴァテク']
training size 64356
validation size 10726


We will also reuse the data encoding and transform functions already written in `katakana/encoding.py`. (See [Writing Katakana using Sequence-to-Sequence in TensorFlow](./Writing%20Katakana%20using%20Sequence-to-Sequence%20in%20TensorFlow.ipynb).)

In [3]:
# Build one character vocabulary per language. The vocabularies are built from
# the full dataset (data_input / data_output), not only from the training rows.
# Each call is unpacked into (encoding, decoding, dict_size).
# NOTE(review): the exact types of the encoding/decoding objects are defined in
# katakana/encoding.py, which is not shown here.
english_encoding, english_decoding, english_dict_size = encoding.build_characters_encoding(data_input)
japanese_encoding, japanese_decoding, japanese_dict_size = encoding.build_characters_encoding(data_output)

print('English character dict size:', english_dict_size)
print('Katakana character dict size:', japanese_dict_size)

English character dict size: 54
Katakana character dict size: 89


In [4]:
# Fixed number of columns of every encoded input / output row.
INPUT_LENGTH = 20
OUTPUT_LENGTH = 20

# Turn the training strings into integer arrays with one row per string and
# `vector_size` columns (see the printed shapes).
# NOTE(review): how encoding.transform treats strings longer than vector_size
# is decided in katakana/encoding.py - confirm there.
encoded_training_input = encoding.transform(
    english_encoding, training_input, vector_size=INPUT_LENGTH)
encoded_training_output = encoding.transform(
    japanese_encoding, training_output, vector_size=OUTPUT_LENGTH)

print('encoded_training_input', encoded_training_input.shape)
print('encoded_training_output', encoded_training_output.shape)

# Same encoding, applied to the validation strings.
encoded_validation_input = encoding.transform(
    english_encoding, validation_input, vector_size=INPUT_LENGTH)
encoded_validation_output = encoding.transform(
    japanese_encoding, validation_output, vector_size=OUTPUT_LENGTH)

print('encoded_validation_input', encoded_validation_input.shape)
print('encoded_validation_output', encoded_validation_output.shape)

encoded_training_input (64356, 20)
encoded_training_output (64356, 20)
encoded_validation_input (10726, 20)
encoded_validation_output (10726, 20)


In [5]:
# First encoded training input: one integer code per character; the trailing
# zeros in the printed row fill it up to INPUT_LENGTH columns.
print(encoded_training_input[0])

[13 24 12 24  7 24 18 35 47 28  0  0  0  0  0  0  0  0  0  0]


In [6]:
# First encoded training output (Katakana codes); the trailing zeros in the
# printed row fill it up to OUTPUT_LENGTH columns.
print(encoded_training_output[0])

[85 50 17 65 21 58  0  0  0  0  0  0  0  0  0  0  0  0  0  0]


# Sequence-to-Sequence in Pytorch

### Encoder

The encoder consists of [Embedding](https://pytorch.org/docs/stable/nn.html#embedding) and [LSTM](https://pytorch.org/docs/stable/nn.html#lstm). 

It first embeds each input character into a vector. Then, it feeds the embedded input into the LSTM. After all characters have been processed, we take the final LSTM output as the encoder output.

Note: we need to set `batch_first=True` to make PyTorch's LSTM take input with dimensions (batch_size, sequence_size, vector_size), similar to TensorFlow's LSTM.

In [7]:
class Encoder(Module):
    """Character-level encoder: an Embedding followed by a single-layer LSTM.

    Args:
        input_dict_size: number of distinct input character codes, i.e. the
            number of rows of the embedding table.
        embedding_size: width of each character embedding (default 64).
        hidden_size: width of the LSTM hidden state, which is also the width
            of the vector returned by ``forward`` (default 64). A decoder that
            uses this vector as its initial LSTM state must have the same
            hidden size.
    """

    def __init__(self, input_dict_size=english_dict_size,
                 embedding_size=64, hidden_size=64):
        super(Encoder, self).__init__()
        self.embedding = Embedding(input_dict_size, embedding_size)
        # batch_first=True: inputs are (batch, sequence, feature).
        self.lstm = LSTM(embedding_size, hidden_size, batch_first=True)

    def forward(self, encoder_input_sequences):
        """Encode a batch of character-code sequences.

        Args:
            encoder_input_sequences: integer tensor of shape
                (batch, sequence_length) holding character codes.

        Returns:
            Float tensor of shape (batch, hidden_size): the LSTM output at
            the last time step of every sequence.
        """
        embedded = self.embedding(encoder_input_sequences)
        output, _ = self.lstm(embedded)
        # Only the last time step is kept. The rows are fixed-length, so for
        # short inputs this step is reached after the trailing padding codes.
        return output[:, -1]
    
# Smoke test: run an untrained encoder on the first two training rows and
# check the tensor shapes / dtypes.
encoder = Encoder()
print(encoder)

# The character codes are converted to int64 (torch.long) explicitly.
encoder_input = torch.tensor(encoded_training_input[:2], dtype=torch.long)
# `encoder_output` is reused by the decoder smoke test in a later cell.
encoder_output = encoder(encoder_input)
print('encoder_input', encoder_input.shape, encoder_input.dtype)
print('encoder_output', encoder_output.shape, encoder_output.dtype)

Encoder(
  (embedding): Embedding(54, 64)
  (lstm): LSTM(64, 64, batch_first=True)
)
encoder_input torch.Size([2, 20]) torch.int64
encoder_output torch.Size([2, 64]) torch.float32


### Decoder

The decoder consists of [Embedding](https://pytorch.org/docs/stable/nn.html#embedding), [LSTM](https://pytorch.org/docs/stable/nn.html#lstm), and [Linear](https://pytorch.org/docs/stable/nn.html#linear). 

We train decoder to output the next Katakana character in the sequence. The decoder inputs are Katakana sequences and the output from the encoder.

Similar to the encoder, the decoder embeds the input sequence and passes the embedded sequence to the LSTM. However, this time, we initialize the LSTM's state with the encoder's output. The LSTM's outputs are then passed into the linear layer to produce the final output.

Note: We don't apply Softmax activation to the final output to make it easier to apply `CrossEntropyLoss` (see "Training the model"). Applying the Softmax also won't change the result when we use the decoder to generate the output greedily (see "Testing the model").

In [8]:
class Decoder(Module):
    """Character-level decoder: Embedding -> LSTM -> Linear.

    The LSTM is started from the encoder's output vector and produces, for
    every position of the decoder input, unnormalized scores (logits) over the
    output character dictionary. No softmax is applied here.

    Args:
        output_dict_size: number of distinct output character codes; used for
            both the embedding table and the width of the produced logits.
        embedding_size: width of each character embedding (default 64).
        hidden_size: width of the LSTM hidden state (default 64). It must
            equal the width of the ``encoder_output`` passed to ``forward``.
    """

    def __init__(self, output_dict_size=japanese_dict_size,
                 embedding_size=64, hidden_size=64):
        super(Decoder, self).__init__()
        self.embedding = Embedding(output_dict_size, embedding_size)
        # batch_first=True: inputs are (batch, sequence, feature).
        self.lstm = LSTM(embedding_size, hidden_size, batch_first=True)
        self.linear = Linear(hidden_size, output_dict_size)

    def forward(self, encoder_output, decoder_input_sequence):
        """Score the next character at every position of the decoder input.

        Args:
            encoder_output: float tensor of shape (batch, hidden_size).
            decoder_input_sequence: integer tensor of shape
                (batch, sequence_length) holding output character codes.

        Returns:
            Float tensor of shape (batch, sequence_length, output_dict_size)
            containing logits.
        """
        # The LSTM expects its initial states as (num_layers, batch, hidden);
        # add the leading layer axis for this single-layer LSTM.
        initial_state = encoder_output.unsqueeze(0)

        embedded = self.embedding(decoder_input_sequence)
        # The same encoder vector seeds both the hidden state h_0 and the cell
        # state c_0. They are passed as the (h_0, c_0) tuple.
        output, _ = self.lstm(embedded, (initial_state, initial_state))
        return self.linear(output)

# Smoke test: run an untrained decoder and check the tensor shapes / dtypes.
decoder = Decoder()
print(decoder)

# The raw (unshifted) target rows are used as decoder input here; this cell
# only checks shapes, it does not train anything.
decoder_input = torch.tensor(encoded_training_output[:2], dtype=torch.long)
# Depends on `encoder_output` computed in the encoder cell above (same 2 rows).
decoder_output = decoder(encoder_output, decoder_input)
print('decoder_input', decoder_input.shape, decoder_input.dtype)
print('decoder_output', decoder_output.shape, decoder_output.dtype)

Decoder(
  (embedding): Embedding(89, 64)
  (lstm): LSTM(64, 64, batch_first=True)
  (linear): Linear(in_features=64, out_features=89, bias=True)
)
decoder_input torch.Size([2, 20]) torch.int64
decoder_output torch.Size([2, 20, 89]) torch.float32


In [9]:
class Seq2Seq(Module):
    """Full sequence-to-sequence model: an Encoder feeding a Decoder."""

    def __init__(self):
        super(Seq2Seq, self).__init__()
        # Both sub-modules are built with their default arguments.
        self.encoder = Encoder()
        self.decoder = Decoder()

    def forward(self, encoder_input_sequences, decoder_input_sequences):
        """Encode the input batch, then decode conditioned on that encoding.

        Returns whatever the decoder returns for
        (encoder result, decoder_input_sequences).
        """
        return self.decoder(
            self.encoder(encoder_input_sequences),
            decoder_input_sequences
        )

# Smoke test: push two training rows through a freshly initialised model.
model = Seq2Seq()
print(model)

encoder_input = torch.tensor(encoded_training_input[:2], dtype=torch.long)
decoder_input = torch.tensor(encoded_training_output[:2], dtype=torch.long)
model_output = model(encoder_input, decoder_input)
# Prints the raw output tensor; the model is untrained at this point, so the
# values only show that the forward pass runs end to end.
print('model_output', model_output)


Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(54, 64)
    (lstm): LSTM(64, 64, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(89, 64)
    (lstm): LSTM(64, 64, batch_first=True)
    (linear): Linear(in_features=64, out_features=89, bias=True)
  )
)
model_output tensor([[[ 0.0956,  0.0912, -0.1650,  ...,  0.1132,  0.0783, -0.0100],
         [ 0.0156,  0.1449, -0.1823,  ...,  0.0277,  0.1391,  0.0440],
         [-0.0021,  0.0035, -0.1248,  ...,  0.0535,  0.1420,  0.0121],
         ...,
         [ 0.0977,  0.0540,  0.0132,  ...,  0.1889,  0.0122,  0.1002],
         [ 0.0979,  0.0544,  0.0131,  ...,  0.1889,  0.0120,  0.1003],
         [ 0.0981,  0.0546,  0.0130,  ...,  0.1890,  0.0119,  0.1004]],

        [[-0.0080,  0.0080,  0.0017,  ...,  0.1097,  0.1302, -0.0538],
         [ 0.0537,  0.0877, -0.1111,  ...,  0.1541,  0.1805, -0.2501],
         [-0.0922,  0.1256, -0.0422,  ...,  0.0806,  0.0285, -0.1616],
         ...,
         [ 0.0981,  0.0526,  0.015

In [10]:
# Encoder input and expected decoder output are the encoded arrays as-is.
training_encoder_input = encoded_training_input
training_decoder_output = encoded_training_output

# Decoder input: the expected output shifted right by one position, with
# encoding.CHAR_CODE_START placed in column 0. The last column of the expected
# output is dropped by the shift.
training_decoder_input = np.zeros_like(encoded_training_output)
training_decoder_input[:, 1:] = encoded_training_output[:,:-1]
training_decoder_input[:, 0] = encoding.CHAR_CODE_START

print('encoder input', training_encoder_input[:1])
print('decoder input', training_decoder_input[:1])
print('(expected) decoder output', training_decoder_output[:1])

encoder input [[13 24 12 24  7 24 18 35 47 28  0  0  0  0  0  0  0  0  0  0]]
decoder input [[ 1 85 50 17 65 21 58  0  0  0  0  0  0  0  0  0  0  0  0  0]]
(expected) decoder output [[85 50 17 65 21 58  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]


In [11]:
# Same construction as for the training arrays, applied to the validation set:
# the decoder input is the expected output shifted right by one position with
# encoding.CHAR_CODE_START in column 0.
validation_encoder_input = encoded_validation_input
validation_decoder_input = np.zeros_like(encoded_validation_output)
validation_decoder_input[:, 1:] = encoded_validation_output[:,:-1]
validation_decoder_input[:, 0] = encoding.CHAR_CODE_START
validation_decoder_output = encoded_validation_output

print('encoder input', validation_encoder_input[:1])
print('decoder input', validation_decoder_input[:1])
print('(expected) decoder output', validation_decoder_output[:1])

encoder input [[26 38 16 28 24 38 34 22 19 24 12 12 24 37  0  0  0  0  0  0]]
decoder input [[ 1  7 11 19 52 21  8 64 50 21  0  0  0  0  0  0  0  0  0  0]]
(expected) decoder output [[ 7 11 19 52 21  8 64 50 21  0  0  0  0  0  0  0  0  0  0  0]]


## Training the model

In [23]:
def train_epoch(model, optimizer, 
                batch_size=64, 
                criterion=CrossEntropyLoss(),
                encoder_input=training_encoder_input,
                decoder_input=training_decoder_input,
                decoder_output=training_decoder_output):
    """Train `model` for one pass over the data and return the mean batch loss.

    Args:
        model: module called as ``model(encoder_batch, decoder_batch)`` that
            returns logits of shape (batch, sequence, dict_size).
        optimizer: optimizer over the model's parameters; stepped once per
            mini-batch.
        batch_size: number of rows per mini-batch (the last batch may be
            smaller).
        criterion: loss applied to the logits flattened to
            (batch * sequence, dict_size) and the targets flattened to
            (batch * sequence,).
        encoder_input: NumPy integer array of encoder input rows.
        decoder_input: NumPy integer array of decoder input rows.
        decoder_output: NumPy integer array of expected decoder outputs.

    Returns:
        The average of the per-batch losses (every batch weighs the same,
        including a smaller final batch).

    Raises:
        ValueError: if there are no training rows.
    """
    sample_count = len(encoder_input)
    if sample_count == 0:
        raise ValueError('train_epoch() requires at least one training row')

    # Make sure layers that behave differently in training mode are enabled.
    model.train()

    # Re-shuffle the training data. Fancy indexing returns copies, so the
    # caller's arrays keep their original order.
    permutation = np.random.permutation(sample_count)
    encoder_input = encoder_input[permutation]
    decoder_input = decoder_input[permutation]
    decoder_output = decoder_output[permutation]

    epoch_loss = 0
    iteration_count = 0

    for begin_index in range(0, sample_count, batch_size):
        end_index = begin_index + batch_size
        iteration_count += 1

        # Character codes are used as embedding indices and loss targets, so
        # convert them to int64 explicitly instead of inheriting whatever
        # integer dtype the NumPy arrays happen to have.
        encoder_input_step = torch.tensor(
            encoder_input[begin_index:end_index], dtype=torch.long)
        decoder_input_step = torch.tensor(
            decoder_input[begin_index:end_index], dtype=torch.long)
        decoder_output_step = torch.tensor(
            decoder_output[begin_index:end_index], dtype=torch.long)

        # If training on GPU...
        # model.cuda()
        # encoder_input_step = encoder_input_step.cuda()
        # decoder_input_step = decoder_input_step.cuda()
        # decoder_output_step = decoder_output_step.cuda()

        optimizer.zero_grad()

        output = model(encoder_input_step, decoder_input_step)
        # Flatten the batch and sequence axes so that every position is one
        # classification example for the criterion.
        target = decoder_output_step.view(-1)
        output = output.view(-1, output.shape[-1])
        loss = criterion(output, target)
        loss.backward()

        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / iteration_count


In [24]:
def validate(model, 
             criterion=CrossEntropyLoss(),
             encoder_input=validation_encoder_input,
             decoder_input=validation_decoder_input,
             decoder_output=validation_decoder_output):
    """Compute the loss of `model` on a validation set without training it.

    The whole set is pushed through the model in a single forward pass.

    Args:
        model: module called as ``model(encoder_batch, decoder_batch)`` that
            returns logits of shape (batch, sequence, dict_size).
        criterion: loss applied to the logits flattened to
            (batch * sequence, dict_size) and the targets flattened to
            (batch * sequence,).
        encoder_input: integer array of encoder input rows.
        decoder_input: integer array of decoder input rows.
        decoder_output: integer array of expected decoder outputs.

    Returns:
        The loss as a Python float.
    """
    # Character codes are embedding indices / loss targets: force int64.
    encoder_input = torch.tensor(encoder_input, dtype=torch.long)
    decoder_input = torch.tensor(decoder_input, dtype=torch.long)
    decoder_output = torch.tensor(decoder_output, dtype=torch.long)

    # If training on GPU...
    # model.cuda()
    # encoder_input = encoder_input.cuda()
    # decoder_input = decoder_input.cuda()
    # decoder_output = decoder_output.cuda()

    # Evaluate in eval mode, then put the model back into whatever mode the
    # caller had it in, so training can continue unaffected.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for validation; skipping autograd avoids
        # keeping the graph for the whole validation set in memory.
        with torch.no_grad():
            output = model(encoder_input, decoder_input)

            target = decoder_output.view(-1)
            output = output.view(-1, output.shape[-1])
            loss = criterion(output, target)
    finally:
        model.train(was_training)

    return loss.item()


In [36]:
def train_model(model, optimizer, n_epoch=30, validate_every_n_epoach=3):
    """Train for `n_epoch` epochs, printing the loss as training progresses.

    The training loss is printed after every epoch. The validation loss is
    printed after every `validate_every_n_epoach`-th epoch (epochs are
    numbered from 1).
    """
    for epoch_number in range(1, n_epoch + 1):
        print('Epoch %i' % epoch_number)

        training_loss = train_epoch(model, optimizer)
        print('> Training Loss', training_loss)

        # Skip validation except on every n-th epoch.
        if epoch_number % validate_every_n_epoach != 0:
            continue

        validation_loss = validate(model)
        print('> Validation Loss', validation_loss)
        
# Seed the RNGs used for weight initialisation (torch) and for the per-epoch
# shuffling in train_epoch (NumPy's global RNG), so that re-running this cell
# gives a repeatable training run.
torch.manual_seed(0)
np.random.seed(0)

# Train a fresh model with Adam at its default settings.
model = Seq2Seq()
optimizer = Adam(model.parameters())
train_model(model, optimizer, n_epoch=30)

Epoch 1
> Training Loss 1.5449128637967953
Epoch 2
> Training Loss 1.2714332525820191
Epoch 3
> Training Loss 1.1162601499031362
> Validation Loss 1.0606048107147217
Epoch 4
> Training Loss 1.008581491042795
Epoch 5
> Training Loss 0.9326041009032703
Epoch 6
> Training Loss 0.8779584057406924
> Validation Loss 0.8624294996261597
Epoch 7
> Training Loss 0.8386473264656295
Epoch 8
> Training Loss 0.8072047768601366
Epoch 9
> Training Loss 0.7812739739242652
> Validation Loss 0.7808693051338196
Epoch 10
> Training Loss 0.7585053891832264
Epoch 11
> Training Loss 0.7382411765412356
Epoch 12
> Training Loss 0.7207917072782459
> Validation Loss 0.7323684692382812
Epoch 13
> Training Loss 0.7055101474047181
Epoch 14
> Training Loss 0.6918286189407288
Epoch 15
> Training Loss 0.6797983193729317
> Validation Loss 0.6897333860397339
Epoch 16
> Training Loss 0.669035366379006
Epoch 17
> Training Loss 0.6592021589250735
Epoch 18
> Training Loss 0.6500768760148858
> Validation Loss 0.66958141326904

### Testing the model

During testing, or after deploying the model, we generate the output with a "greedy" approach: we generate one output character at a time by picking the character with the maximum softmax score, and feed it back as the next decoder input character. 

We won't use [beam-search decoding](https://www.quora.com/Why-is-beam-search-required-in-sequence-to-sequence-transduction-using-recurrent-neural-networks)

In [58]:
def generate_output(input_sequence):
    """Greedily decode a batch of encoded inputs with the global `model`.

    Starting from encoding.CHAR_CODE_START, the highest-scoring character at
    each position is written into the next decoder input position, for all
    OUTPUT_LENGTH - 1 positions (generation does not stop early).

    Args:
        input_sequence: array-like of encoded input rows, one row per sample.

    Returns:
        NumPy integer array of shape (len(input_sequence), OUTPUT_LENGTH - 1)
        with the generated character codes (the start code is stripped).
    """
    # int64 explicitly: a bare 'int' is only 32 bits wide on some platforms.
    decoder_input = np.zeros(shape=(len(input_sequence), OUTPUT_LENGTH), dtype=np.int64)
    decoder_input[:, 0] = encoding.CHAR_CODE_START

    encoder_input = torch.tensor(input_sequence, dtype=torch.long)
    decoder_input = torch.tensor(decoder_input, dtype=torch.long)

    # Moving the model to the CPU once is enough; it does not change per step.
    model.cpu()

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for i in range(1, OUTPUT_LENGTH):
            output = model(encoder_input, decoder_input)
            # The logits at position i - 1 predict the character at position i.
            decoder_input[:, i] = output[:, i - 1].argmax(dim=1)

    return decoder_input[:, 1:].numpy()

def to_katakana(text):
    """Return the model's Katakana rendering of an English string.

    The text is lower-cased, encoded to INPUT_LENGTH columns with the English
    encoding, decoded greedily by generate_output, and mapped back to
    characters with the Katakana decoding table.
    """
    # Use the shared constant rather than a literal 20, so this stays in sync
    # with the width the model was trained on.
    input_sequence = encoding.transform(english_encoding, [text.lower()], INPUT_LENGTH)
    output_sequence = generate_output(input_sequence)
    return encoding.decode(japanese_decoding, output_sequence[0])

# Show the raw generated character codes for 'test'. `generate` is not defined
# anywhere in this notebook; generate_output on the encoded text is the call
# that yields an array of OUTPUT_LENGTH - 1 codes.
generate_output(encoding.transform(english_encoding, ['test'], INPUT_LENGTH))[0]

array([53, 33, 47,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0])

If the model is trained correctly, typical names should be translated correctly.

In [59]:
# Sanity check on a handful of common first names; to_katakana lower-cases
# each name before encoding it.
common_american_names = ['James', 'John', 'Robert', 'Mary', 'Patricia', 'Linda']
for name in common_american_names:
    print(name, to_katakana(name))

James ジェームズ
John ジョン
Robert ロベルト
Mary マーリー
Patricia パトリアイシア
Linda リンダ


Because we train the model mostly on names of people and places, some English words may not be written correctly.

In [60]:
# An ordinary English noun rather than a person or place name.
print(to_katakana('computer'))

コンプター


In [61]:
# Another ordinary English noun rather than a person or place name.
print(to_katakana('taxi'))

タキシ


In [62]:
# A third ordinary English noun rather than a person or place name.
print(to_katakana('banana'))

バナーナ
