# Neural Translation with Attentional seq2seq  

지난 실습에서는 seq2seq를 사용하여 날짜언어->날짜포맷 으로의 번역을 시도해 보았습니다.  
이번 실습에서는 동일한 task를 attention 개념이 포함된 seq2seq 모델을 이용해 다시 처리해 보겠습니다.

(참고)  
https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html  
https://medium.com/datalogue/attention-in-keras-1892773a4f22  
https://medium.com/@jbetker/implementing-seq2seq-with-attention-in-keras-63565c8e498c  
https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html  
https://github.com/neonbjb/ml-notebooks/blob/master/keras-seq2seq-with-attention/keras_translate_notebook.ipynb  
https://wanasit.github.io/attention-based-sequence-to-sequence-in-keras.html

In [47]:
import os
import json
import pandas as pd
import numpy as np
import random
import unicodedata
import re
import time
import shutil
from collections import Counter

# Start by importing all the things we'll need.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Layer, Dot, Concatenate, Input, Activation, LSTM, Dense, Embedding, CuDNNLSTM, Flatten, TimeDistributed, Dropout, LSTMCell, RNN, Bidirectional
from keras.layers.recurrent import Recurrent
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.python.keras.utils import tf_utils
from tensorflow.keras import backend as K

# This enables the Jupyter backend on some matplotlib installations.
%matplotlib notebook
import matplotlib.pyplot as plt

In [17]:
random.seed(1984)

## Load Dataset

In [18]:
# csv 포맷의 데이터 경로를 지정합니다.
train_dataset_filepath = 'datasets/nmt_date/nmt_date_train.csv'
test_dataset_filepath = 'datasets/nmt_date/nmt_date_test.csv'
df = pd.read_csv(train_dataset_filepath, header=None, names=['X', 'Y'])
x_corpus = df.iloc[:,0] 
y_corpus = df.iloc[:,1] 
x_corpus_list = x_corpus.values.tolist()
y_corpus_list = y_corpus.values.tolist()
x_char_list = np.concatenate([list(tuple(x)) for x in x_corpus_list], axis=0)
y_char_list = np.concatenate([list(tuple(y)) for y in y_corpus_list], axis=0)

## Build Vocabulary

In [19]:
counter_x = Counter(x_char_list)
counter_y = Counter(y_char_list)
x_vocab = ['PAD', 'BOS', 'EOS', 'UNK']
x_vocab = x_vocab + list(Counter(dict(counter_x.most_common())))
y_vocab = ['PAD', 'BOS', 'EOS', 'UNK']
y_vocab = y_vocab + list(Counter(dict(counter_y.most_common())))
idx2char_x = dict(enumerate(x_vocab))
char2idx_x = {char:index for index, char in enumerate(x_vocab)}
idx2char_y = dict(enumerate(y_vocab))
char2idx_y = {char:index for index, char in enumerate(y_vocab)}

## Prepare Train Dataset

In [20]:
def convert_sentence_to_indexed_corpus(corpus, char2idx):
    indexed_corpus = [char2idx['BOS']]
    indexed_corpus = indexed_corpus + [char2idx[char] if char in char2idx else char2idx['UNK'] for char in tuple(corpus)]
    indexed_corpus = indexed_corpus + [char2idx_x['EOS']]
    return indexed_corpus

In [21]:
indexed_x_corpus_list = []
for doc in x_corpus_list:
    indexed_x_corpus_list.append(convert_sentence_to_indexed_corpus(doc, char2idx_x))

In [22]:
indexed_y_corpus_list = []
for doc in y_corpus_list:
    indexed_y_corpus_list.append(convert_sentence_to_indexed_corpus(doc, char2idx_y))

In [23]:
max_x_corpus_length = max([len(doc) for doc in indexed_x_corpus_list])
max_y_corpus_length = max([len(doc) for doc in indexed_y_corpus_list])

In [24]:
input_data = tf.keras.preprocessing.sequence.pad_sequences(indexed_x_corpus_list, maxlen=max_x_corpus_length, padding="post")
output_data = tf.keras.preprocessing.sequence.pad_sequences(indexed_y_corpus_list, maxlen=max_y_corpus_length, padding="post")
teacher_data = output_data

target_data = [[teacher_data[n][i+1] for i in range(len(teacher_data[n])-1)] for n in range(len(teacher_data))]
target_data = tf.keras.preprocessing.sequence.pad_sequences(target_data, maxlen=max_y_corpus_length, padding="post")
target_data = target_data.reshape((target_data.shape[0], target_data.shape[1], 1))

print(input_data.shape)
print(teacher_data.shape)
print(target_data.shape)

(500000, 68)
(500000, 12)
(500000, 12, 1)


In [25]:
BUFFER_SIZE = len(x_corpus_list)
BATCH_SIZE = 32
embedding_dim = 16
units = 32
x_vocab_size = len(idx2char_y)
y_vocab_size = len(idx2char_y)
len_input = max_x_corpus_length
len_target = max_y_corpus_length

## Build Attentional seq2seq model

In [30]:
# Create the Encoder layers first.
encoder_inputs = Input(shape=(len_input,))
encoder_emb = Embedding(input_dim=x_vocab_size, output_dim=embedding_dim)
# encoder_lstm = CuDNNLSTM(units=units, return_sequences=True, return_state=True)
encoder_lstm = Bidirectional(CuDNNLSTM(units=units, return_sequences=True), merge_mode='concat')

encoder_outputs = encoder_lstm(encoder_emb(encoder_inputs))
# encoder_states = [state_h, state_c]

In [45]:
# Now create the Decoder layers.
decoder_inputs = Input(shape=(None,))
decoder_emb = Embedding(input_dim=y_vocab_size, output_dim=embedding_dim)
decoder_lstm = CuDNNLSTM(units=units*2, return_sequences=True, return_state=True)
decoder_lstm_out, _, _ = decoder_lstm(decoder_emb(decoder_inputs))#, initial_state=encoder_states)
decoder_lstm_out.shape

TensorShape([Dimension(None), Dimension(None), Dimension(64)])

### Context and Attention

In [50]:
# attention = K.dot([decoder_lstm_out, encoder_outputs], axes=[2, 2])
attention = Dot(axes=[2, 2])([decoder_lstm_out, encoder_outputs])
attention = Activation('softmax')(attention)

# context = K.dot([attention, encoder_outputs], axes=[2,1])
context = Dot(axes=[2, 1])([attention, encoder_outputs])
# context.shape
decoder_combined_context = Concatenate(axis=2)([context, decoder_lstm_out])
decoder_combined_context.shape

TensorShape([Dimension(None), Dimension(None), Dimension(128)])

In [51]:
output = TimeDistributed(Dense(units, activation="tanh"))(decoder_combined_context) # equation (5) of the paper
output = TimeDistributed(Dense(y_vocab_size, activation="softmax"))(output) # equation (6) of the paper

In [56]:
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=[output])
model.summary()
model.compile(optimizer=tf.train.AdamOptimizer(), loss="sparse_categorical_crossentropy", metrics=['sparse_categorical_accuracy'])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            (None, 68)           0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, None, 16)     240         input_9[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 68, 16)       240         input_5[0][0]                    
__________________________________________________________________________________________________
cu_dnnlstm

In [57]:
epochs = 1
history = model.fit([input_data, teacher_data], target_data,
                 batch_size=BATCH_SIZE,
                 epochs=epochs,
                 validation_split=0.2)

Train on 400000 samples, validate on 100000 samples
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


## Inference

In [96]:
def sentence_to_vector(sentence):
    pre = convert_sentence_to_indexed_corpus(sentence, char2idx_x)
    vec = np.zeros(len_input)
    for i,w in enumerate(pre):
        vec[i] = w
    return vec

def generate(input_sentence):
    encoder_input = sentence_to_vector(input_sentence)
    # Reshape so we can use the encoder model. New shape=[samples,sequence length]
    encoder_input = encoder_input.reshape(1,len(encoder_input))

    decoder_input = np.zeros(shape=(len(encoder_input), len_target))
    decoder_input[:,0] = char2idx_y['BOS']
    for i in range(1, len_target):
        output = model.predict([encoder_input, decoder_input]).argmax(axis=2)
        decoder_input[:,i] = output[:,i]
        if output[:,i] == char2idx_y['EOS']:
            break
    return decoder_input[:,1:]

def decode(idx2char, sequence):
    text = ""
    for i in sequence:
        if i == char2idx_y['PAD'] or i == char2idx_y['EOS']:
            break
        text += idx2char[i]
    return text

def translate(input_sentence):
    decoder_output = generate(input_sentence)
    print(decoder_output)
    return decode(idx2char_y, decoder_output[0])

In [97]:
translate("June 11th, 2019")

[[7. 6. 8. 4. 5. 4. 6. 2. 0. 0. 0.]]


'219-0-1'