In [2]:
!python --version
#https://wdprogrammer.tistory.com/37

Python 3.6.5 :: Anaconda, Inc.


In [3]:
import tensorflow as tf
tf.__version__

'2.0.0'

In [4]:
import numpy as np

In [5]:
with open('fra-eng/fra.txt','r',encoding='utf-8') as f:
    lines = f.read().split('\n')

input_texts = []
target_texts = []
target_text = ""

for line in lines[:3000]:
    input_text, target_text = line.split('\t')[:2]
    input_texts.append(input_text)
    target_text = '\t' + target_text + '\n'
    target_texts.append(target_text)

모델에 넣기 전 One-hot Vectorization

In [6]:
latent_dim = 256
input_characters = set()
target_characters = set()
for input_text, target_text in zip(input_texts, target_texts):
    for ch in input_text:
        if ch not in input_characters:
            input_characters.add(ch)
    for ch in target_text:
        if ch not in target_characters:
            target_characters.add(ch)

input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

input_token_index = { char: id for id, char in enumerate(input_characters)}
target_token_index = { char: id for id, char in enumerate(target_characters)}

encoder_input_data = np.zeros(shape=(len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype='float32')
decoder_input_data = np.zeros(shape=(len(target_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32')
decoder_target_data = np.zeros(shape=(len(target_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for j, ch in enumerate(input_text):
        encoder_input_data[i, j, input_token_index[ch]] = 1.
    for j, ch in enumerate(target_text):
        decoder_input_data[i, j, target_token_index[ch]] = 1.
        if j > 0:
            decoder_target_data[i, j-1, target_token_index[ch]] = 1.

모델 구축

In [7]:
from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense, BatchNormalization

# a part of encoder
encoder_inputs = Input(shape=(None, num_encoder_tokens), name='encoder_input')
encoder = LSTM(latent_dim, return_sequences=True, return_state=True, name='encoder')
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

# a part of decoder
decoder_inputs = Input(shape=(None, num_decoder_tokens), name='decoder_input')
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder')
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
batchNorm = BatchNormalization() # 학습이 조금 더 빠르게 되게하기 위해 Dense layer 전에 Batch Normalization을 추가.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(batchNorm(decoder_outputs))

# a model to train
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

Using TensorFlow backend.


모델 학습

In [8]:
#[OMP: Error #15: Initializing libiomp5.dylib]에러 해결용
import os
os.environ['KMP_DUPLICATE_LIB_OK']='TRUE'

In [9]:
batch_size=64
epochs=10
optimizer='rmsprop'
loss='categorical_crossentropy'
load_model_path=None
save_model_path='s2s.h5'

if not load_model_path == None:
    load_model(load_model_path)
model.compile(optimizer, loss)
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, 
          batch_size = batch_size,
          epochs=epochs,
          validation_split=0.2)
model.save(save_model_path)

Train on 2400 samples, validate on 600 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


모델 사용

In [11]:
# encoder model to decode
encoder_model = Model(encoder_inputs, encoder_states)

# decoder model to decode
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [26]:
def decode_sequence(input_seq):
    # Encode the input as state vectors. [state_h, state_c]
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['\t']] = 1.

    # 점점 디코드된 문자열을 추가해나감.
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
            len(decoded_sentence) > max_decoder_seq_length):
                stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]
         
    return decoded_sentence

In [27]:
def translate(input_seq):
    reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())
    # 입력을 모델에 주입할 수 있도록 벡터화한다.
    input_vec = np.zeros(shape=(len(input_seq), max_encoder_seq_length,                                        num_encoder_tokens), dtype='float32')
    for i, txt in enumerate(input_seq):
        for j, ch in enumerate(txt):
            input_vec[i, j, input_token_index[ch]] = 1.

    decoded_sequences = []
    for seq_idx in range(input_vec.shape[0]):
        decoded_sequences.append(decode_sequence(input_vec[seq_idx: seq_idx+1]))

    return decoded_sequences

In [28]:
print(lines[:15])

['Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)', 'Hi.\tSalut !\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)', 'Hi.\tSalut.\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)', 'Run!\tCours\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906331 (sacredceltic)', 'Run!\tCourez\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906332 (sacredceltic)', 'Who?\tQui ?\tCC-BY 2.0 (France) Attribution: tatoeba.org #2083030 (CK) & #4366796 (gillux)', 'Wow!\tÇa alors\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #374631 (zmoo)', 'Fire!\tAu feu !\tCC-BY 2.0 (France) Attribution: tatoeba.org #1829639 (Spamster) & #4627939 (sacredceltic)', "Help!\tÀ l'aide\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #435084 (lukaszpp) & #128430 (sysko)", 'Jump.\tSaute.\tCC-BY 2.0 (France) Attribution: tatoeba.org #631038 (Shishir) & #241

In [29]:
print(translate('Hi.'))

NameError: name 'reverse_target_char_index' is not defined