## 딥러닝 학습
Seq2Seq 선택

+ RNN의 경우 문장이 길어질수록 앞의 정보가 뒤로 충분히 전달되지 못하기 때문에, 최대 길이가 100글자가 넘는 문장을 처리하기에 부적합
+ RNN기반인 LSTM, GRU 또한 이런 장기의존성 문제를 갖고 있음  
    => 인코더-디코더를 이용해 장기의존성 문제를 해결한 Attention 모델 활용  
    => Seq2Seq는 2014년 발표되어 챗봇과 기계번역에 많이 쓰이는 모델로,  
        입력 시퀀스와 출력 시퀀스를 각각 입력 문장과 번역 문장으로 만들면 번역기를 만들 수 있을 것으로 예상

In [1]:
#!pip install keras
#!pip install tensorflow
#!pip install sklearn

In [2]:
import pandas as pd
import numpy as np
import glob, os, re, jieba
import time
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras.models import Model, save_model, load_model
from keras.layers import Input, LSTM, Dense, Flatten
from keras.callbacks import EarlyStopping

In [3]:
# tensorflow GPU 사용방법 참고)https://github.com/tensorflow/docs-l10n/blob/master/site/ko/guide/gpu.ipynb 
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    tf.config.experimental.set_memory_growth(gpus[0], True)
    print('GPU running')
  except RuntimeError as e:
    print('Runtime Error')

## 토큰화

In [4]:
def tokenize(language):
    lang_tokenizer = Tokenizer(filters=' ')
    lang_tokenizer.fit_on_texts(language)
    tensor = lang_tokenizer.texts_to_sequences(language)
    tensor = pad_sequences(tensor, padding='post')
    return tensor, lang_tokenizer

In [5]:
#1_preprocessing.ipynb에서 만들었던 함수 활용
def preprocess_ch(w):
    w = ' '.join(jieba.cut(w, cut_all=False))   
    w = w.rstrip().strip()
    w = '<start> ' + w + ' <end>'
    return w

#한국어 전처리
def preprocess_kr(w):
    w = w.strip()
    w = re.sub(r"([?.'!,¿\"])", r" \1 ", w)
    w = re.sub(r'[ |ㄱ-ㅎ|ㅏ-ㅣ]+', " ", w)
    w = w.strip()
    w = '<start> ' + w + ' <end>'
    return w  

def preprocess(path):
    files = glob.glob(os.path.join(path, '*.csv'))
    ch, ko = [], []
    
    for f in files:
        df = pd.read_csv(f)
        ch.extend(df['중국어'].values)
        ko.extend(df['한국어'].values)
    
    ch_series = pd.Series(ch)
    ko_series = pd.Series(ko)
    
    df = pd.concat([ch_series, ko_series], axis=1)
    df.columns = ['중국어', '한국어']
    
    df['중국어'] = df['중국어'].apply(preprocess_ch)
    df['한국어'] = df['한국어'].apply(preprocess_kr)
    
    return df

def tokenize_dataset(num_data):  #원래 path를 인자로 넘기려 했으나, 용량문제 해결 위해 num_data 사용
    df = pd.read_csv('train_df.csv', index_col=0)
    df = df.sample(num_data, random_state=2)
    ch_tensor, ch_tokenizer = tokenize(df['중국어'].values)
    ko_tensor, ko_tokenizer = tokenize(df['한국어'].values)
    return ch_tensor, ch_tokenizer, ko_tensor, ko_tokenizer

In [7]:
ch_tensor, ch_tokenizer, ko_tensor, ko_tokenizer = tokenize_dataset(10000)

print(ch_tensor.shape, ko_tensor.shape)

((10000, 90), (10000, 54))

In [8]:
def max_length(tensor):
    return max(len(t) for t in tensor)

max_length_ch, max_length_ko = max_length(ch_tensor), max_length(ko_tensor)
print(max_length_ch, max_length_ko)

ch_tensor_train, ch_tensor_val, ko_tensor_train, ko_tensor_val = train_test_split(ch_tensor, ko_tensor, test_size=0.2)
print(len(ch_tensor_train), len(ch_tensor_val))
print(len(ko_tensor_train), len(ko_tensor_val))

90 54
8000 2000
8000 2000


In [9]:
def convert(tokenizer, tensor):
    for t in tensor:
        if t != 0:
            print("%d ----> %s" % (t, tokenizer.index_word[t]))

In [10]:
print("Input Language; index to word mapping")
convert(ch_tokenizer, ch_tensor_train[0])
print()
print("Target Language; index to word mapping")
convert(ko_tokenizer, ko_tensor_train[0])

Input Language; index to word mapping
3 ----> <start>
1850 ----> 反而
11 ----> 是
3544 ----> 超出
470 ----> 必要
385 ----> 范围
1 ----> 的
9108 ----> 多巴胺
557 ----> 引起
1 ----> 的
1518 ----> 不安
10 ----> 和
2947 ----> 动摇
4934 ----> 只会
276 ----> 使
3553 ----> 能量
861 ----> 迅速
5877 ----> 燃烧
5 ----> 。
4 ----> <end>

Target Language; index to word mapping
2 ----> <start>
1073 ----> 오히려
2170 ----> 필요
293 ----> 이상의
57327 ----> 도파민이
57328 ----> 불러오는
57329 ----> 불안과
57330 ----> 동요는
4652 ----> 에너지를
657 ----> 빠르게
57331 ----> 연소시킬
3016 ----> 뿐이다
1 ----> .
3 ----> <end>


In [11]:
BUFFER_SIZE = len(ch_tensor_train)
BATCH_SIZE = 2
steps_per_epoch = len(ch_tensor_train) // BATCH_SIZE
embedding_size = 256
units = 1024
vocab_input_size = len(ch_tokenizer.word_index)   #중국어 토큰 개수(47066)
vocab_target_size = len(ko_tokenizer.word_index)  #한국어 토큰 개수(274429)

In [12]:
dataset = tf.data.Dataset.from_tensor_slices(
    (ch_tensor_train, ko_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [13]:
example_input_batch, example_targ_batch = next(iter(dataset))
print(example_input_batch.shape, example_targ_batch.shape)

(2, 90) (2, 54)


### 인코더 모델 생성
+ input(중국어) => [batch_size, max_length_inp]
+ 임베딩 층 => [batch_size,max_length_inp, embedding_dim]
+ GRU 층 => output(한국어)[batch_size, max_length_inp, enc_units],  
                히든레이어[batch_size, enc_units]

In [14]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.enc_units))

In [15]:
encoder = Encoder(vocab_input_size, embedding_size, units, BATCH_SIZE)
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)

print(f'Encoder output (batch size, sequence length, units) = {sample_output.shape}')   
print(f'Encoder Hidden state  (batch size, units) = {sample_hidden.shape}')            

Encoder output (batch size, sequence length, units) = (2, 90, 1024)
Encoder Hidden state  (batch size, units) = (2, 1024)


### Attention 매커니즘
+ output [batch_size, max_length_inp, enc_units] => values 로 사용
+ 히든레이어 [batch_size, enc_units] => query 로 사용
+ (참고)https://hcnoh.github.io/2018-12-11-bahdanau-attention

In [16]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        hidden_with_time_axis = tf.expand_dims(query, 1)
        # [batch_size, 1, enc_units]

        score = self.V(
            tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
        # [batch_size, mex_length_inp, 1]

        attention_weights = tf.nn.softmax(score, axis=1)
        # [batch_size, mex_length_inp, 1]

        context_vector = attention_weights * values
        # [batch_size, max_length_inp, enc_units]

        context_vector = tf.reduce_sum(context_vector, axis=1)
        # [batch_size, enc_units]

        return context_vector, attention_weights

In [17]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print(f"Attention result shape: (batch size, units) {attention_result.shape}")
print(f"Attention weights shape: (batch_size, sequence_length, 1) {attention_weights.shape}")

Attention result shape: (batch size, units) (2, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (2, 90, 1)


### 디코더 모델 생성

In [18]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # 단어 하나하나 해석 진행 
        # x: [batch_size, 1],
        # hidden: [batch_size, units]
        # enc_output: [batch_size, max_length_inp, enc_units]

        context_vector, attention_weights = self.attention(hidden, enc_output)
        # [batch_size, enc_units]，[batch_size, max_length_inp, 1]

        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        # [batch_size, 1, embedding_dim+enc_units]

        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)

        return x, state, attention_weights

In [19]:
decoder = Decoder(vocab_target_size, embedding_size, units, BATCH_SIZE)
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)), sample_hidden, sample_output)

print(f'Decoder output shape: (batch_size, vocab size) {sample_decoder_output.shape}')

Decoder output shape: (batch_size, vocab size) (2, 64511)


### optimizer, 손실함수 최적화

In [20]:
optimizer = tf.keras.optimizers.Adam()
loss_objects = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                             reduction='none')


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    # 0이면 False, 그 외는 True로 처리
    # 데이터 벡터화하면서 모두 뒷부분에 0을 붙여 동일한 길이가 되도록 마스킹
    
    loss_ = loss_objects(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

### 모델 훈련

In [21]:
checkpoint_dir = '../checkpoint/ch_to_ko_attention'
# checkpoint_prefix = os.path.join(checkpoint_dir, "cpkt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, 
                                 encoder=encoder,
                                 decoder=decoder)

manager = tf.train.CheckpointManager(checkpoint, directory=checkpoint_dir,
                                     checkpoint_name='model.ckpt',
                                     max_to_keep=3)

In [22]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([ko_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden,
                                                 enc_output)
            loss += loss_function(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(grads_and_vars=zip(gradients, variables))
    return batch_loss

In [24]:
epochs = 10

for epoch in range(epochs):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print("Epoch {} Batch {} Loss {:4f}".format(
                epoch + 1, batch, batch_loss.numpy()))

        if (epoch + 1) % 2 == 0:
            # checkpoint.save(file_prefix=checkpoint_prefix)

            manager.save()

        print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                            total_loss / steps_per_epoch))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
        
# ResourceExhaustedError: Graph execution error 발생 => https://www.pythonfixing.com/2022/05/fixed-tensorflow-invalidargumenterror.html

Epoch 1 Batch 0 Loss 3.999214
Epoch 1 Loss 0.0010
Time taken for 1 epoch 64.46890091896057 sec

Epoch 1 Loss 0.0019
Time taken for 1 epoch 78.56889963150024 sec

Epoch 1 Loss 0.0027
Time taken for 1 epoch 92.1432614326477 sec

Epoch 1 Loss 0.0038
Time taken for 1 epoch 106.45506596565247 sec

Epoch 1 Loss 0.0046
Time taken for 1 epoch 119.64306831359863 sec

Epoch 1 Loss 0.0058
Time taken for 1 epoch 135.2016031742096 sec

Epoch 1 Loss 0.0065
Time taken for 1 epoch 148.17160320281982 sec

Epoch 1 Loss 0.0072
Time taken for 1 epoch 162.9225480556488 sec

Epoch 1 Loss 0.0081
Time taken for 1 epoch 177.35154843330383 sec

Epoch 1 Loss 0.0089
Time taken for 1 epoch 193.4756829738617 sec

Epoch 1 Loss 0.0099
Time taken for 1 epoch 208.7083432674408 sec

Epoch 1 Loss 0.0110
Time taken for 1 epoch 224.88794326782227 sec

Epoch 1 Loss 0.0117
Time taken for 1 epoch 237.90194272994995 sec

Epoch 1 Loss 0.0125
Time taken for 1 epoch 251.80194282531738 sec

Epoch 1 Loss 0.0131
Time taken for 1 epo

In [None]:
def evaluate(sentence):
    attention_plot = np.zeros((max_length_ko, max_length_ch))

    sentence = preprocess_ch(sentence)
    inputs = [ch_tokenizer.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_ch, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([ko_tokenizer.word_index['<start>']], 0)
    for t in range(max_length_ko):
        predictions, dec_hidden, attention_weights = decoder(
            dec_input, dec_hidden, enc_out)
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += ko_tokenizer.index_word[predicted_id] + ' '

        if ko_tokenizer.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot

        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [None]:
def plot_attention(attention, sentence, predicted_sentence):
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    fontdict = {'fontsize': 14}
    ax.set_xticklabels([''] + sentence, fontdict=fontdict)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [None]:
def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)

    print("Input : %s" % (sentence))
    print("Predicted translation : {}".format(result))
    attention_plot = attention_plot[:len(result.split(' ')
                                         ), :len(sentence.split(' '))]
    plot_attention(attention_plot, sentence.split(' '), result.split(' '))


checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

translate(u'我迷失了')

In [None]:
def tokenize(train_df):
    #글자단위 토큰화
    ch_vocab, ko_vocab = set(), set()

    for line in train_df['중국어']:
        for c in line:
            ch_vocab.add(c)

    for line in train_df['한국어']:
        for c in line:
            ko_vocab.add(c)
            
    ch_vocab_size = len(ch_vocab) + 1  #94
    ko_vocab_size = len(ko_vocab) + 1  #4837
    
    #set -> list(데이터 변경 용이한 자료구조로 변환)
    ch_vocab = sorted(list(ch_vocab))
    ko_vocab = sorted(list(ko_vocab))
    
    ch_to_index = dict([(c, i+1) for i, c in enumerate(ch_vocab)])
    ko_to_index = dict([(c, i+1) for i, c in enumerate(ko_vocab)])
    
    #중국어 문장 인코딩
    encoder_input = []
    for li in train_df['중국어']:
        t = []
        for c in li:
            t.append(ch_to_index[c])
        encoder_input.append(t)
        
    #한국어 문장 인코딩
    decoder_input = []
    for li in train_df['한국어']:
        t = []
        for c in li:
            t.append(ko_to_index[c])
        decoder_input.append(t)   
        
    #번역되어 나올 한국어 문장 인코딩에서 '\t' 제거
    decoder_ko = []
    for li in train_df['한국어']:
        t = []
        i = 0
        for c in li:
            if i > 0:
                t.append(ko_to_index[c])
            i += 1
        decoder_ko.append(t)    
     
    #패딩
    max_len_ch = 1689
    max_len_ko = 373
    
    #문장 -> int -> padding
    encoder_input = pad_sequences(encoder_input, maxlen=max_len_ch, padding='post')
    decoder_input = pad_sequences(decoder_input, maxlen=max_len_ko, padding='post')
    decoder_ko = pad_sequences(decoder_ko, maxlen=max_len_ko, padding='post') 
    
    #문장들을 3차원 배열로 변환 : (encoder_input, decoder_input, decoder_target)
    #encoder_input은 (문장 개수, 문장 최대 길이, 문자 종류 수) 형태의 3차원 배열로 중국어 문장의 one-hot 형식 벡터 데이터
    #decoder_input은 (문장 개수, 문장 최대 길이, 문자 종류 수) 형태의 3차원 배열로 한국어 문장의 one-hot 형식 벡터 데이터
    #decoder_ko은 decoder_input과 같지만, 하나의 time step만큼 offset, 즉, decoder_target[:, t, :] = decoder_input[:, t+1, :]
    encoder_input = np_utils.to_categorical(encoder_input)
    decoder_input = np_utils.to_categorical(decoder_input)
    decoder_ko = np_utils.to_categorical(decoder_ko)
    
    return encoder_input, decoder_input, decoder_ko, ch_vocab_size, ko_vocab_size, index_to_ch, index_to_ko

#### 데이터가 너무 많으면 한번에 토큰화할 수 없기 때문에,
#### 데이터를 4000개씩 나누어 토큰화 > 모델학습 > 저장 > 전이학습 반복

In [None]:
df = train_df[:4000]
    
encoder_input, decoder_input, decoder_ko, ch_vocab_size, ko_vocab_size, index_to_ch, index_to_ko = tokenize(df)

#중국어 인코딩
tmp_dict = dict((i,c ) for c , i in index_to_ch.items()) 

for i in tmp_dict:
    try:
        tmp_dict[i] = tmp_dict[i].encode('EUC_CN')
    except:
        pass
    
index_to_ch = dict((i,c ) for c , i in tmp_dict.items()) 

In [None]:
# 트레이닝 시 이전 상태의 실제값을 현재상태의 디코더 입력으로 해야함(예측값으로 하면 안됨)
encoder_inputs = Input(shape=(None, ch_vocab_size), name='encoder_input')
decoder_inputs = Input(shape=(None, ko_vocab_size ), name='decoder_input')

# 인코더 LSTM 셀
encoderLSTM = LSTM(units=256, return_state=True, name='encoderLSTM')    #return_state :인코더의 마지막 상태 정보를 디코더의 입력 상태 정보로 전달
decoderLSTM = LSTM(units=256, return_sequences=True, return_state=True, name='decoderLSTM')

# 인코더 LSTM셀의 입력 정의
encoder_outputs, stateH, stateC = encoderLSTM(encoder_inputs) # _, 히든상태(위), 셀상태(오른쪽)
encoder_state = [stateH, stateC] # 컨텍스트 벡터

decoder_output, _, _ = decoderLSTM(decoder_inputs, initial_state=encoder_state)
decoder_softmax = Dense(ko_vocab_size, activation="softmax")
decoder_output = decoder_softmax(decoder_output)

model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_output)

model.summary()

In [None]:
model.compile(optimizer="adam", loss="categorical_crossentropy")

early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model.fit(x=[encoder_input,decoder_input], y=decoder_ko, batch_size=64, epochs=50, callbacks=early_stopping)
save_model(model, 'ch_to_ko.h5', overwrite=True)

In [None]:
for i in range(1,len(train_df) // 2500):
    df = train_df[i*(2500):(i+1)*2500]
    
    encoder_input, decoder_input, decoder_ko, ch_vocab_size, ko_vocab_size = tokenize(df)

    model = load_model('ch_to_ko.h5')

    early_stopping = EarlyStopping(monitor='val_loss', patience=5)
    model.fit(x=[encoder_input, decoder_input], y=decoder_ko, batch_size=64, epochs=3, callbacks=early_stopping)
    save_model(model, 'ch_to_ko.h5', overwrite=True)

In [None]:
encoder_model = Model(inputs = encoder_inputs, outputs = encoder_state)

ch_to_index = dict((i,c ) for c , i in index_to_ch.items()) 
ko_to_index = dict((i,c ) for c , i in index_to_ko.items()) 

In [None]:
encoder_model.summary()

## 기본 LSTM 기반의 seq2seq 모델을 이용해 decoder_ko 예측

In [None]:
# 디코더
decoder_state_input_hidden = Input(shape=(256,))
decoder_state_input_cell = Input(shape=(256,))
decoder_state_input = [decoder_state_input_hidden, decoder_state_input_cell]

decoder_output, state_hidden, state_cell = decoderLSTM(decoder_inputs, initial_state = decoder_state_input)
decoder_state = [state_hidden, state_cell]
decoder_outputs = decoder_softmax(decoder_output)

decoder_model = Model(inputs=[decoder_inputs]+decoder_state_input, outputs=[decoder_output]+decoder_state)

In [None]:
decoder_model.summary()

In [None]:
def decode_seq(input_seq): 
    
    state_value = encoder_model.predict(input_seq)
    print('encoder_model의 예상 state_value :',np.shape(state_value))
    
    target_seq = np.zeros((1,1,ko_vocab_size))   #(1, 1, 1134)
    target_seq[0,0,ko_to_index['\t']] = 1      # 원핫인코딩
    
    stop = False
    decoded_sent=""
    while not stop: # "\n"문자를 만날때까지 반복
        
        output, h, c = decoder_model.predict([target_seq]+state_value)
        # 예측값을 한국어 문자로 변환
        token_index = np.argmax(output[0,-1,:]) 
        pred_char = index_to_ko[token_index]
        
        # 현시점 예측문자가 예측문장에 추가
        decoded_sent += pred_char
        
        if (pred_char == "\n" or len(decoded_sent) > 373):
            stop = True
            
        # 현시점 예측결과가 다음 시점에 입력으로 
        target_seq = np.zeros((1,1,ko_vocab_size))
        target_seq[0,0,token_index] = 1
        
        # 현시점 상태를 다음 시점 상태로 사용
        state_value = [h,c]
    
    return decoded_sent # 번역결과

In [None]:
for seq_index in [1,50,100,200,300]:
    
    input_seq = encoder_input[seq_index:seq_index+1]    # (1, 117, 2326)
    decoded_seq = decode_seq(input_seq)
    
    print("입력문장:", train_df['중국어'][seq_index])
    print("정답:", train_df['한국어'][seq_index][1:len(train_df['한국어'][seq_index])-1])   # "\t", "\n" 제거
    print("번역기:", decoded_seq[:len(decoded_seq)-1])
    print("\n")

## 모델이 잘 작동하는지 확인하기 위해 일부 문장 디코딩
    -encoder_input을 샘플링해 decoder_target으로 변환해본다.