In [16]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
import pandas as pd
import numpy as np
from langdetect import detect
import re
import os
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, GRU, Embedding, Layer
from tensorflow.keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
import pydot

from src.helpme import clean_text,start_end_tagger,max_length,tokenize,preprocess,preprocess_sentence

In [18]:
df_test = pd.read_csv('data/kor.txt', sep='\t', names=['eng','kor','drop_me'])
df_test = df_test.drop(columns='drop_me')

In [19]:
df_test = df_test.drop([3317,3316])

In [20]:
df_test

Unnamed: 0,eng,kor
0,Go.,가.
1,Hi.,안녕.
2,Run!,뛰어!
3,Run.,뛰어.
4,Who?,누구?
...,...,...
3311,Why don't we just reformat the hard disk? You'...,우리 그냥 하드 디스크를 새로 포맷하는 건 어때? 너무 그걸로 스트레스 많이 받고 ...
3312,"I knew that Tom was just a freshman, so I was ...","난 톰이 그냥 신입생일 뿐이라고만 알았는데, 그러다보니 톰이랑 선배들이 서로 어울려..."
3313,Tom always cried when his sister took away his...,"톰은 누나가 자기 장난감을 빼앗아 갔을 때마다 울음을 터뜨렸고, 누나는 바로 그런 ..."
3314,Science fiction has undoubtedly been the inspi...,공상 과학 소설은 의심의 여지 없이 오늘날 존재하는 많은 기술에 영감을 주었어.


In [21]:
eng = preprocess(df_test['eng'])
kor = preprocess(df_test['kor'])

input_tensor, input_lang_tokenizer = tokenize(eng)
target_tensor, target_lang_tokenizer = tokenize(kor)

In [22]:
eng_vocab_size = len(input_lang_tokenizer.word_index)+1
kor_vocab_size = len(target_lang_tokenizer.word_index)+1

In [23]:
print(f'English vocab size: {eng_vocab_size}')
print(f'Korean vocab size: {kor_vocab_size}')

English vocab size: 2338
Korean vocab size: 5063


In [24]:
eng_max_length = len(input_tensor[0])
kor_max_length = len(target_tensor[0])

In [25]:
print(f'Longest English Sentence: {eng_max_length}')
print(f'Longest Korean Sentence: {kor_max_length}')

Longest English Sentence: 31
Longest Korean Sentence: 19


In [33]:
encoder_input = Input(shape=(None,),name='Encoder_input')
embedding_dim=100
embedded_input = Embedding(input_dim=eng_vocab_size,
                           output_dim=embedding_dim,
                           name='Embedding_layer')(encoder_input)
encoder_lstm = LSTM(units=100,
                   activation='relu',
                   return_sequences=False,
                   return_state=True,
                   name='Encoder_lstm')
encoder_out, enc_h_state, enc_c_state = encoder_lstm(embedded_input)

decoder_input = Input(shape=(None,1), name='Decoder_input')
# embedded_decoder = Embedding(kor_vocab_size,
#                             100,
#                             name='Decoder_embedded_layer')(decoder_input)
decoder_lstm = LSTM(units=100,
                   activation='relu',
                   return_sequences=True,
                   return_state=True,
                   name='Decoder_lstm')
decoder_out,_,_ = decoder_lstm(decoder_input,initial_state=[enc_h_state,enc_c_state])

final_dense = Dense(kor_vocab_size,activation='softmax',name='Final_dense_layer')
logits = final_dense(decoder_out)

model = Model([encoder_input,decoder_input],logits)

model.compile(loss='sparse_categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])

In [34]:
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Encoder_input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
Embedding_layer (Embedding)     (None, None, 100)    233800      Encoder_input[0][0]              
__________________________________________________________________________________________________
Decoder_input (InputLayer)      [(None, None, 1)]    0                                            
__________________________________________________________________________________________________
Encoder_lstm (LSTM)             [(None, 100), (None, 80400       Embedding_layer[0][0]            
____________________________________________________________________________________________

In [35]:
decoder_kor_input = target_tensor.reshape((-1,kor_max_length,1))[:,:-1,:]

In [36]:
decoder_kor_target = target_tensor.reshape((-1,kor_max_length,1))[:,1:,:]

In [37]:
model.fit([input_tensor,decoder_kor_input],decoder_kor_target,
         epochs=15,
         batch_size=100,
         validation_split=0.2)

Train on 2652 samples, validate on 664 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x2168b3a0148>

In [38]:
inf_encoder_model = Model(encoder_input, [enc_h_state, enc_c_state])

decoder_initial_states = [Input(shape=(100,)),
                         Input(shape=(100,))]

decoder_output, dec_h_state, dec_c_state = decoder_lstm(decoder_input, initial_state=decoder_initial_states)

logits = final_dense(decoder_output)

inf_decoder_model = Model(decoder_input + decoder_initial_states, logits + [dec_h_state, dec_c_state])

ValueError: Dimensions must be equal, but are 5063 and 100 for 'add_3' (op: 'AddV2') with input shapes: [?,?,5063], [2,?,100].

In [39]:
kor_id2word = {idx:word for word, idx in target_lang_tokenizer.word_index.items()}    

In [40]:
kor_id2word

{1: '<start>',
 2: '<end>',
 3: '톰은',
 4: '있어',
 5: '난',
 6: '톰이',
 7: '나는',
 8: '내가',
 9: '그',
 10: '수',
 11: '내',
 12: '이',
 13: '것',
 14: '네가',
 15: '않아',
 16: '같아',
 17: '거야',
 18: '너',
 19: '없어',
 20: '더',
 21: '걸',
 22: '할',
 23: '안',
 24: '좀',
 25: '것을',
 26: '해',
 27: '그는',
 28: '왜',
 29: '이건',
 30: '적',
 31: '너무',
 32: '우린',
 33: '있는',
 34: '알고',
 35: '정말',
 36: '프랑스어를',
 37: '사람은',
 38: '한',
 39: '우리는',
 40: '톰을',
 41: '가장',
 42: '있었어',
 43: '네',
 44: '아주',
 45: '그걸',
 46: '했어',
 47: '싶어',
 48: '있다',
 49: '줄',
 50: '나',
 51: '잘',
 52: '계속',
 53: '넌',
 54: '좋아해',
 55: '날',
 56: '하고',
 57: '건',
 58: '자기',
 59: '아니야',
 60: '이게',
 61: '우리',
 62: '것은',
 63: '좋아하는',
 64: '않았어',
 65: '아직도',
 66: '얼마나',
 67: '톰한테',
 68: '하지',
 69: '생각해',
 70: '거짓말',
 71: '마',
 72: '사람들은',
 73: '톰의',
 74: '아직',
 75: '법을',
 76: '모두',
 77: '돼',
 78: '많이',
 79: '있을',
 80: '메리가',
 81: '와',
 82: '미안해',
 83: '그만',
 84: '모든',
 85: '적이',
 86: '메리는',
 87: '말해',
 88: '누가',
 89: '못',
 90: '톰에게',
 91: '웃었어',
 92:

In [None]:
def translate(sentence):
    