In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
from langdetect import detect
import re
import os
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, GRU, Embedding, Layer
from tensorflow.keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
import pydot

from src.helpme import clean_text,start_end_tagger,max_length,tokenize,preprocess,preprocess_sentence

In [3]:
df_test = pd.read_csv('data/kor.txt', sep='\t', names=['eng','kor','drop_me'])
df_test = df_test.drop(columns='drop_me')

In [4]:
df_test = df_test.drop([3317,3316])

In [5]:
df_test

Unnamed: 0,eng,kor
0,Go.,가.
1,Hi.,안녕.
2,Run!,뛰어!
3,Run.,뛰어.
4,Who?,누구?
...,...,...
3311,Why don't we just reformat the hard disk? You'...,우리 그냥 하드 디스크를 새로 포맷하는 건 어때? 너무 그걸로 스트레스 많이 받고 ...
3312,"I knew that Tom was just a freshman, so I was ...","난 톰이 그냥 신입생일 뿐이라고만 알았는데, 그러다보니 톰이랑 선배들이 서로 어울려..."
3313,Tom always cried when his sister took away his...,"톰은 누나가 자기 장난감을 빼앗아 갔을 때마다 울음을 터뜨렸고, 누나는 바로 그런 ..."
3314,Science fiction has undoubtedly been the inspi...,공상 과학 소설은 의심의 여지 없이 오늘날 존재하는 많은 기술에 영감을 주었어.


In [4]:
df_all = pd.read_csv('data/final_df_fix.txt',sep='\t')

df_all = df_all.drop(columns='Unnamed: 0')

In [6]:
eng = preprocess(df_test['eng'])
kor = preprocess(df_test['kor'])

input_tensor, input_lang_tokenizer = tokenize(eng)
target_tensor, target_lang_tokenizer = tokenize(kor)

In [7]:
eng_vocab_size = len(input_lang_tokenizer.word_index)+1
kor_vocab_size = len(target_lang_tokenizer.word_index)+1

print(f'English vocab size: {eng_vocab_size}')
print(f'Korean vocab size: {kor_vocab_size}')

eng_max_length = len(input_tensor[0])
kor_max_length = len(target_tensor[0])

print(f'Longest English Sentence: {eng_max_length}')
print(f'Longest Korean Sentence: {kor_max_length}')

English vocab size: 2338
Korean vocab size: 5063
Longest English Sentence: 31
Longest Korean Sentence: 19


In [11]:
eng = preprocess(df_all['eng'])
kor = preprocess(df_all['kor'])

input_tensor, input_lang_tokenizer = tokenize(eng)
target_tensor, target_lang_tokenizer = tokenize(kor)

In [12]:
eng_vocab_size = len(input_lang_tokenizer.word_index)+1
kor_vocab_size = len(target_lang_tokenizer.word_index)+1

print(f'English vocab size: {eng_vocab_size}')
print(f'Korean vocab size: {kor_vocab_size}')

eng_max_length = len(input_tensor[0])
kor_max_length = len(target_tensor[0])

print(f'Longest English Sentence: {eng_max_length}')
print(f'Longest Korean Sentence: {kor_max_length}')

English vocab size: 12251
Korean vocab size: 58663
Longest English Sentence: 45
Longest Korean Sentence: 15


In [88]:
input_tensor1 = input_tensor[:1000]
target_tensor1 = target_tensor[:1000]

In [13]:
encoder_input = Input(shape=(None,),name='Encoder_input')
embedding_dim=50
embedded_input = Embedding(input_dim=eng_vocab_size,
                           output_dim=embedding_dim,
                           name='Embedding_layer')(encoder_input)
encoder_lstm = LSTM(units=50,
                   activation='relu',
                   return_sequences=False,
                   return_state=True,
                   name='Encoder_lstm')
encoder_out, enc_h_state, enc_c_state = encoder_lstm(embedded_input)

decoder_input = Input(shape=(None,1), name='Decoder_input')
# embedded_decoder = Embedding(kor_vocab_size,
#                             100,
#                             name='Decoder_embedded_layer')(decoder_input)
decoder_lstm = LSTM(units=50,
                   activation='relu',
                   return_sequences=True,
                   return_state=True,
                   name='Decoder_lstm')
decoder_out,_,_ = decoder_lstm(decoder_input,initial_state=[enc_h_state,enc_c_state])

final_dense = Dense(kor_vocab_size,activation='softmax',name='Final_dense_layer')
logits = final_dense(decoder_out)

model = Model([encoder_input,decoder_input],logits)

model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['acc'])

In [14]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Encoder_input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
Embedding_layer (Embedding)     (None, None, 50)     612550      Encoder_input[0][0]              
__________________________________________________________________________________________________
Decoder_input (InputLayer)      [(None, None, 1)]    0                                            
__________________________________________________________________________________________________
Encoder_lstm (LSTM)             [(None, 50), (None,  20200       Embedding_layer[0][0]            
____________________________________________________________________________________________

In [91]:
input_tensor1.shape

(1000, 103)

In [9]:
kor_max_length

15

In [15]:
decoder_kor_input = target_tensor.reshape((-1,kor_max_length,1))[:,:-1,:]

In [16]:
decoder_kor_target = target_tensor.reshape((-1,kor_max_length,1))[:,1:,:]

In [17]:
model.fit([input_tensor,decoder_kor_input],decoder_kor_target,
         epochs=15,
         batch_size=20,
         validation_split=0.2)

Train on 78528 samples, validate on 19633 samples
Epoch 1/15
Epoch 2/15

KeyboardInterrupt: 

In [18]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Encoder_input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
Embedding_layer (Embedding)     (None, None, 50)     612550      Encoder_input[0][0]              
__________________________________________________________________________________________________
Decoder_input (InputLayer)      [(None, None, 1)]    0                                            
__________________________________________________________________________________________________
Encoder_lstm (LSTM)             [(None, 50), (None,  20200       Embedding_layer[0][0]            
____________________________________________________________________________________________

In [19]:
inf_encoder_model = Model(encoder_input, [enc_h_state, enc_c_state])

decoder_initial_states = [Input(shape=(50,)),
                         Input(shape=(50,))]

decoder_output, dec_h_state, dec_c_state = decoder_lstm(decoder_input, initial_state=decoder_initial_states)

logits = final_dense(decoder_output)

inf_decoder_model = Model([decoder_input] + decoder_initial_states, [logits,dec_h_state, dec_c_state])

In [20]:
inf_decoder_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder_input (InputLayer)      [(None, None, 1)]    0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
Decoder_lstm (LSTM)             [(None, None, 50), ( 10400       Decoder_input[0][0]              
                                                                 input_1[0][0]              

In [1]:
kor_id2word = {idx:word for word, idx in target_lang_tokenizer.word_index.items()}    

NameError: name 'target_lang_tokenizer' is not defined

In [22]:
kor_id2word

{1: '<start>',
 2: '<end>',
 3: '내',
 4: '난',
 5: '날',
 6: '널',
 7: '더',
 8: '수',
 9: '그',
 10: '내가',
 11: '나',
 12: '이',
 13: '다',
 14: '넌',
 15: '너를',
 16: '너',
 17: '너의',
 18: '또',
 19: '나를',
 20: '걸',
 21: '있어',
 22: '해',
 23: '없어',
 24: '왜',
 25: 'oh',
 26: '내게',
 27: '니',
 28: '니가',
 29: '우리',
 30: '너무',
 31: '네',
 32: '한',
 33: '다시',
 34: '네가',
 35: '너와',
 36: '없는',
 37: '나는',
 38: '나의',
 39: '봐',
 40: '게',
 41: '것',
 42: '거야',
 43: '모든',
 44: '좀',
 45: '안',
 46: '지금',
 47: '않아',
 48: '싶어',
 49: '같아',
 50: '돼',
 51: '같은',
 52: '건',
 53: 'yeah',
 54: '말',
 55: '있는',
 56: '이젠',
 57: '이렇게',
 58: '마',
 59: '이제',
 60: '때',
 61: '못',
 62: '우린',
 63: '두',
 64: '잘',
 65: '밤',
 66: '할',
 67: '맘',
 68: '매일',
 69: '함께',
 70: '속에',
 71: '그래',
 72: 'baby',
 73: '없이',
 74: '자꾸',
 75: '너는',
 76: '정말',
 77: '보고',
 78: '저',
 79: '모두',
 80: '눈을',
 81: '오늘',
 82: '그냥',
 83: '몰라',
 84: '다른',
 85: '너에게',
 86: '이런',
 87: '순간',
 88: '맘을',
 89: '줘',
 90: '나도',
 91: '그대',
 92: '아직',
 93: '그런',
 94: '톰은'

In [23]:
def translate(sentence):
    sentence = preprocess_sentence(sentence)
    
    input_sentence = [input_lang_tokenizer.word_index[i] for i in sentence.split(' ')]
    input_sentence = pad_sequences([input_sentence],maxlen=eng_max_length,padding='post')
    input_sentence_tensor = tf.convert_to_tensor(input_sentence)
    return input_sentence_tensor

In [24]:
new_sentence = translate('tom')

In [25]:
new_sentence

<tf.Tensor: id=25116, shape=(1, 45), dtype=int32, numpy=
array([[  1, 109,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0]])>

In [26]:
initial_state = inf_encoder_model.predict(new_sentence)

In [27]:
prev_word = np.zeros((1,1,1))
prev_word[0,0,0] = target_lang_tokenizer.word_index['<start>']

In [28]:
stop_condition = False
translation = []

while not stop_condition:
    logits, h_state, c_state = inf_decoder_model.predict([prev_word] + initial_state)
    
    pred_id = np.argmax(logits[0,0,:])
    pred_word = kor_id2word[pred_id]
    translation.append(pred_word)
    
    if (pred_word=='<end>') or (len(translation)>kor_max_length):
        break
        
    prev_word[0,0,0] = pred_id
    initial_states=[h_state,c_state]
    
" ".join(translation)

'내 내 내 내 내 내 내 내 내 내 내 내 내 내 내 내'

In [59]:
target_lang_tokenizer.word_index['<end>']

2

In [68]:
kor_max_length

19