In [36]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [82]:
import pandas as pd
from langdetect import detect
import re

from sklearn.model_selection import train_test_split


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, GRU, Embedding
from tensorflow.keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam


In [3]:
df_500 = pd.read_csv('data/lyrics_save500.txt', sep='\t')
df_test = pd.read_csv('data/kor.txt', sep='\t', names=['eng','kor','drop_me'])

In [4]:
df_500 = df_500.drop([19565, 28696, 31890])
df_500['lang'] = df_500['kor'].apply(detect)

In [None]:
df_final = df_500[df_500['lang']=='ko'].drop(columns=['Unnamed: 0','lang'])

df_test = df_test.drop(columns='drop_me')

In [8]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''

    text = text.lower()
    
    text = re.sub(r"’", "'", text)
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    
    return text

In [15]:
def start_end_tagger(decoder_input_sentence):
    start_tag = "<start> "
    end_tag = " <end>"
    final_target = start_tag + decoder_input_sentence + end_tag
    return final_target

In [21]:
def max_length(tensor):
    return max(len(t) for t in tensor)

In [154]:
def tokenize(lang):
    lang_tokenizer = Tokenizer()
    lang_tokenizer.fit_on_texts(lang)

    tensor = lang_tokenizer.texts_to_sequences(lang)

    padded = pad_sequences(tensor, maxlen=max_length(tensor), padding='post')

    return padded, lang_tokenizer

In [155]:
def load_dataset(input_lang, target_lang):
    input_tensor, inp_lang_tokenizer = tokenize(input_lang)
    target_tensor, targ_lang_tokenizer = tokenize(target_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [150]:
df_test['eng'] = df_test['eng'].apply(clean_text)
df_test['kor'] = df_test['kor'].apply(clean_text)

df_final['eng'] = df_final['eng'].apply(clean_text)
df_final['kor'] = df_final['kor'].apply(clean_text)

In [13]:
df_test

Unnamed: 0,eng,kor
0,go,가
1,hi,안녕
2,run,뛰어
3,run,뛰어
4,who,누구
...,...,...
3313,tom always cried when his sister took away his...,톰은 누나가 자기 장난감을 빼앗아 갔을 때마다 울음을 터뜨렸고 누나는 바로 그런 이...
3314,science fiction has undoubtedly been the inspi...,공상 과학 소설은 의심의 여지 없이 오늘날 존재하는 많은 기술에 영감을 주었어
3315,i started a new blog i will do my best not to ...,난 블로그를 시작했어 블로그를 초반에만 반짝 많이 하다가 관두는 사람처럼은 되지 않...
3316,i think it is a shame that some foreign langua...,몇몇 외국어 선생님이 한 번도 원어민과 공부해본 적도 없으면서 대학을 나올 수 있었...


In [14]:
df_final

Unnamed: 0,eng,kor
0,time runs,시간은 달려가
2,i am carrying them on my neck,잠시 내 목마를 태워
3,throwing my memories outside,나의 추억들을 밖으로 던진다
4,this is like a drama without a script,이건 마치 각본 없는 drama
5,it is an ending you have never thought of,생각 못 한 결말 나와
...,...,...
49584,today i am shining brightly,모르지만 오늘도 난 밝게 빛나지
49585,wherever they are i will be the brightest,어디서 보아도 내가 제일 눈부시게
49586,even though time passes i will not wither away,시간이 지나도 사그라들지 않게
49587,even if i run out of breath i will not stop an...,숨이 차 올라도 멈추지 말고 더 크게 외쳐 oh


In [190]:
df_test['eng'] = df_test['eng'].apply(start_end_tagger)
df_test['kor'] = df_test['kor'].apply(start_end_tagger)

In [191]:
df_final['kor'] = df_final['kor'].apply(start_end_tagger)
df_final['eng'] = df_final['eng'].apply(start_end_tagger)

In [192]:
df_test

Unnamed: 0,eng,kor
0,<start> start go end <end>,<start> start 가 end <end>
1,<start> start hi end <end>,<start> start 안녕 end <end>
2,<start> start run end <end>,<start> start 뛰어 end <end>
3,<start> start run end <end>,<start> start 뛰어 end <end>
4,<start> start who end <end>,<start> start 누구 end <end>
...,...,...
3313,<start> start tom always cried when his sister...,<start> start 톰은 누나가 자기 장난감을 빼앗아 갔을 때마다 울음을 터뜨...
3314,<start> start science fiction has undoubtedly ...,<start> start 공상 과학 소설은 의심의 여지 없이 오늘날 존재하는 많은 ...
3315,<start> start i started a new blog i will do m...,<start> start 난 블로그를 시작했어 블로그를 초반에만 반짝 많이 하다가 ...
3316,<start> start i think it is a shame that some ...,<start> start 몇몇 외국어 선생님이 한 번도 원어민과 공부해본 적도 없으...


In [193]:
df_final

Unnamed: 0,eng,kor
0,<start> start time runs end <end>,<start> start 시간은 달려가 end <end>
2,<start> start i am carrying them on my neck en...,<start> start 잠시 내 목마를 태워 end <end>
3,<start> start throwing my memories outside end...,<start> start 나의 추억들을 밖으로 던진다 end <end>
4,<start> start this is like a drama without a s...,<start> start 이건 마치 각본 없는 drama end <end>
5,<start> start it is an ending you have never t...,<start> start 생각 못 한 결말 나와 end <end>
...,...,...
49584,<start> start today i am shining brightly end ...,<start> start 모르지만 오늘도 난 밝게 빛나지 end <end>
49585,<start> start wherever they are i will be the ...,<start> start 어디서 보아도 내가 제일 눈부시게 end <end>
49586,<start> start even though time passes i will n...,<start> start 시간이 지나도 사그라들지 않게 end <end>
49587,<start> start even if i run out of breath i wi...,<start> start 숨이 차 올라도 멈추지 말고 더 크게 외쳐 oh end <...


In [194]:
eng_tens, eng_lang_tok = tokenize(df_test['eng'])
kor_tens, kor_lang_tok = tokenize(df_test['kor'])

In [195]:
eng_tens.shape

(3318, 105)

In [196]:
kor_tens.shape

(3318, 93)

In [197]:
max_length(eng_tens)

105

In [198]:
max_length(kor_tens)

93

In [199]:
eng_vocab_size = len(eng_lang_tok.word_index)

In [200]:
kor_vocab_size = len(kor_lang_tok.word_index)

In [201]:
input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer = load_dataset(df_test['eng'], df_test['kor'])

In [202]:
max_length_inp, max_length_targ = max_length(input_tensor), max_length(target_tensor)

In [203]:
def create_tf_dataset(input_tensor, target_tensor, input_tokenizer, target_tokenizer):
    buffer_size = len(input_tensor)
    batch_size = 64
    steps_per_epoch = len(input_tensor)//batch_size
    embedding_dim = 256
    units = 1024
    vocab_inp_size = len(input_tokenizer.word_index)+1
    vocab_tar_size = len(target_tokenizer.word_index)+1
    
    dataset = tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor)).shuffle(buffer_size)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset

In [204]:
df = create_tf_dataset(input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer)

In [278]:
example_input_batch, example_target_batch = next(iter(df))

In [279]:
example_input_batch

<tf.Tensor: id=56041, shape=(64, 105), dtype=int32, numpy=
array([[  1,   1,  16, ...,   0,   0,   0],
       [  1,   1,  40, ...,   0,   0,   0],
       [  1,   1,   4, ...,   0,   0,   0],
       ...,
       [  1,   1,   9, ...,   0,   0,   0],
       [  1,   1, 346, ...,   0,   0,   0],
       [  1,   1,   3, ...,   0,   0,   0]])>

In [206]:
batch_size = 64
steps_per_epoch = len(input_tensor)//batch_size
for (batch, (inp, targ)) in enumerate(df.take(steps_per_epoch)):
    print(targ[:,1].shape)

(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)
(64,)


In [269]:
encoder_input = Input(shape=(105,),name='Encoder_input')

embedded_input = Embedding(input_dim=eng_vocab_size, output_dim=200,name='Embedding_layer')(encoder_input)

encoder_output, encoder_state_h = GRU(256, return_state=True, name='Encoder_GRU')(embedded_input)
# encoder_output, encoder_state_h = encoder(encoder_input)

In [270]:
decoder_input = Input(shape=(93,))

embedded_decoder = Embedding(input_dim=kor_vocab_size, output_dim=500, name='Embedding_layer2')(decoder_input)

decoder_gru = GRU(256, return_sequences=True)(embedded_decoder, initial_state=encoder_state_h)

decoder_output = Dense(kor_vocab_size, activation='softmax')(decoder_gru)

In [271]:
model = Model([encoder_input, decoder_input], decoder_output)

In [272]:
model.summary()

Model: "model_13"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Encoder_input (InputLayer)      [(None, 105)]        0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           [(None, 93)]         0                                            
__________________________________________________________________________________________________
Embedding_layer (Embedding)     (None, 105, 200)     470200      Encoder_input[0][0]              
__________________________________________________________________________________________________
Embedding_layer2 (Embedding)    (None, 93, 500)      2552000     input_18[0][0]                   
___________________________________________________________________________________________

In [273]:
model.compile(optimizer=Adam(), loss=SparseCategoricalCrossentropy())

In [274]:
input_tensor.shape

(3318, 105)

In [275]:
target_tensor_out = target_tensor[:,1:]

In [276]:
target_tensor_out.shape

(3318, 92)

In [282]:
model.fit([example_input_batch,example_target_batch],example_target_batch[:,1:,],batch_size=64,epochs=1,validation_split=0.2)

Train on 51 samples, validate on 13 samples


InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument:  assertion failed: [] [Condition x == y did not hold element-wise:] [x (loss/dense_14_loss/SparseSoftmaxCrossEntropyWithLogits/Shape_1:0) = ] [51 92] [y (loss/dense_14_loss/SparseSoftmaxCrossEntropyWithLogits/strided_slice:0) = ] [51 93]
	 [[node loss/dense_14_loss/SparseSoftmaxCrossEntropyWithLogits/assert_equal/Assert/Assert (defined at C:\Users\jooki\Anaconda3\envs\tf-gpu\lib\site-packages\tensorflow_core\python\framework\ops.py:1751) ]]
	 [[Reshape_20/_82]]
  (1) Invalid argument:  assertion failed: [] [Condition x == y did not hold element-wise:] [x (loss/dense_14_loss/SparseSoftmaxCrossEntropyWithLogits/Shape_1:0) = ] [51 92] [y (loss/dense_14_loss/SparseSoftmaxCrossEntropyWithLogits/strided_slice:0) = ] [51 93]
	 [[node loss/dense_14_loss/SparseSoftmaxCrossEntropyWithLogits/assert_equal/Assert/Assert (defined at C:\Users\jooki\Anaconda3\envs\tf-gpu\lib\site-packages\tensorflow_core\python\framework\ops.py:1751) ]]
0 successful operations.
0 derived errors ignored. [Op:__inference_distributed_function_56035]

Function call stack:
distributed_function -> distributed_function


In [230]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.enc_units))

In [None]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)