In [36]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [68]:
import pandas as pd
from langdetect import detect
import re

from sklearn.model_selection import train_test_split


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, GRU
from tensorflow.keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam


In [3]:
df_500 = pd.read_csv('data/lyrics_save500.txt', sep='\t')
df_test = pd.read_csv('data/kor.txt', sep='\t', names=['eng','kor','drop_me'])

In [4]:
df_500 = df_500.drop([19565, 28696, 31890])
df_500['lang'] = df_500['kor'].apply(detect)

In [None]:
df_final = df_500[df_500['lang']=='ko'].drop(columns=['Unnamed: 0','lang'])

df_test = df_test.drop(columns='drop_me')

In [8]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''

    text = text.lower()
    
    text = re.sub(r"’", "'", text)
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    
    return text

In [15]:
def start_end_tagger(decoder_input_sentence):
    start_tag = "<start> "
    end_tag = " <end>"
    final_target = start_tag + decoder_input_sentence + end_tag
    return final_target

In [21]:
def max_length(tensor):
    return max(len(t) for t in tensor)

In [25]:
def tokenize(lang):
    lang_tokenizer = Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    tensor = lang_tokenizer.texts_to_sequences(lang)

    tensor = pad_sequences(tensor, padding='post')

    return tensor, lang_tokenizer

In [47]:
def load_dataset(input_lang, target_lang):
    input_tensor, inp_lang_tokenizer = tokenize(input_lang)
    target_tensor, targ_lang_tokenizer = tokenize(target_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [12]:
df_test['eng'] = df_test['eng'].apply(clean_text)
df_test['kor'] = df_test['kor'].apply(clean_text)

df_final['eng'] = df_final['eng'].apply(clean_text)
df_final['kor'] = df_final['kor'].apply(clean_text)

In [13]:
df_test

Unnamed: 0,eng,kor
0,go,가
1,hi,안녕
2,run,뛰어
3,run,뛰어
4,who,누구
...,...,...
3313,tom always cried when his sister took away his...,톰은 누나가 자기 장난감을 빼앗아 갔을 때마다 울음을 터뜨렸고 누나는 바로 그런 이...
3314,science fiction has undoubtedly been the inspi...,공상 과학 소설은 의심의 여지 없이 오늘날 존재하는 많은 기술에 영감을 주었어
3315,i started a new blog i will do my best not to ...,난 블로그를 시작했어 블로그를 초반에만 반짝 많이 하다가 관두는 사람처럼은 되지 않...
3316,i think it is a shame that some foreign langua...,몇몇 외국어 선생님이 한 번도 원어민과 공부해본 적도 없으면서 대학을 나올 수 있었...


In [14]:
df_final

Unnamed: 0,eng,kor
0,time runs,시간은 달려가
2,i am carrying them on my neck,잠시 내 목마를 태워
3,throwing my memories outside,나의 추억들을 밖으로 던진다
4,this is like a drama without a script,이건 마치 각본 없는 drama
5,it is an ending you have never thought of,생각 못 한 결말 나와
...,...,...
49584,today i am shining brightly,모르지만 오늘도 난 밝게 빛나지
49585,wherever they are i will be the brightest,어디서 보아도 내가 제일 눈부시게
49586,even though time passes i will not wither away,시간이 지나도 사그라들지 않게
49587,even if i run out of breath i will not stop an...,숨이 차 올라도 멈추지 말고 더 크게 외쳐 oh


In [16]:
df_test['eng'] = df_test['eng'].apply(start_end_tagger)
df_test['kor'] = df_test['kor'].apply(start_end_tagger)

In [18]:
df_final['kor'] = df_final['kor'].apply(start_end_tagger)
df_final['eng'] = df_final['eng'].apply(start_end_tagger)

In [17]:
df_test

Unnamed: 0,eng,kor
0,<start> go <end>,<start> 가 <end>
1,<start> hi <end>,<start> 안녕 <end>
2,<start> run <end>,<start> 뛰어 <end>
3,<start> run <end>,<start> 뛰어 <end>
4,<start> who <end>,<start> 누구 <end>
...,...,...
3313,<start> tom always cried when his sister took ...,<start> 톰은 누나가 자기 장난감을 빼앗아 갔을 때마다 울음을 터뜨렸고 누나는...
3314,<start> science fiction has undoubtedly been t...,<start> 공상 과학 소설은 의심의 여지 없이 오늘날 존재하는 많은 기술에 영감...
3315,<start> i started a new blog i will do my best...,<start> 난 블로그를 시작했어 블로그를 초반에만 반짝 많이 하다가 관두는 사람...
3316,<start> i think it is a shame that some foreig...,<start> 몇몇 외국어 선생님이 한 번도 원어민과 공부해본 적도 없으면서 대학을...


In [19]:
df_final

Unnamed: 0,eng,kor
0,<start> time runs <end>,<start> 시간은 달려가 <end>
2,<start> i am carrying them on my neck <end>,<start> 잠시 내 목마를 태워 <end>
3,<start> throwing my memories outside <end>,<start> 나의 추억들을 밖으로 던진다 <end>
4,<start> this is like a drama without a script ...,<start> 이건 마치 각본 없는 drama <end>
5,<start> it is an ending you have never thought...,<start> 생각 못 한 결말 나와 <end>
...,...,...
49584,<start> today i am shining brightly <end>,<start> 모르지만 오늘도 난 밝게 빛나지 <end>
49585,<start> wherever they are i will be the bright...,<start> 어디서 보아도 내가 제일 눈부시게 <end>
49586,<start> even though time passes i will not wit...,<start> 시간이 지나도 사그라들지 않게 <end>
49587,<start> even if i run out of breath i will not...,<start> 숨이 차 올라도 멈추지 말고 더 크게 외쳐 oh <end>


In [30]:
eng_tens, eng_lang_tok = tokenize(df_test['eng'])
kor_tens, kor_lang_tok = tokenize(df_test['kor'])

In [34]:
kor_tens

array([[   1,  121,    2, ...,    0,    0,    0],
       [   1,  463,    2, ...,    0,    0,    0],
       [   1,  464,    2, ...,    0,    0,    0],
       ...,
       [   1,    5, 1542, ...,    0,    0,    0],
       [   1,  524, 5066, ...,    0,    0,    0],
       [   1, 1540, 1541, ...,   72,   17,    2]])

In [32]:
max_length(eng_tens)

103

In [33]:
max_length(kor_tens)

91

In [41]:
len(eng_lang_tok.word_index)

2354

In [43]:
len(kor_lang_tok.word_index)

5103

In [49]:
input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer = load_dataset(df_test['eng'], df_test['kor'])

In [50]:
max_length_inp, max_length_targ = max_length(input_tensor), max_length(target_tensor)

In [62]:
def create_tf_dataset(input_tensor, target_tensor, input_tokenizer, target_tokenizer):
    buffer_size = len(input_tensor)
    batch_size = 64
    steps_per_epoch = len(input_tensor)//batch_size
    embedding_dim = 256
    units = 1024
    vocab_inp_size = len(input_tokenizer.word_index)+1
    vocab_tar_size = len(target_tokenizer.word_index)+1
    
    dataset = tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor)).shuffle(buffer_size)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset

In [63]:
df = create_tf_dataset(input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer)

In [77]:
df

<BatchDataset shapes: ((64, 103), (64, 91)), types: (tf.int32, tf.int32)>

In [67]:
df.take(len(input_tensor)//64)

<TakeDataset shapes: ((64, 103), (64, 91)), types: (tf.int32, tf.int32)>

In [69]:
encoder_inputs = Input(shape=(None, 20))
encoder = GRU(100, return_state=True)
encoder_outputs, state_h = encoder(encoder_inputs)

In [70]:
decoder_inputs = Input(shape=(None, 20))
decoder_gru = GRU(100, return_sequences=True)
decoder_outputs = decoder_gru(decoder_inputs, initial_state=state_h)
decoder_dense = Dense(20, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [71]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [73]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 20)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 20)]   0                                            
__________________________________________________________________________________________________
gru (GRU)                       [(None, 100), (None, 36600       input_1[0][0]                    
__________________________________________________________________________________________________
gru_1 (GRU)                     (None, None, 100)    36600       input_2[0][0]                    
                                                                 gru[0][1]                    

In [75]:
model.compile(optimizer=Adam(), loss=SparseCategoricalCrossentropy())

In [None]:
model.fit()