In [1]:
%load_ext autoreload
%autoreload 2

In [62]:
import pandas as pd
from langdetect import detect
import re
import os

from sklearn.model_selection import train_test_split


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, GRU, Embedding, Layer
from tensorflow.keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam


In [3]:
# df_500 = pd.read_csv('data/lyrics_save500.txt', sep='\t')
# Tab-separated file read without a header row; the three columns are named
# here, and the third one ('drop_me') is discarded in a later cell.
df_test = pd.read_csv(
    'data/kor.txt',
    sep='\t',
    names=['eng','kor','drop_me'],
)

In [4]:
# df_500 = df_500.drop([19565, 28696, 31890])
# df_500['lang'] = df_500['kor'].apply(detect)

In [5]:
# df_final = df_500[df_500['lang']=='ko'].drop(columns=['Unnamed: 0','lang'])

# errors='ignore' keeps this cell idempotent: because it reassigns df_test,
# re-running it would otherwise raise KeyError once 'drop_me' is already gone.
df_test = df_test.drop(columns='drop_me', errors='ignore')

In [6]:
df_test

Unnamed: 0,eng,kor
0,Go.,가.
1,Hi.,안녕.
2,Run!,뛰어!
3,Run.,뛰어.
4,Who?,누구?
...,...,...
3313,Tom always cried when his sister took away his...,"톰은 누나가 자기 장난감을 빼앗아 갔을 때마다 울음을 터뜨렸고, 누나는 바로 그런 ..."
3314,Science fiction has undoubtedly been the inspi...,공상 과학 소설은 의심의 여지 없이 오늘날 존재하는 많은 기술에 영감을 주었어.
3315,I started a new blog. I'll do my best not to b...,난 블로그를 시작했어. 블로그를 초반에만 반짝 많이 하다가 관두는 사람처럼은 되지 ...
3316,I think it's a shame that some foreign languag...,몇몇 외국어 선생님이 한 번도 원어민과 공부해본 적도 없으면서 대학을 나올 수 있었...


In [7]:
# Ordered (pattern, replacement) pairs applied by clean_text.
# Order matters: the specific contractions ("won't", "can't") must run before
# the generic "n't" rule, which in turn must run before the dropped-g rule
# ("n'" -> "ng"). Punctuation is stripped last so the apostrophe rules above
# still see their apostrophes.
_CLEAN_TEXT_SUBSTITUTIONS = (
    (r"’", "'"),
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    (r"n'", "ng"),
    (r"'bout", "about"),
    (r"'til", "until"),
    (r"[-()\"#/@;:<>{}`+=~|.!?,]", ""),
)


def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.

    The text is lower-cased, typographic apostrophes are normalised to "'",
    common English contractions are expanded, and a fixed set of punctuation
    characters is removed.

    Parameters
    ----------
    text : str
        Sentence to clean.

    Returns
    -------
    str
        The cleaned, lower-cased sentence.
    '''
    text = text.lower()

    for pattern, replacement in _CLEAN_TEXT_SUBSTITUTIONS:
        text = re.sub(pattern, replacement, text)

    return text

In [8]:
def start_end_tagger(decoder_input_sentence):
    """Wrap a sentence in the sequence markers: '<start> ' before, ' <end>' after."""
    pieces = ("<start> ", decoder_input_sentence, " <end>")
    return "".join(pieces)

In [9]:
def max_length(tensor):
    """Return the length of the longest element of `tensor`.

    Raises ValueError when `tensor` is empty.
    """
    return max(map(len, tensor))

In [10]:
def tokenize(lang):
    """Fit a word-level tokenizer on `lang` and return post-padded id sequences.

    Parameters
    ----------
    lang : iterable of str
        Sentences to fit on and convert. In this notebook they are already
        wrapped in '<start>' / '<end>' markers by `start_end_tagger`.

    Returns
    -------
    padded : array
        One row of token ids per sentence, zero-padded at the end to the
        length of the longest sentence.
    lang_tokenizer : Tokenizer
        The fitted tokenizer.
    """
    # Keras' default filter set with '<' and '>' removed. With the default
    # filters the '<start>' / '<end>' markers would be stripped down to the
    # plain words 'start' / 'end' and share a vocabulary entry with them.
    lang_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
    lang_tokenizer.fit_on_texts(lang)

    tensor = lang_tokenizer.texts_to_sequences(lang)

    padded = pad_sequences(tensor, maxlen=max_length(tensor), padding='post')

    return padded, lang_tokenizer

In [11]:
def preprocess(input_lang, target_lang):
    """Clean, tag and tokenize both sides of a parallel corpus.

    Each series goes through `clean_text`, then `start_end_tagger`, then
    `tokenize`.

    Returns
    -------
    tuple
        (input_tensor, target_tensor, input_lang_tokenizer, target_lang_tokenizer)
    """
    def _prepare(sentences):
        # clean -> add start/end markers -> fit tokenizer and pad
        tagged = sentences.apply(clean_text).apply(start_end_tagger)
        return tokenize(tagged)

    input_tensor, input_lang_tokenizer = _prepare(input_lang)
    target_tensor, target_lang_tokenizer = _prepare(target_lang)

    return input_tensor, target_tensor, input_lang_tokenizer, target_lang_tokenizer

In [12]:
# English is the encoder (input) side, Korean the decoder (target) side.
(
    input_tensor,
    target_tensor,
    input_lang_tokenizer,
    target_lang_tokenizer,
) = preprocess(input_lang=df_test['eng'], target_lang=df_test['kor'])

In [13]:
input_tensor.shape

(3318, 103)

In [14]:
target_tensor.shape

(3318, 91)

In [15]:
# Padded sequence lengths of the target and input tensors.
max_length_targ = max_length(target_tensor)
max_length_inp = max_length(input_tensor)

In [16]:
# 80/20 train/validation split. random_state is fixed so that Restart & Run All
# yields the same split (and therefore comparable results) on every run.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42
)

In [17]:
len(target_tensor_val)

664

In [18]:
# Model / training hyperparameters.
batch_size = 64
embedding_dim = 256
units = 1024

# Shuffle buffer covers the whole training set; only full batches are counted.
buffer_size = len(input_tensor_train)
steps_per_epoch = buffer_size // batch_size

# One more than the number of known words (the padded tensors use id 0 as
# filler, which loss_function masks out).
vocab_inp_size = 1 + len(input_lang_tokenizer.word_index)
vocab_tar_size = 1 + len(target_lang_tokenizer.word_index)

In [19]:
vocab_inp_size

2352

In [20]:
vocab_tar_size

5105

In [21]:
# Shuffle (input, target) pairs, then group them into fixed-size batches;
# a trailing partial batch is dropped.
dataset = (
    tf.data.Dataset
    .from_tensor_slices((input_tensor_train, target_tensor_train))
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)

In [22]:
example_input_batch, example_target_batch = next(iter(dataset))

In [23]:
class Encoder(Model):
    """Embedding + GRU encoder.

    Parameters
    ----------
    vocab_size : int
        Size of the input embedding table.
    embed_dim : int
        Embedding dimension.
    enc_units : int
        Number of GRU units.
    batch_size : int
        Batch size used when building the zero initial state.
    """

    def __init__(self, vocab_size, embed_dim, enc_units, batch_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.enc_units = enc_units
        self.batch_size = batch_size

        self.embedding_layer = Embedding(vocab_size, embed_dim)
        # The GRU returns both the per-timestep outputs and the final state.
        self.gru_layer = GRU(enc_units, return_sequences=True, return_state=True)

    def call(self, x, hidden):
        """Embed `x` and run the GRU from `hidden`; return (outputs, final_state)."""
        embedded = self.embedding_layer(x)
        sequence_output, final_state = self.gru_layer(embedded, initial_state=hidden)
        return sequence_output, final_state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape (batch_size, enc_units)."""
        state_shape = (self.batch_size, self.enc_units)
        return tf.zeros(state_shape)

In [24]:
encoder = Encoder(vocab_size=vocab_inp_size, embed_dim=embedding_dim, enc_units=units, batch_size=batch_size)

In [25]:
sample_hidden = encoder.initialize_hidden_state()

In [26]:
# Invoke the model itself rather than `.call()` directly, so the forward pass
# goes through Keras' __call__ wrapper.
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)

In [27]:
sample_output.shape

TensorShape([64, 103, 1024])

In [28]:
sample_hidden.shape

TensorShape([64, 1024])

In [38]:
class Attention(Layer):
    """Additive attention: score = V(tanh(W1(values) + W2(query))).

    Parameters
    ----------
    units : int
        Width of the two projection layers W1 and W2.
    """

    def __init__(self, units):
        super().__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, query, values):
        """Return (context_vector, attention_weights) for `query` over `values`."""
        # Insert an axis at position 1 so the query broadcasts against `values`.
        query_with_time = tf.expand_dims(query, 1)

        projected = tf.nn.tanh(self.W1(values) + self.W2(query_with_time))
        score = self.V(projected)

        # Normalise the scores along axis 1.
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of `values` along axis 1.
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

In [39]:
# Smoke-test the attention layer on the sample encoder state and outputs.
attention_layer = Attention(units=10)
attention_vector, attention_weights = attention_layer(sample_hidden, sample_output)

In [41]:
attention_vector.shape

TensorShape([64, 1024])

In [42]:
attention_weights.shape

TensorShape([64, 103, 1])

In [55]:
class Decoder(Model):
    """GRU decoder that attends over the encoder outputs.

    Parameters
    ----------
    vocab_size : int
        Size of the target embedding table and of the output layer.
    embed_dim : int
        Embedding dimension.
    dec_units : int
        Number of GRU units; also the width of the attention projections.
    batch_size : int
        Stored on the instance; not used by `call`.
    """

    def __init__(self, vocab_size, embed_dim, dec_units, batch_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.dec_units = dec_units
        self.batch_size = batch_size

        self.embedding_layer = Embedding(vocab_size, embed_dim)
        self.gru_layer = GRU(dec_units, return_sequences=True, return_state=True)
        # No activation: the layer emits one unnormalised score per vocabulary entry.
        self.dense = Dense(vocab_size)

        self.attention = Attention(dec_units)

    def call(self, x, hidden, enc_output):
        """Run the decoder on `x`; return (scores, gru_state, attention_weights).

        `hidden` is used only as the attention query; the GRU itself is called
        without an explicit initial state.
        """
        context, attention_weights = self.attention(hidden, enc_output)

        embedded = self.embedding_layer(x)

        # Prepend the context vector to the embedded input along the feature axis.
        gru_input = tf.concat([tf.expand_dims(context, 1), embedded], axis=-1)

        output, state = self.gru_layer(gru_input)
        # Collapse the leading axes so the result is 2-D: (-1, gru_units).
        flattened = tf.reshape(output, (-1, output.shape[2]))
        scores = self.dense(flattened)

        return scores, state, attention_weights

In [56]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, batch_size)

In [57]:
sample_decoder_output, _, _ = decoder(tf.random.uniform((batch_size, 1)),sample_hidden, sample_output)

In [59]:
sample_decoder_output.shape

TensorShape([64, 5105])

In [60]:
optimizer = Adam()
# from_logits=True: the decoder's final Dense layer has no activation, so its
# outputs are unnormalised scores rather than probabilities.
# reduction='none': keep one loss value per element so padding positions can
# be masked out before averaging.
loss = SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real,pred):
    """Mean sparse categorical cross-entropy with padding (id 0) zeroed out.

    Parameters
    ----------
    real : tensor of int
        Target token ids; positions equal to 0 are treated as padding.
    pred : tensor
        Unnormalised scores over the vocabulary for each target position.

    Returns
    -------
    Scalar tensor: the mean of the per-element losses after the losses at
    padding positions have been set to zero.
    """
    mask = tf.math.logical_not(tf.math.equal(real,0))
    loss_ = loss(real,pred)

    mask = tf.cast(mask,dtype=loss_.dtype)
    # Apply the mask to the computed values (`loss_`), not to the loss object.
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [63]:
# Checkpoint location and the objects whose state is tracked.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

tracked_objects = {
    'optimizer': optimizer,
    'encoder': encoder,
    'decoder': decoder,
}
checkpoint = tf.train.Checkpoint(**tracked_objects)