In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import tensorflow as tf
import tensorflow_datasets as tfds
from keras.preprocessing.sequence import pad_sequences
from nlp_utils import preprocess_sentence, TextTokenizing
from transformer import transformer, CustomSchedule, loss_function
from utils import make_checkpoint, accuracy, load_csv_and_processing

In [6]:
# Load the raw CSV for a quick sanity check of the data.
train_data = pd.read_csv("./small_dataset.csv")
# NOTE(review): this is not the cell's last expression, so the preview is
# never rendered; only the row count below shows up in the output.
train_data.head()

# Number of rows in the raw CSV (this is the value the cell displays).
len(train_data)

28850

In [7]:
# Read the same CSV through the project helper, which returns the questions
# and the answers as two separate collections.
# NOTE(review): presumably the helper also cleans the text (per its name) —
# confirm in utils.load_csv_and_processing.
questions, answers = load_csv_and_processing("./small_dataset.csv")

# Sanity check: both collections are expected to have the same length.
len(questions), len(answers)

(28850, 28850)

# For Small Model -> Small Dataset

In [8]:
# Keep only the first 20,000 question/answer pairs for the small-model run.
questions, answers = questions[:20000], answers[:20000]

# Quick look: element type, the first four answers, and the last kept question.
type(questions[0]), answers[:4], questions[-1]

(str,
 ['누룽지 끓여 먹을까 고민', '아 나 점심 뭐 먹지', '나 점심 내장국밥 픽', '누룽지는 간식으로 먹어야징'],
 '내 동생이 취업했어! 좋은 곳에 들어가게 돼서 선물 해준다네 하하 ')

In [9]:
# Create the project tokenizer helper. The cells below build, save, load and
# apply the tokenizer, and build the training dataset, through this one instance.
textTokenizing = TextTokenizing()

In [10]:
# Earlier approach, kept for reference: build the subword tokenizer directly
# with tensorflow_datasets.
# tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
#    questions+answers, target_vocab_size=2**16
# )

# Build the tokenizer from the question and answer texts through the project
# helper, asking for a target vocabulary size of 2**15 (32768).
tokenizer = textTokenizing.create_tokenizer(questions, answers, target_vocab_size=2**15)

['점심 메뉴 정하신 분 ', '나 아침 든든하게 먹으니 힘 난다', '점심은 사모님이 주심 후후', '**가 오늘 만들어야 함?', '돼지국밥으로 바꿀까'] ['누룽지 끓여 먹을까 고민', '아 나 점심 뭐 먹지', '나 점심 내장국밥 픽', '누룽지는 간식으로 먹어야징', '누룽지 맛있겠어... 나두'] 32768


In [11]:
# Persist the tokenizer under the name "super_super_small_vocab" so it can be
# reloaded with load_tokenizer instead of being rebuilt.
textTokenizing.save_tokenizer("super_super_small_vocab")

In [6]:
# Reload the tokenizer saved under "super_super_small_vocab".
# NOTE(review): this cell's recorded execution count is out of sequence with
# its neighbours, so it was run in a different order than it appears; on a
# fresh top-to-bottom run it just reloads what the previous cell saved.
tokenizer = textTokenizing.load_tokenizer("super_super_small_vocab")

In [12]:
# Fetch the vocabulary size and the start/end token ids from the tokenizer helper.
# In the recorded output the start and end ids are each wrapped in a
# one-element list and are the last two ids below VOCAB_SIZE.
VOCAB_SIZE, START_TOKEN, END_TOKEN = textTokenizing.tokens()

# Display the three values for inspection.
VOCAB_SIZE, START_TOKEN, END_TOKEN

(32368, [32366], [32367])

In [13]:
# Convert the raw question/answer texts into token-id arrays; the shapes
# printed in the next cell show one fixed-length row per pair.
# NOTE(review): this rebinds `questions`/`answers` from text to id arrays, so
# the cell is not idempotent — re-running it would pass ids instead of text.
# Re-run from the loading cell if it needs to be executed again.
questions, answers = textTokenizing.tokenize_and_filter(questions, answers)

In [14]:
# Print the shapes of the tokenized question and answer arrays
# (the labels read "size of the question data" / "size of the answer data").
print(f'질문 데이터의 크기:{questions.shape}')
print(f'답변 데이터의 크기:{answers.shape}')

질문 데이터의 크기:(20000, 50)
답변 데이터의 크기:(20000, 50)


In [25]:

# Batch size and buffer size handed to the dataset builder. BUFFER_SIZE equals
# the 20,000 pairs kept earlier.
# NOTE(review): presumably BUFFER_SIZE is the shuffle buffer — confirm in
# TextTokenizing.make_dataset.
BATCH_SIZE = 64
BUFFER_SIZE = 20000

# make_dataset receives no data arguments, so it presumably uses the tokenized
# pairs held by `textTokenizing` from tokenize_and_filter — verify.
dataset = textTokenizing.make_dataset(BATCH_SIZE, BUFFER_SIZE)

In [26]:
# Check on one sample what the slices [:, :-1] and [:, 1:] actually produce.
print(answers[0]) # original sample (length 50)
print(answers[:1][:, :-1]) # drops the last position (a padding token here); length becomes 49
print(answers[:1][:, 1:]) # drops the first position, i.e. the start token; length is also 49

[32366  6574  6630 27079  2262 32367     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]
[[32366  6574  6630 27079  2262 32367     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0]]
[[ 6574  6630 27079  2262 32367     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0]]


In [27]:
from hyperparameters import NUM_LAYERS, D_MODEL, NUM_HEADS, DFF, DROPOUT, MAX_LENGTH

In [28]:
# Reset Keras' global state before building the model, so that re-running the
# model cells starts from a clean slate.
tf.keras.backend.clear_session()

In [29]:
# Build the transformer from the imported hyperparameters and the tokenizer's
# vocabulary size. All arguments are passed by keyword.
model = transformer(
    vocab_size=VOCAB_SIZE,
    # architecture size
    num_layers=NUM_LAYERS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dff=DFF,
    # regularisation
    dropout=DROPOUT,
)

(1, 32368, 256)
(1, 32368, 256)


In [30]:
model.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 inputs (InputLayer)            [(None, None)]       0           []                               
                                                                                                  
 dec_inputs (InputLayer)        [(None, None)]       0           []                               
                                                                                                  
 enc_padding_mask (Lambda)      (None, 1, 1, None)   0           ['inputs[0][0]']                 
                                                                                                  
 encoder (Functional)           (None, None, 256)    9340416     ['inputs[0][0]',                 
                                                                  'enc_padding_mask[0][0

In [31]:
# Create the checkpoint callback that is passed to model.fit below, using the
# "./training_super_small" location.
# NOTE(review): what is saved and how often is decided inside
# utils.make_checkpoint — confirm there.
cp_callback = make_checkpoint("./training_super_small")

In [32]:
# Learning-rate schedule built from the model width.
learning_rate = CustomSchedule(D_MODEL)

# Adam driven by the schedule above, with non-default beta_2 and epsilon.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

# Attach the project's loss and accuracy metric to the model.
model.compile(
    loss=loss_function,
    optimizer=optimizer,
    metrics=[accuracy],
)

In [33]:
# TF1-compat run options that request tensor-allocation details when an
# out-of-memory error occurs.
# NOTE(review): `run_opts` is not passed to compile() or fit() in the visible
# cells, so as written it has no effect — wire it in or drop the cell.
run_opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom=True)

In [None]:
# Train for 40 epochs on the batched dataset, with the checkpoint callback
# created earlier attached.
EPOCHS = 40
model.fit(dataset, epochs=EPOCHS, callbacks=[cp_callback])