In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import tensorflow as tf
import tensorflow_datasets as tfds
from keras.preprocessing.sequence import pad_sequences
from nlp_utils import preprocess_sentence, TextTokenizing
from transformer import transformer, CustomSchedule, loss_function
from utils import make_checkpoint, accuracy, load_csv_and_processing

In [2]:
# Load the raw CSV into a DataFrame for a quick inspection.
train_data = pd.read_csv("./super_small_4_dataset.csv")

# A bare expression is only rendered when it is the last statement of a cell,
# so the preview must be displayed explicitly to be visible.
display(train_data.head())

# Row count of the raw file (this remains the cell's output value).
len(train_data)

21801

In [13]:
# Read the same CSV through the project helper; its result is unpacked into two
# sequences, questions and answers (what processing it applies is not visible here).
questions, answers = load_csv_and_processing("./super_small_4_dataset.csv")

# Disabled experiment: keep only the samples from index 15000 onward.
# questions = questions[15000:]
# answers = answers[15000:]

# Sanity check: both sequences are expected to have the same length.
len(questions), len(answers)


(21794, 21794)

# For Small Model -> Small Dataset

In [14]:
# Spot-check the loaded text: element type, the first four answers, and the last question.
type(questions[0]), answers[:4], questions[-1]

(str,
 ['응  난 엄청 근처에 있어  .',
  '집 근처에 병원이 있으면 편리한 거 같아  .',
  '언제 사고가 발생할지 모르잖아  .',
  '나는 집 근처에 병원이 있어서 다행이야  .'],
 '힘들어서 결혼할까봐')

In [15]:
# Project tokenizing helper; the following cells call create_tokenizer,
# save_tokenizer, load_tokenizer, tokens, tokenize_and_filter and make_dataset on it.
textTokenizing = TextTokenizing()

In [16]:
# Earlier approach, kept for reference only:
#   tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
#       questions + answers, target_vocab_size=2**16)
# NOTE(review): create_tokenizer presumably wraps the same call — confirm in nlp_utils.

# Build the tokenizer from both questions and answers with a target vocabulary of 2**16.
tokenizer = textTokenizing.create_tokenizer(questions, answers, target_vocab_size=2**16)

In [17]:
# Persist the tokenizer under the name "super_ultra_small_vocab"
# (the exact on-disk file naming is decided inside save_tokenizer).
textTokenizing.save_tokenizer("super_ultra_small_vocab")

In [18]:
# Reload the tokenizer that was just saved and rebind `tokenizer` to it.
# NOTE(review): presumably a round-trip check, or a way to skip the build cell
# on later runs — confirm the intent.
tokenizer = textTokenizing.load_tokenizer("super_ultra_small_vocab")

In [19]:
# Fetch the vocabulary size and the start/end token ids from the helper.
# As the displayed value shows, START_TOKEN and END_TOKEN are single-element lists.
VOCAB_SIZE, START_TOKEN, END_TOKEN = textTokenizing.tokens()

# Display the three values for inspection.
VOCAB_SIZE, START_TOKEN, END_TOKEN

(54759, [54757], [54758])

In [20]:
# Convert the raw strings into token-id arrays (the next cell reads .shape on both).
# NOTE(review): this rebinds `questions`/`answers` from text to arrays, so the cell
# is not idempotent — re-running it without re-running the load cell would feed
# already-tokenized arrays back in.
questions, answers = textTokenizing.tokenize_and_filter(questions, answers)

In [21]:
# Print the array shapes. The Korean labels read "size of the question data"
# and "size of the answer data" respectively.
print(f'질문 데이터의 크기:{questions.shape}')
print(f'답변 데이터의 크기:{answers.shape}')

질문 데이터의 크기:(21794, 50)
답변 데이터의 크기:(21794, 50)


In [22]:

# Batching / shuffling parameters handed to the dataset builder.
BATCH_SIZE = 64
# NOTE(review): 20000 is smaller than the 21794 samples reported above; if
# make_dataset forwards this to Dataset.shuffle, the shuffle is only approximate —
# TODO confirm.
BUFFER_SIZE = 20000

# make_dataset receives no data arguments, so it presumably uses the arrays kept on
# textTokenizing by tokenize_and_filter — verify in nlp_utils.
dataset = textTokenizing.make_dataset(BATCH_SIZE, BUFFER_SIZE)

In [23]:
# Check on one sample what the [:, :-1] and [:, 1:] slices do.
# Original sample.
print(answers[0])
# Drops the last position (a padding token here), so the length goes from 50 to 49.
print(answers[:1][:, :-1])
# Drops the first position, i.e. the start token; the length is again 49.
print(answers[:1][:, 1:])

[54757   226    29    28    54  2355    26     1 54758     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]
[[54757   226    29    28    54  2355    26     1 54758     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0]]
[[  226    29    28    54  2355    26     1 54758     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0]]


In [24]:
from hyperparameters import NUM_LAYERS, D_MODEL, NUM_HEADS, DFF, DROPOUT, MAX_LENGTH

In [25]:
# Reset Keras' global state before building the model, so that re-running the
# model cell starts from a clean slate.
tf.keras.backend.clear_session()

In [26]:
# Build the Transformer from the tokenizer's vocabulary size and the
# hyperparameters imported from the hyperparameters module.
model = transformer(
    vocab_size=VOCAB_SIZE,
    # Depth and width of the network.
    num_layers=NUM_LAYERS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dff=DFF,
    # Regularisation.
    dropout=DROPOUT,
)

(1, 54759, 256)
(1, 54759, 256)


In [27]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 inputs (InputLayer)            [(None, None)]       0           []                               
                                                                                                  
 dec_inputs (InputLayer)        [(None, None)]       0           []                               
                                                                                                  
 enc_padding_mask (Lambda)      (None, 1, 1, None)   0           ['inputs[0][0]']                 
                                                                                                  
 encoder (Functional)           (None, None, 256)    15072512    ['inputs[0][0]',                 
                                                                  'enc_padding_mask[0][0

In [28]:
# Checkpoint callback used by model.fit below. The path is a plain string, so the
# "{epoch:04d}" placeholder is passed through literally — presumably filled in per
# epoch by the Keras checkpoint callback that make_checkpoint creates; confirm in utils.
cp_callback = make_checkpoint("training_super_small/cp-{epoch:04d}.ckpt")

In [29]:
# Learning-rate schedule parameterised by the model width.
learning_rate = CustomSchedule(D_MODEL)

# Adam driven by the custom schedule instead of a fixed learning rate.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

# Attach the project's loss and accuracy metric to the model.
model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])

In [30]:
# TF1-style run options asking TensorFlow to report tensor allocations on OOM.
# NOTE(review): run_opts is not passed to compile() or fit() in the visible cells,
# so it currently has no effect — wire it in or remove it.
run_opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom=True)

In [31]:
# Number of passes over the training dataset.
EPOCHS = 20

# Keep the History object returned by fit() so the per-epoch loss and accuracy
# remain available after training (e.g. for plotting). Assigning it also stops the
# cell from echoing the object's repr. Checkpoints are written via cp_callback.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[cp_callback])

Epoch 1/20

KeyboardInterrupt: 