In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import tensorflow as tf
import tensorflow_datasets as tfds
from keras.preprocessing.sequence import pad_sequences
from nlp_utils import preprocess_sentence, TextTokenizing
from transformer.tensorflow import transformer, CustomSchedule, loss_function
from utils import make_checkpoint, accuracy, load_csv_and_processing

In [8]:
# Load the raw CSV for a quick sanity check of its contents and size.
train_data = pd.read_csv("./small_dataset.csv")

# Jupyter renders only the final bare expression of a cell, so a
# `train_data.head()` placed mid-cell is evaluated and silently discarded.
# Print the row count explicitly and leave the preview as the last expression
# so that both are actually shown.
print(len(train_data))
train_data.head()

40674

In [36]:
# Load the preprocessed question/answer pairs, then keep only the pairs from
# index 15000 onward (the reduced dataset used for the small model).
questions, answers = load_csv_and_processing("./small_dataset.csv")
questions, answers = questions[15000:], answers[15000:]

# Both lists must stay the same length after slicing.
len(questions), len(answers)


(25673, 25673)

# For Small Model -> Small Dataset

In [37]:
# Peek at the data: element type of a question, the first four answers, and
# the last question.
(
    type(questions[0]),
    answers[:4],
    questions[-1],
)

(str,
 ['맞아맞아 리즈시절이라는 말이 괜히 있느게 아닌 것같아',
  '진짜 멋있지~ 응 라디오로 가끔 노래가 나오는데 너무 좋더라고 또 듣고싶어져',
  '오 나도 한번 해보고 싶어',
  '완전 무농약이지  ?'],
 '힘들어서 결혼할까봐')

In [38]:
# Helper object used by the following cells to build, save and load the
# tokenizer, tokenize the question/answer pairs, and build the training dataset.
textTokenizing = TextTokenizing()

In [39]:
# Earlier direct call, kept for reference only; it appears to have been
# replaced by the TextTokenizing helper call below (TODO confirm, then delete):
# tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
#    questions+answers, target_vocab_size=2**16
# )

# Build the tokenizer from the question and answer strings, requesting a
# target vocabulary size of 2**16 (65,536).
tokenizer = textTokenizing.create_tokenizer(questions, answers, target_vocab_size=2**16)

In [40]:
# Save the tokenizer under the name "super_super_small_vocab"
# (presumably written to disk by the helper; verify in nlp_utils).
textTokenizing.save_tokenizer("super_super_small_vocab")

In [41]:
# Reload the tokenizer saved under the same name and rebind `tokenizer` to it,
# so later cells can also run from a previously saved vocabulary.
tokenizer = textTokenizing.load_tokenizer("super_super_small_vocab")

In [42]:
# Fetch the vocabulary size and the start/end token ids from the helper.
VOCAB_SIZE, START_TOKEN, END_TOKEN = textTokenizing.tokens()

# Echo them. In the recorded run VOCAB_SIZE is 62646 and the start/end tokens
# are single-element lists holding the two highest ids (62644 and 62645).
VOCAB_SIZE, START_TOKEN, END_TOKEN

(62646, [62644], [62645])

In [43]:
# Convert the raw strings into token-id arrays (shape (25673, 50) each in the
# recorded run).
# NOTE(review): this rebinds `questions`/`answers` from strings to arrays, so
# the cell is not idempotent -- re-running it without first re-running the
# loading cell would feed already-tokenized data back into the helper.
questions, answers = textTokenizing.tokenize_and_filter(questions, answers)

In [44]:
# Report the shapes of the tokenized question and answer arrays, one per line
# (the labels read "question data size" / "answer data size").
print(
    f'질문 데이터의 크기:{questions.shape}',
    f'답변 데이터의 크기:{answers.shape}',
    sep='\n',
)

질문 데이터의 크기:(25673, 50)
답변 데이터의 크기:(25673, 50)


In [45]:

# Batching configuration handed to the dataset builder.
# NOTE(review): BUFFER_SIZE (20000) is smaller than the 25673 samples; if the
# helper uses it as a shuffle buffer the shuffle is only approximate -- confirm.
BATCH_SIZE, BUFFER_SIZE = 32, 20000

# Build the training dataset from the data held by the tokenizing helper.
dataset = textTokenizing.make_dataset(BATCH_SIZE, BUFFER_SIZE)

In [46]:
# Check on one sample what the [:, :-1] and [:, 1:] slices do.
print(answers[0]) # original sample (length 50)
print(answers[:1][:, :-1]) # drops the last position (a padding token here); the length becomes 49
print(answers[:1][:, 1:]) # drops the first position, i.e. the start token; the length is likewise 49

[62644 11051 50239   354   682 32397   502  4045 62645     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]
[[62644 11051 50239   354   682 32397   502  4045 62645     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0]]
[[11051 50239   354   682 32397   502  4045 62645     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0]]


In [47]:
from hyperparameters import NUM_LAYERS, D_MODEL, NUM_HEADS, DFF, DROPOUT, MAX_LENGTH

In [48]:
# Reset Keras' global state before (re)building the model in the next cell.
tf.keras.backend.clear_session()

In [49]:
# Build the transformer from the vocabulary size reported by the tokenizer and
# the architecture constants imported from the hyperparameters module.
# NOTE(review): the recorded run of this cell failed with a
# ResourceExhaustedError (GPU OOM while allocating a [62646, 256] float
# tensor), and the cells that follow carry lower execution counts, so the
# `model` they use was created by an earlier execution. Free GPU memory
# (e.g. restart the kernel) and re-run the notebook top to bottom.
model = transformer(
    vocab_size=VOCAB_SIZE,
    num_layers=NUM_LAYERS,
    dff=DFF,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dropout=DROPOUT)

ResourceExhaustedError: {{function_node __wrapped__RandomUniform_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[62646,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:RandomUniform]

In [25]:
# Print the layer-by-layer summary of the model.
# NOTE(review): this cell's execution count (25) is lower than that of the
# model-building cell (49), so the summary shown comes from an earlier run.
model.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 inputs (InputLayer)            [(None, None)]       0           []                               
                                                                                                  
 dec_inputs (InputLayer)        [(None, None)]       0           []                               
                                                                                                  
 enc_padding_mask (Lambda)      (None, 1, 1, None)   0           ['inputs[0][0]']                 
                                                                                                  
 encoder (Functional)           (None, None, 256)    13979392    ['inputs[0][0]',                 
                                                                  'enc_padding_mask[0][0

In [26]:
# Checkpoint callback passed to model.fit below. The path template contains an
# {epoch:04d} placeholder -- presumably filled in with the epoch number when a
# checkpoint is written; verify in utils.make_checkpoint.
cp_callback = make_checkpoint("training_super_small/cp-{epoch:04d}.ckpt")

In [27]:
# Learning-rate schedule parameterized by the model width.
learning_rate = CustomSchedule(D_MODEL)

# Adam driven by the schedule above, with non-default beta_2 and epsilon.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

# Attach the project's loss function and accuracy metric to the model.
model.compile(
    optimizer=optimizer,
    loss=loss_function,
    metrics=[accuracy],
)

In [28]:
# TF1-style run options requesting a tensor-allocation report on OOM.
# NOTE(review): `run_opts` is not referenced by any other cell shown here (it
# is not passed to compile() or fit()), so as written it has no effect.
run_opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom=True)

In [None]:
# Number of passes over the training dataset.
EPOCHS = 40

# Train, saving checkpoints through cp_callback. Keep the History object that
# fit() returns: it records the per-epoch loss and metric values, and as a
# bare final expression it was only echoed as an object repr and then lost.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[cp_callback])