In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import tensorflow as tf
import tensorflow_datasets as tfds
from keras.preprocessing.sequence import pad_sequences
from nlp_utils import preprocess_sentence, TextTokenizing
from transformer import transformer, CustomSchedule, loss_function
from utils import make_checkpoint, accuracy, load_csv_and_processing

In [2]:
train_data = pd.read_csv("./final_dataset.csv")
train_data.head()

Unnamed: 0,Q,A
0,안녕하세요,️️
1,이거 해봐요><,"나의 직장인 멘탈 성향은 [안챙겨도 잘커요, 탕비실 선인장] 당신의 멘탈 성향은 ..."
2,오 ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ오 ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ 안챙겨도 잘커요 너무 맞는데요ㅜㅜ? 자...,ㅋㅌㅋㅋㅋㅋㅋㅌㅋㅋㅋㅋ 아녜여 챙겨주세요
3,ㅋㅋㅋㅋㅋ당연하쥬ㅋㅋㅋㅋㅋ당연하쥬 누굴 챙길 여유는 저도 없는거같지만 그러나 점심 ...,그렇게 큰 권한을 주신다구요??그렇게 큰 권한을 주신다구요?? name1님 완전 대인배
4,"목요일 점심메뉴도 생각해오세요 크크 전 닭가슴살 먹을거지만,,",흠 그럼 저도흠 그럼 저도 한번 도시락을 싸올까요


In [3]:
questions, answers = load_csv_and_processing("./final_dataset.csv")

# For Small Model -> Small Dataset

In [4]:
questions = questions[:50000]
answers = answers[:50000]

In [5]:
textTokenizing = TextTokenizing()

In [None]:
# tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
#    questions+answers, target_vocab_size=2**16
#)

tokenizer = textTokenizing.create_tokenizer(questions, answers, target_vocab_size=2**16)

In [None]:
textTokenizing.save_tokenizer("super_small_vocab")

In [6]:
tokenizer = textTokenizing.load_tokenizer("super_small_vocab")

In [7]:
VOCAB_SIZE, START_TOKEN, END_TOKEN = textTokenizing.tokens()

VOCAB_SIZE, START_TOKEN, END_TOKEN

(62146, [62144], [62145])

In [8]:
questions, answers = textTokenizing.tokenize_and_filter(questions, answers)

In [9]:
print(f'질문 데이터의 크기:{questions.shape}')
print(f'답변 데이터의 크기:{answers.shape}')

질문 데이터의 크기:(50000, 50)
답변 데이터의 크기:(50000, 50)


In [10]:

BATCH_SIZE = 30
BUFFER_SIZE = 20000

dataset = textTokenizing.make_dataset(BATCH_SIZE, BUFFER_SIZE)

In [11]:
# 임의의 샘플에 대해서 [:, :-1]과 [:, 1:]이 어떤 의미를 가지는지 테스트해본다.
print(answers[0]) # 기존 샘플
print(answers[:1][:, :-1]) # 마지막 패딩 토큰 제거하면서 길이가 39가 된다.
print(answers[:1][:, 1:]) # 맨 처음 토큰이 제거된다. 다시 말해 시작 토큰이 제거된다. 길이는 역시 39가 된다.

[62144 14444 62145     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]
[[62144 14444 62145     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0]]
[[14444 62145     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0]]


In [12]:
from hyperparameters import NUM_LAYERS, D_MODEL, NUM_HEADS, DFF, DROPOUT, MAX_LENGTH

In [13]:
tf.keras.backend.clear_session()

In [14]:
model = transformer(
    vocab_size=VOCAB_SIZE,
    num_layers=NUM_LAYERS,
    dff=DFF,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dropout=DROPOUT)

(1, 62146, 256)
(1, 62146, 256)


In [15]:
model.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 inputs (InputLayer)            [(None, None)]       0           []                               
                                                                                                  
 dec_inputs (InputLayer)        [(None, None)]       0           []                               
                                                                                                  
 enc_padding_mask (Lambda)      (None, 1, 1, None)   0           ['inputs[0][0]']                 
                                                                                                  
 encoder (Functional)           (None, None, 256)    16963584    ['inputs[0][0]',                 
                                                                  'enc_padding_mask[0][0

In [16]:
cp_callback = make_checkpoint("./training_small")

In [17]:
learning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(
    learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)


model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])

In [18]:
run_opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom=True)

In [19]:
EPOCHS = 40
model.fit(dataset, epochs=EPOCHS, callbacks=[cp_callback])

Epoch 1/40
   2/1667 [..............................] - ETA: 6:04 - loss: 2.5539 - accuracy: 0.0000e+00   



INFO:tensorflow:Assets written to: .\training_small\assets


INFO:tensorflow:Assets written to: .\training_small\assets


TypeError: Unable to serialize 256.0 to JSON. Unrecognized type <class 'tensorflow.python.framework.ops.EagerTensor'>.