In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import tensorflow as tf
import tensorflow_datasets as tfds
from keras.preprocessing.sequence import pad_sequences
from nlp_utils import preprocess_sentence, TextTokenizing
from transformer import transformer, CustomSchedule, loss_function

In [2]:
train_data = pd.read_csv("./final_dataset.csv")
train_data.head()

Unnamed: 0,Q,A
0,안녕하세요,️️
1,이거 해봐요><,"나의 직장인 멘탈 성향은 [안챙겨도 잘커요, 탕비실 선인장] 당신의 멘탈 성향은 ..."
2,오 ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ오 ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ 안챙겨도 잘커요 너무 맞는데요ㅜㅜ? 자...,ㅋㅌㅋㅋㅋㅋㅋㅌㅋㅋㅋㅋ 아녜여 챙겨주세요
3,ㅋㅋㅋㅋㅋ당연하쥬ㅋㅋㅋㅋㅋ당연하쥬 누굴 챙길 여유는 저도 없는거같지만 그러나 점심 ...,그렇게 큰 권한을 주신다구요??그렇게 큰 권한을 주신다구요?? name1님 완전 대인배
4,"목요일 점심메뉴도 생각해오세요 크크 전 닭가슴살 먹을거지만,,",흠 그럼 저도흠 그럼 저도 한번 도시락을 싸올까요


In [45]:
questions = [preprocess_sentence(q) for q in train_data["Q"]]
answers = [preprocess_sentence(a) for a in train_data["A"]]


print(questions[:3])
print(questions[:3])

['안녕하세요', '이거 해봐요><', '오 ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ오 ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ 안챙겨도 잘커요 너무 맞는데요ㅜㅜ ? 자세한 내용은 더 알아가야겟지만~~']
['안녕하세요', '이거 해봐요><', '오 ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ오 ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ 안챙겨도 잘커요 너무 맞는데요ㅜㅜ ? 자세한 내용은 더 알아가야겟지만~~']


# For Small Model -> Small Dataset

In [46]:
questions = questions[:50000]
answers = answers[:50000]

In [48]:
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    questions+answers, target_vocab_size=2**16
)

In [49]:
tokenizer.save_to_file("super_small_vocab")

In [98]:
tokenizer = tfds.deprecated.text.SubwordTextEncoder.load_from_file("super_small_vocab")

In [50]:
START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size+1]

VOCAB_SIZE = tokenizer.vocab_size + 2

START_TOKEN, END_TOKEN

([62158], [62159])

In [51]:
MAX_LENGTH = 50

def tokenize_and_filter(questions, answers):
    tokenized_inputs, tokenized_outputs = [], []

    for (input, output) in zip(questions, answers):
        input = START_TOKEN + tokenizer.encode(input) + END_TOKEN
        output = START_TOKEN + tokenizer.encode(output) + END_TOKEN

        tokenized_inputs.append(input)
        tokenized_outputs.append(output)
    
    tokenized_inputs = pad_sequences(tokenized_inputs, maxlen=MAX_LENGTH, padding="post")
    tokenized_outputs = pad_sequences(tokenized_outputs, maxlen=MAX_LENGTH, padding="post")

    return tokenized_inputs, tokenized_outputs

In [52]:
questions, answers = tokenize_and_filter(questions, answers)

In [53]:
print(f'질문 데이터의 크기:{questions.shape}')
print(f'답변 데이터의 크기:{answers.shape}')

질문 데이터의 크기:(50000, 50)
답변 데이터의 크기:(50000, 50)


In [54]:
# 텐서플로우 dataset을 이용하여 셔플(shuffle)을 수행하되, 배치 크기로 데이터를 묶는다.
# 또한 이 과정에서 교사 강요(teacher forcing)을 사용하기 위해서 디코더의 입력과 실제값 시퀀스를 구성한다.
BATCH_SIZE = 64
BUFFER_SIZE = 20000

# Decoder real sequence has to remove <SOS> token
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'inputs': questions,
        'dec_inputs': answers[:, :-1] # decoder input. Last Padding Token removed
    },
    {
        'outputs': answers[:, 1:] # First Token removed. <sos> token gone
    }
))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [55]:
# 임의의 샘플에 대해서 [:, :-1]과 [:, 1:]이 어떤 의미를 가지는지 테스트해본다.
print(answers[0]) # 기존 샘플
print(answers[:1][:, :-1]) # 마지막 패딩 토큰 제거하면서 길이가 39가 된다.
print(answers[:1][:, 1:]) # 맨 처음 토큰이 제거된다. 다시 말해 시작 토큰이 제거된다. 길이는 역시 39가 된다.

[62158 14452 62159     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]
[[62158 14452 62159     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0]]
[[14452 62159     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0]]


In [94]:
# Hyper-parameters
NUM_LAYERS = 2
D_MODEL = 256
NUM_HEADS = 8
DFF = 512
DROPOUT = 0.1

In [95]:
tf.keras.backend.clear_session()

In [96]:
model = transformer(
    vocab_size=VOCAB_SIZE,
    num_layers=NUM_LAYERS,
    dff=DFF,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dropout=DROPOUT)

ResourceExhaustedError: {{function_node __wrapped__RandomUniform_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[62160,128] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:RandomUniform]

In [86]:
model.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 inputs (InputLayer)            [(None, None)]       0           []                               
                                                                                                  
 enc_padding_mask (Lambda)      (None, 1, None, Non  0           ['inputs[0][0]']                 
                                e)                                                                
                                                                                                  
 dec_inputs (InputLayer)        [(None, None)]       0           []                               
                                                                                                  
 encoder (Functional)           (None, None, 256)    16967168    ['inputs[0][0]',       

In [87]:
import os

# Checkpoint
checkpoint_path = "training_small/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# save weights in each five epochs
# cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_weights_only=True, save_best_only=True)
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_weights_only=True, save_freq=3)

# model.save_weights(checkpoint_path.format(epoch=0))

In [88]:
learning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(
    learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

def accuracy(y_true, y_pred):
  # ensure labels have shape (batch_size, MAX_LENGTH - 1)
  y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))
  return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)

model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])

In [89]:
EPOCHS = 40
model.fit(dataset, epochs=EPOCHS, callbacks=[cp_callback])

Epoch 1/40


ValueError: in user code:

    File "c:\Users\YSH\anaconda3\envs\mlenv\lib\site-packages\keras\engine\training.py", line 1013, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\YSH\anaconda3\envs\mlenv\lib\site-packages\keras\engine\training.py", line 1002, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\YSH\anaconda3\envs\mlenv\lib\site-packages\keras\engine\training.py", line 992, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\YSH\anaconda3\envs\mlenv\lib\site-packages\keras\engine\training.py", line 851, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\YSH\anaconda3\envs\mlenv\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\YSH\anaconda3\envs\mlenv\lib\site-packages\keras\engine\input_spec.py", line 264, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Exception encountered when calling layer "transformer" (type Functional).
    
    Input 1 of layer "encoder" is incompatible with the layer: expected shape=(None, 1, 1, None), found shape=(None, 1, 50, 50)
    
    Call arguments received by layer "transformer" (type Functional):
      • inputs={'inputs': 'tf.Tensor(shape=(None, 50), dtype=int32)', 'dec_inputs': 'tf.Tensor(shape=(None, 49), dtype=int32)'}
      • training=True
      • mask=None
