In [1]:
# https://wikidocs.net/31379
# https://www.tensorflow.org/text/tutorials/transformer

# Build Model

In [2]:
import numpy as np
import tensorflow as tf

from layers import *
from ops import *

In [3]:
class Transformer(tf.keras.Model):
    """Encoder-decoder Transformer that emits per-position vocabulary logits.

    The model chains an ``Encoder``, a ``Decoder`` and a final ``Dense``
    projection of width ``vocab_size``. The projection has no activation, so
    the output is unnormalized logits.

    Args:
        num_layers: Forwarded to both ``Encoder`` and ``Decoder``.
        d_model: Forwarded to both ``Encoder`` and ``Decoder``.
        num_heads: Forwarded to both ``Encoder`` and ``Decoder``.
        dff: Forwarded to both ``Encoder`` and ``Decoder``.
        vocab_size: Vocabulary size shared by the encoder, the decoder and the
            output projection.
        rate: Forwarded to both ``Encoder`` and ``Decoder`` (default 0.1).
    """

    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size, rate=0.1):
        super().__init__()

        # The encoder and decoder could use different vocab sizes (e.g. for
        # translation); this model shares a single one.
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, vocab_size, rate)

        # NOTE(review): vocab_size is passed twice; the meaning of Decoder's
        # sixth positional argument is not visible here -- confirm against the
        # Decoder definition.
        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                               vocab_size, vocab_size, rate)

        # Reference Dense through tf.keras explicitly instead of relying on a
        # bare `layers` name leaking in through a star-import.
        self.final_layer = tf.keras.layers.Dense(vocab_size)  # depending on purpose

    def call(self, inputs, target, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        """Run the encoder, then the decoder, then project to vocabulary logits.

        Args:
            inputs: Encoder input passed to ``self.encoder``.
            target: Decoder input passed to ``self.decoder``.
            training: Training flag forwarded to the encoder and decoder.
            enc_padding_mask: Mask forwarded to the encoder.
            look_ahead_mask: Mask forwarded to the decoder.
            dec_padding_mask: Mask forwarded to the decoder.

        Returns:
            Logits of shape (batch_size, tar_seq_len, vocab_size).
        """
        # (batch_size, inp_seq_len, d_model)
        enc_output = self.encoder(inputs, training, enc_padding_mask)

        # (batch_size, tar_seq_len, d_model)
        dec_output = self.decoder(target, enc_output, training, look_ahead_mask, dec_padding_mask)

        # (batch_size, tar_seq_len, vocab_size)
        return self.final_layer(dec_output)

# Build Dataset

In [4]:
import pandas as pd
import re
import urllib.request
import tensorflow_datasets as tfds

In [5]:
# Download the chatbot Q/A CSV next to the notebook, then load it from the
# local path that urlretrieve reports back.
csv_path, _ = urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData%20.csv",
    filename="ChatBotData.csv",
)
train_data = pd.read_csv(csv_path)
train_data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [6]:
# Surround each of ? . ! , with spaces so punctuation is separated from the
# neighbouring word (e.g. "12시 땡!" -> "12시 땡 !"), then trim the ends.
questions = [
    re.sub(r"([?.!,])", r" \1 ", sentence).strip()
    for sentence in train_data['Q']
]

# Same normalization for the answer column.
answers = [
    re.sub(r"([?.!,])", r" \1 ", sentence).strip()
    for sentence in train_data['A']
]

In [7]:
# Build the subword vocabulary from every question and answer together so
# both sides of a pair share one tokenizer.
corpus = questions + answers
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    corpus, target_vocab_size=2**13)

In [8]:
# The start/end markers take the two ids immediately after the tokenizer's
# own vocabulary; they are kept as one-element lists so they can be
# concatenated with encoded id lists.
START_TOKEN = [tokenizer.vocab_size]
END_TOKEN = [tokenizer.vocab_size + 1]

# Tokenizer vocabulary plus the two special ids above.
VOCAB_SIZE = tokenizer.vocab_size + 2

# Length every tokenized sequence is padded to.
MAX_LENGTH = 40

In [9]:
def tokenize_and_filter(inputs, outputs):
    """Tokenize paired sentences, drop over-long pairs, and pad the rest.

    Each sentence is encoded with the module-level ``tokenizer`` and wrapped
    in ``START_TOKEN`` / ``END_TOKEN``. A pair is kept only if both encoded
    sequences fit within ``MAX_LENGTH``. Without this filter,
    ``pad_sequences`` would truncate longer sequences from the front (its
    default ``truncating='pre'``) and silently remove the start token.

    Args:
        inputs: Iterable of input sentences (strings).
        outputs: Iterable of output sentences (strings), paired with
            ``inputs`` by position.

    Returns:
        A tuple ``(tokenized_inputs, tokenized_outputs)`` of integer arrays,
        each zero-padded at the end to ``MAX_LENGTH`` columns, with one row
        per kept pair.
    """
    tokenized_inputs, tokenized_outputs = [], []

    for (sentence1, sentence2) in zip(inputs, outputs):
        # encode = subword tokenization + integer ids; add start/end markers.
        sentence1 = START_TOKEN + tokenizer.encode(sentence1) + END_TOKEN
        sentence2 = START_TOKEN + tokenizer.encode(sentence2) + END_TOKEN

        # Skip the whole pair if either side would not fit, so inputs and
        # outputs stay aligned row by row.
        if len(sentence1) > MAX_LENGTH or len(sentence2) > MAX_LENGTH:
            continue

        tokenized_inputs.append(sentence1)
        tokenized_outputs.append(sentence2)

    # Pad with zeros on the right up to MAX_LENGTH.
    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
      tokenized_inputs, maxlen=MAX_LENGTH, padding='post')
    tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
      tokenized_outputs, maxlen=MAX_LENGTH, padding='post')

    return tokenized_inputs, tokenized_outputs

# Rebinds `questions` / `answers` from lists of strings to padded id arrays.
# This cell is therefore not idempotent: re-run the preprocessing cell before
# running it a second time.
questions, answers = tokenize_and_filter(questions, answers)

In [10]:
# Number of examples per training batch.
BATCH_SIZE = 64
# Size of the shuffle buffer used when building the tf.data pipeline.
BUFFER_SIZE = 20000

In [11]:
# Input pipeline: slice the padded arrays into per-example dicts, cache them,
# shuffle, group into batches, and prefetch so data preparation overlaps
# with training.
dataset = (
    tf.data.Dataset.from_tensor_slices({'inputs': questions, 'outputs': answers})
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Train

In [12]:
# Hyperparameters handed to the Transformer constructor below.
NUM_LAYERS = 3
D_MODEL = 512
NUM_HEADS = 4
DFF = 1024

# `rate` is left at the constructor's default.
model = Transformer(
    num_layers=NUM_LAYERS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dff=DFF,
    vocab_size=VOCAB_SIZE,
)

In [13]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with linear warmup followed by inverse-sqrt decay.

    The rate at a given step is::

        d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

    Args:
        d_model: Model width used to scale the rate.
        warmup_steps: Number of steps over which the rate grows linearly
            before the inverse-square-root decay takes over (default 4000).
    """

    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        # Keep the caller's value so get_config() can return it unchanged.
        self._d_model_config = d_model
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for ``step`` as a float32 tensor."""
        # Optimizers may pass their integer iteration counter; rsqrt and the
        # float multiplications below need a floating-point tensor.
        step = tf.cast(step, tf.float32)

        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {
            "d_model": self._d_model_config,
            "warmup_steps": self.warmup_steps,
        }

In [14]:
# Step-dependent learning rate from CustomSchedule, using its default warmup.
learning_rate = CustomSchedule(D_MODEL)

# Adam driven by the schedule above; all other Adam settings are left at
# their defaults.
optimizer = tf.keras.optimizers.Adam(learning_rate)

In [15]:
# Number of passes over `dataset` made by the training loop.
EPOCHS = 20

# from_logits=True because the model's final Dense layer has no activation.
# reduction='none' keeps one loss value per token so loss_function can zero
# out padding positions before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    """Cross-entropy averaged over non-padding target positions.

    Args:
        real: Integer target ids; id 0 is treated as padding and ignored.
        pred: Logits passed to the module-level ``loss_object``.

    Returns:
        Scalar tensor: the sum of per-token losses at non-padding positions
        divided by the number of such positions, or 0 when every position is
        padding.
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))  # True where real != 0
    loss_ = loss_object(real, pred)

    # Zero out the loss at padding positions.
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    # divide_no_nan returns 0 instead of NaN when the mask is all zeros, so an
    # all-padding batch cannot poison the gradients.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))

In [16]:
# NOTE(review): the training cell below uses a manual GradientTape loop and
# calls optimizer/loss_function directly, so this compile() looks unused by
# the visible cells -- confirm before removing.
model.compile(optimizer=optimizer, loss=loss_function)

In [17]:
# Manual training loop: for each epoch, run one optimizer step per batch and
# print the epoch index with the mean batch loss.
for epoch in range(EPOCHS):
    batch_losses = []
    for batch in dataset:
        encoder_input, target = batch['inputs'], batch['outputs']

        # Teacher forcing: the decoder is fed the target without its last
        # position and is scored against the target shifted left by one.
        decoder_input = target[:, :-1]
        expected = target[:, 1:]

        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, decoder_input)

        with tf.GradientTape() as tape:
            logits = model(encoder_input, decoder_input,
                           True,
                           enc_padding_mask,
                           combined_mask,
                           dec_padding_mask)
            batch_loss = loss_function(expected, logits)

        variables = model.trainable_variables
        grads = tape.gradient(batch_loss, variables)
        optimizer.apply_gradients(zip(grads, variables))
        batch_losses.append(batch_loss.numpy())
    print(epoch, np.mean(batch_losses))

0 7.6406503
1 6.0654864
2 5.52619
3 5.1869555
4 4.7982006
5 4.341309
6 3.839578
7 3.3081253
8 2.758317
9 2.199976
10 1.6463473
11 1.1624299
12 0.78663677
13 0.53121084
14 0.3807246
15 0.3147327
16 0.29109034
17 0.30391937
18 0.32214537
19 0.3458417


# Evaluation

In [18]:
def preprocess_sentence(sentence):
    """Space out punctuation and trim the sentence.

    Every occurrence of ``?``, ``.``, ``!`` or ``,`` is wrapped in single
    spaces, then leading and trailing whitespace is stripped.

    Args:
        sentence: Raw input string.

    Returns:
        The normalized string.
    """
    return re.sub(r"([?.!,])", r" \1 ", sentence).strip()

def evaluate(sentence):
    """Greedily decode a reply for ``sentence`` with the trained model.

    The sentence is normalized, encoded, and wrapped in start/end tokens. The
    decoder input starts as the start token alone and grows by the arg-max
    token at each step, for at most ``MAX_LENGTH`` steps or until the end
    token is predicted.

    Args:
        sentence: Raw input string.

    Returns:
        1-D int tensor of generated ids, beginning with the start token. The
        end token is never appended.
    """
    cleaned = preprocess_sentence(sentence)

    # Add a batch dimension of size 1 to the encoder input.
    encoder_input = tf.expand_dims(
        START_TOKEN + tokenizer.encode(cleaned) + END_TOKEN, axis=0)

    # The decoder input starts with just the start token.
    decoded = tf.expand_dims(START_TOKEN, 0)
    end_id = END_TOKEN[0]

    # Decoder prediction loop.
    for _ in range(MAX_LENGTH):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, decoded)
        logits = model(encoder_input, decoded,
                       False,
                       enc_padding_mask,
                       combined_mask,
                       dec_padding_mask)

        # Take the prediction for the current (last) position only.
        next_id = tf.cast(tf.argmax(logits[:, -1:, :], axis=-1), tf.int32)

        # Stop as soon as the end token is predicted.
        if tf.equal(next_id, end_id):
            break

        # Append the predicted token; it becomes part of the decoder input on
        # the next iteration.
        decoded = tf.concat([decoded, next_id], axis=-1)

    return tf.squeeze(decoded, axis=0)

def predict(sentence):
    """Generate a reply for ``sentence``, print both, and return the reply.

    Ids at or above ``tokenizer.vocab_size`` (the start/end markers) are
    dropped before decoding back to text.

    Args:
        sentence: Raw input string.

    Returns:
        The decoded reply string.
    """
    token_ids = evaluate(sentence)

    vocab_limit = tokenizer.vocab_size
    text_ids = [token for token in token_ids if token < vocab_limit]
    predicted_sentence = tokenizer.decode(text_ids)

    print('Input: {}'.format(sentence))
    print('Output: {}'.format(predicted_sentence))

    return predicted_sentence

In [19]:
# Sample prompt: "Want to watch a movie?"
output = predict("영화 볼래?")

Input: 영화 볼래?
Output: 먼저 연락해 보는게 좋을 거 같아요 .


In [20]:
# Sample prompt: "I have something on my mind."
output = predict("고민이 있어")

Input: 고민이 있어
Output: 제가 있잖아요 .


In [21]:
# Sample prompt: "Let's play a game."
output = predict("게임하자")

Input: 게임하자
Output: 게임하세요 !


In [22]:
# Sample prompt: "Want to go to a cafe?"
output = predict("카페갈래?")

Input: 카페갈래?
Output: 카페 데이트 좋죠 .


In [23]:
# Sample prompt: "I'm so angry."
output = predict("너무 화가나")

Input: 너무 화가나
Output: 지금 상황을 그대로 받아들이세요 .
