In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import urllib.request
import time
import os

In [3]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [4]:
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Layer, Lambda, Embedding, Dropout, LayerNormalization
from keras import Input, Model

## 1. TPU Setting

### 1) TPU 초기화

In [None]:
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
    tpu = 'grpc://' + os.environ['COLAB_TPU_ADDR'])

tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)

### 2) TPU Strategy 세팅

In [None]:
strategy = tf.distribute.TPUStrategy(resolver)

## 2. Data Load

In [7]:
url = "https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData%20.csv"

train_data = pd.read_csv(url)

In [8]:
train_data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [9]:
len(train_data)

11823

In [10]:
train_data.isnull().sum()

Q        0
A        0
label    0
dtype: int64

## 3. Data Cleansing

### 1) 구두점 앞에 공백 추가

In [11]:
questions = []

for sentence in train_data['Q'] :
  sentence = re.sub(r"([?.!,])" , r" \1 " , sentence)
  sentence = sentence.strip()
  questions.append(sentence)

In [12]:
answers = []

for sentence in train_data['A'] :
  sentence = re.sub(r"([?.!,])" , r" \1 " , sentence)
  sentence = sentence.strip()
  answers.append(sentence)

In [13]:
print(questions[:5])
print(answers[:5])

['12시 땡 !', '1지망 학교 떨어졌어', '3박4일 놀러가고 싶다', '3박4일 정도 놀러가고 싶다', 'PPL 심하네']
['하루가 또 가네요 .', '위로해 드립니다 .', '여행은 언제나 좋죠 .', '여행은 언제나 좋죠 .', '눈살이 찌푸려지죠 .']


### 2) 단어집합 생성
- 서브워드 텍스트 인코더 사용

In [14]:
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    questions + answers, target_vocab_size = 2**13)

- 시작토큰, 종료토큰에 대한 정수 부여

In [15]:
START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]

# 시작토큰과 종료토큰을 고려하여 단어집합의 크기를 +2
VOCAB_SIZE = tokenizer.vocab_size + 2

In [16]:
print('시작 토큰 번호: ' , START_TOKEN)
print('종료 토큰 번호: ' , END_TOKEN)
print('단어 집합의 크기: ' , VOCAB_SIZE)

시작 토큰 번호:  [8178]
종료 토큰 번호:  [8179]
단어 집합의 크기:  8180


### 3) 정수 인코딩과 패딩
- .encode() 사용

In [17]:
# 임의의 입력 문장을 sample_string에 저장
sample_string = questions[20]

# encode(): 텍스트 시퀀스 -> 정수 시퀀스
tokenized_string = tokenizer.encode(sample_string)
print('정수 인코딩 후의 문장: {}' .format(tokenized_string))

# decode(): 정수 시퀀스 -> 텍스트 시퀀스
original_string = tokenizer.decode(tokenized_string)
print('기존 문장: {}' .format(original_string))

정수 인코딩 후의 문장: [5766, 611, 3509, 141, 685, 3747, 849]
기존 문장: 가스비 비싼데 감기 걸리겠어


- tokenized_and_filter() 함수 정의

In [18]:
MAX_LENGTH = 40

def tokenize_and_filter(inputs, outputs) :
  tokenized_inputs, tokenized_outputs = [] , []

  for (sentence1, sentence2) in zip(inputs, outputs) :
    # encode(토큰화 + 정수인코딩), 시작 토큰과 종료 토큰 추가
    sentence1 = START_TOKEN + tokenizer.encode(sentence1) + END_TOKEN
    sentence2 = START_TOKEN + tokenizer.encode(sentence2) + END_TOKEN

    tokenized_inputs.append(sentence1)
    tokenized_outputs.append(sentence2)

  # 패딩
  tokenized_inputs = pad_sequences(tokenized_inputs,
                                   maxlen = MAX_LENGTH,
                                   padding = 'post')
  tokenized_outputs = pad_sequences(tokenized_outputs,
                                    maxlen = MAX_LENGTH,
                                    padding = 'post')
  
  return tokenized_inputs, tokenized_outputs

In [19]:
questions, answers = tokenize_and_filter(questions, answers)

In [20]:
questions.shape, answers.shape

((11823, 40), (11823, 40))

## 4. 인코더와 디코더 입력, 레이블 생성

In [21]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

# 디코더의 실제값 시퀀스에서 시작토큰 제거
dataset = tf.data.Dataset.from_tensor_slices((
    {'inputs' : questions,
     'dec_inputs' : answers[:, :-1]},   # 디코더의 입력(마지막 패딩 토큰 제거)
    {'outputs' : answers[:, 1:]}        # 맨 처음 토큰(시작토큰) 제거
))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [22]:
print(answers[0])           # 기존 샘플
print(answers[:1][:, :-1])  # 마지막 패딩 토큰 제거하면서 길이가 39가 됨
print(answers[:1][:, 1:])   # 시작토큰이 제거되면서 길이가 39가 됨

[8178 3844   74 7894    1 8179    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
[[8178 3844   74 7894    1 8179    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]]
[[3844   74 7894    1 8179    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]]


## 5. 트랜스포머 만들기

In [23]:
class PositionalEncoding(Layer) :
  def __init__(self, position, d_model) :
    super(PositionalEncoding, self).__init__()
    self.pos_encoding = self.positional_encoding(position, d_model)

  def get_angles(self, position, i, d_model):
    angles = 1 / tf.pow(10000, (2 * (i//2)) / tf.cast(d_model, tf.float32))
    return position * angles

  def positional_encoding(self, position, d_model):
    angle_rads = self.get_angles(
        position = tf.range(position, dtype = tf.float32)[:, tf.newaxis],
        i = tf.range(d_model, dtype = tf.float32)[tf.newaxis, :],
        d_model = d_model
    )

    sines = tf.math.sin(angle_rads[:, 0::2])
    cosines = tf.math.cos(angle_rads[:, 1::2])

    pos_encoding = tf.concat([sines, cosines], axis = -1)
    pos_encoding = pos_encoding[tf.newaxis, ...]

    print(pos_encoding.shape)

    return tf.cast(pos_encoding, tf.float32)
  
  def call(self, inputs):
    return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]

In [24]:
def scaled_dot_product_attention(query, key, value, mask) :

  matmul_qk = tf.matmul(query, key, transpose_b = True)
  depth = tf.cast(tf.shape(key)[-1], tf.float32)
  logits = matmul_qk / tf.math.sqrt(depth)

  if mask is not None:
    logits += (mask * -1e9)

  attention_weights = tf.nn.softmax(logits, axis = -1)
  output = tf.matmul(attention_weights, value)

  return output, attention_weights

In [25]:
class MultiHeadAttention(Layer):

  def __init__(self, d_model, num_heads, name = 'multi_head_attention'):
    super(MultiHeadAttention, self).__init__(name = name)
    self.num_heads = num_heads
    self.d_model = d_model

    assert d_model % self.num_heads == 0

    self.depth = d_model // self.num_heads
    self.query_dense = Dense(units = d_model) # units: 뉴런 수
    self.key_dense = Dense(units = d_model)
    self.value_dense = Dense(units = d_model)
    self.dense = Dense(units = d_model)

  def split_heads(self, inputs, batch_size):
    inputs = tf.reshape(
        inputs, shape = (batch_size, -1, self.num_heads, self.depth)) # (1, 1, 4, 32)
    
    return tf.transpose(inputs, perm = [0, 2, 1, 3]) # (1, 4, 1, 32)

  def call(self, inputs):
    query, key, value, mask = inputs['query'], inputs['key'], \
    inputs['value'], inputs['mask']
    batch_size = tf.shape(query)[0]

    query = self.query_dense(query) 
    key = self.key_dense(key)       
    value = self.value_dense(value)
  
    query = self.split_heads(query, batch_size)
    key = self.split_heads(key, batch_size)
    value = self.split_heads(value, batch_size)

    scaled_attention, _ = scaled_dot_product_attention(query, key, value, mask)
    scaled_attention = tf.transpose(scaled_attention, perm = [0, 2, 1, 3])

    concat_attention = tf.reshape(scaled_attention,
                                  (batch_size, -1, self.d_model))
   
    outputs = self.dense(concat_attention)

    return outputs

In [26]:
def encoder_layer(dff, d_model, num_heads, dropout, name = 'encoder_layer'):
  inputs = Input(shape = (None, d_model), name = 'inputs')
  padding_mask = Input(shape = (1, 1, None), name = 'padding_mask')

  attention = MultiHeadAttention(
      d_model, num_heads, name = 'attention')({
          'query' : inputs,
          'key' : inputs,
          'value' : inputs,
          'mask' : padding_mask
      })
  
  attention = Dropout(rate = dropout)(attention)
  attention = LayerNormalization(epsilon = 1e-6)(inputs + attention)

  outputs = Dense(units = dff, activation = 'relu')(attention)
  outputs = Dense(units = d_model)(outputs)

  outputs = Dropout(rate = dropout)(outputs)
  outputs = LayerNormalization(epsilon = 1e-6)(attention + outputs)

  return Model(inputs = [inputs, padding_mask],
               outputs = outputs, name = name)

In [27]:
def encoder(vocab_size, num_layers, dff, d_model, num_heads, dropout, name = 'encoder'):
  inputs = Input(shape = (None,) , name = 'inputs')
  padding_mask = Input(shape = (1, 1, None), name = 'padding_mask')

  embeddings = Embedding(vocab_size, d_model)(inputs)
  embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
  embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)
  outputs = Dropout(rate = dropout)(embeddings)

  for i in range(num_layers):
    outputs = encoder_layer(
        dff = dff,
        d_model = d_model,
        num_heads = num_heads,
        dropout = dropout,
        name = 'encoder_layer_{}' .format(i))([outputs, padding_mask])

  return Model(inputs = [inputs, padding_mask],
               outputs = outputs, name = name)

In [28]:
def create_padding_mask(x):
  mask = tf.cast(tf.math.equal(x, 0), tf.float32)
  
  return mask[:, tf.newaxis, tf.newaxis, :]

In [29]:
def create_look_ahead_mask(x):
  seq_len = tf.shape(x)[1]
  look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
  padding_mask = create_padding_mask(x) # 패딩 마스크도 포함

  return tf.maximum(look_ahead_mask, padding_mask)

In [30]:
def decoder_layer(dff, d_model, num_heads, dropout,
                  name = 'decoder_layer'):
  inputs = Input(shape = (None, d_model), name = 'inputs')
  enc_outputs = Input(shape = (None, d_model), name = 'encoder_outputs')

  look_ahead_mask = Input(shape = (1, None, None), name = 'look_ahead_mask')
  padding_mask = Input(shape = (1, 1, None), name = 'padding_mask')

  attention1 = MultiHeadAttention(
      d_model, num_heads, name = 'attention_1')(inputs = {
          'query' : inputs,
          'key' : inputs,
          'value' : inputs,
          'mask' : look_ahead_mask
      })

  attention1 = LayerNormalization(epsilon = 1e-6)(attention1 + inputs)
  attention2 = MultiHeadAttention(
      d_model, num_heads, name = 'attention_2')(inputs = {
          'query' : attention1, 
          'key' : enc_outputs, 
          'value' : enc_outputs,
          'mask' : padding_mask
      })
  
  attention2 = Dropout(rate = dropout)(attention2)
  attention2 = LayerNormalization(epsilon = 1e-6)(attention2 + attention1)

  outputs = Dense(units = dff, activation = 'relu')(attention2)
  outputs = Dense(units = d_model)(outputs)

  outputs = Dropout(rate = dropout)(outputs)
  outputs = LayerNormalization(epsilon = 1e-6)(outputs + attention2)

  return Model(
      inputs = [inputs, enc_outputs, look_ahead_mask, padding_mask],
      outputs = outputs,
      name = name)

In [31]:
def decoder(vocab_size, num_layers, dff, d_model, num_heads, dropout,
            name = 'docoder'):
  inputs = Input(shape = (None,), name = 'inputs')
  enc_outputs = Input(shape = (None, d_model), name = 'encoder_outputs')

  look_ahead_mask = Input(shape = (1, None, None), name = 'look_ahead_mask')
  padding_mask = Input(shape = (1, 1, None), name = 'padding_mask')

  embeddings = Embedding(vocab_size, d_model)(inputs)
  embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
  embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)
  outputs = Dropout(rate = dropout)(embeddings)

  for i in range(num_layers):
    outputs = decoder_layer(
        dff = dff, d_model = d_model,
        num_heads = num_heads,
        dropout = dropout,
        name = 'decoder_layer_{}' .format(i))(
            inputs = [outputs, enc_outputs, look_ahead_mask, padding_mask])
        
  return Model(
      inputs = [inputs, enc_outputs, look_ahead_mask, padding_mask],
      outputs = outputs,
      name = name)

In [32]:
def transformer(vocab_size, num_layers, dff, d_model, num_heads, dropout,
                name = 'transformer'):
  
  inputs = Input(shape = (None,), name = 'inputs')
  dec_inputs = Input(shape = (None,), name = 'dec_inputs')

  enc_padding_mask = Lambda(
      create_padding_mask,
      output_shape = (1, 1, None),
      name = 'enc_padding_mask')(inputs)
  
  look_ahead_mask = Lambda(
      create_look_ahead_mask,
      output_shape = (1, 1, None),
      name = 'look_ahead_mask')(dec_inputs)

  dec_padding_mask = Lambda(
      create_padding_mask,
      output_shape = (1, 1, None),
      name = 'dec_padding_mask')(inputs)

  enc_outputs = encoder(
      vocab_size = vocab_size,
      num_layers = num_layers,
      dff = dff,
      d_model = d_model,
      num_heads = num_heads,
      dropout = dropout)(inputs = [inputs, enc_padding_mask])

  dec_outputs = decoder(
      vocab_size = vocab_size,
      num_layers = num_layers,
      dff = dff,
      d_model = d_model,
      num_heads = num_heads,
      dropout = dropout)(inputs = [dec_inputs, enc_outputs,
                                   look_ahead_mask,
                                   dec_padding_mask])

  outputs = Dense(units = vocab_size, name = 'outputs')(dec_outputs)

  return Model(inputs = [inputs, dec_inputs],
               outputs = outputs, name = name)

In [33]:
def loss_function(y_true, y_pred):
  y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))

  loss = keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')(y_true, y_pred)

  mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
  loss = tf.multiply(loss, mask)

  return tf.reduce_mean(loss)

In [34]:
class CustomSchedule(keras.optimizers.schedules.LearningRateSchedule):

  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()
    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)
    self.warmup_steps = warmup_steps

  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps**-1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [35]:
def accuracy(y_true, y_pred):
  y_true = tf.reshape(y_true, shape = (-1, MAX_LENGTH -1))

  return keras.metrics.sparse_categorical_accuracy(y_true, y_pred)

## 6. 모델 생성

In [37]:
%%time

D_MODEL = 256
NUM_LAYERS = 2
NUM_HEADS = 8
DFF = 512
DROPOUT = 0.1

def create_model():
  model = transformer(
      vocab_size = VOCAB_SIZE,
      num_layers = NUM_LAYERS,
      dff = DFF,
      d_model = D_MODEL,
      num_heads = NUM_HEADS,
      dropout = DROPOUT)
  return model

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 12.4 µs


In [38]:
with strategy.scope():
  model = create_model()
  learning_rate = CustomSchedule(D_MODEL)
  optimizer = keras.optimizers.Adam(
      learning_rate, beta_1 = 0.9, beta_2 = 0.98, epsilon = 1e-9)  

  model.compile(optimizer = optimizer,
                  loss = loss_function,
                  metrics = [accuracy])

(1, 8180, 256)
(1, 8180, 256)


In [52]:
%%time
EPOCHS = 10
hist = model.fit(dataset, epochs = EPOCHS)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 20.7 s, sys: 2.92 s, total: 23.7 s
Wall time: 1min 3s


## 7. 챗봇 평가하기

In [40]:
def evaluate(sentence) :
  sentence = preprocess_sentence(sentence)

  sentence = tf.expand_dims(
      START_TOKEN + tokenizer.encode(sentence) + END_TOKEN, axis = 0)
  
  output = tf.expand_dims(START_TOKEN, 0)

  # 디코더의 예측 시작
  for i in range(MAX_LENGTH) :
    predictions = model(inputs = [sentence, output],
                        training = False)
    
    # 현재(마지막) 시점의 예측 단어를 받아옴
    predictions = predictions[:, -1:, :]
    predicted_id = tf.cast(tf.argmax(predictions, axis = -1), tf.int32)

    # 마지막 시점의 예측 단어가 종료 토큰이면 예측 중단
    if tf.equal(predicted_id, END_TOKEN[0]):
       break

    # 마지막 시점의 예측 단어를 출력에 연결
    output = tf.concat([output, predicted_id], axis = -1)

  return tf.squeeze(output, axis = 0)

In [41]:
def predict(sentence):
  prediction = evaluate(sentence)

  predicted_sentence = tokenizer.decode(
      [i for i in prediction if i < tokenizer.vocab_size])
  
  print('Input: {}' .format(sentence))
  print('Output: {}' .format(predicted_sentence))

  return predicted_sentence

In [42]:
def preprocess_sentence(sentence):
  sentence = re.sub(r'([?.!,])' , r' \1' , sentence)
  sentence = sentence.strip()

  return sentence

In [53]:
predict('안녕')

Input: 안녕
Output: 안녕하세요 .


'안녕하세요 .'

In [54]:
predict('뭐해')

Input: 뭐해
Output: 냉장고 파먹기 해보세요 .


'냉장고 파먹기 해보세요 .'

In [55]:
predict('냉장고를 왜 파먹어 ㅋㅋ')

Input: 냉장고를 왜 파먹어 ㅋㅋ
Output: 슈퍼라도 가서 쇼핑하고 오세요 .


'슈퍼라도 가서 쇼핑하고 오세요 .'

In [56]:
predict('뭐 살까?')

Input: 뭐 살까?
Output: 함께 충분한 대화를 하고 상담을 받아보는 게 좋겠어요 .


'함께 충분한 대화를 하고 상담을 받아보는 게 좋겠어요 .'

In [57]:
predict('호엥')

Input: 호엥
Output: 음~ 곰곰히 생각해보세요 .


'음~ 곰곰히 생각해보세요 .'

In [58]:
predict('졸려')

Input: 졸려
Output: 오늘 일찍 주무세요 .


'오늘 일찍 주무세요 .'