In [1]:
!pip install konlpy

Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 1.4MB/s 
Collecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 9.6MB/s 
[?25hCollecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/de/af/93f92b38ec1ff3091cd38982ed19cea2800fefb609b5801c41fc43c0781e/JPype1-1.2.1-cp36-cp36m-manylinux2010_x86_64.whl (457kB)
[K     |████████████████████████████████| 460kB 48.0MB/s 
[?25hCollecting tweepy>=3.7.0
  Downloading https://files.pythonhosted.org/packages/67/c3/6bed87f3b1e5ed2f34bd58bf7978e308c86e255193916be76e5a5ce5dfca/tweepy-3.10.0-py2.py3-none-any.whl
Collecting colorama
  Download

In [2]:
import random
import tensorflow as tf
from konlpy.tag import Okt

EPOCHS = 200
NUM_WORDS = 2000

In [14]:
class Encoder(tf.keras.Model):
  def __init__(self):
    super(Encoder, self).__init__()
    self.emb = tf.keras.layers.Embedding(NUM_WORDS, 64)
    self.lstm = tf.keras.layers.LSTM(512, return_state=True)
  
  def call(self, x, training=False, mask=None):
    x = self.emb(x)
    _, h, c = self.lstm(x)
    return h,c

In [22]:
class Decoder(tf.keras.Model):
  def __init__(self):
    super(Decoder, self).__init__()
    self.emb= tf.keras.layers.Embedding(NUM_WORDS, 64)
    self.lstm = tf.keras.layers.LSTM(512, return_sequences=True, return_state=True)
    self.dense = tf.keras.layers.Dense(NUM_WORDS, activation='softmax')

  def call(self, inputs, training=False, mask=None):
    x, h, c = inputs
    x = self.emb(x)
    x, h, c = self.lstm(x, initial_state=[h,c])
    return self.dense(x), h, c

In [23]:
class Seq2seq(tf.keras.Model):
  def __init__(self, sos, eos):
    super(Seq2seq, self).__init__()
    self.enc = Encoder()
    self.dec = Decoder()
    self.sos = sos
    self.eos = eos
  def call(self, inputs, training=False, mask=None):
    if training is True:
      x, y = inputs
      h, c = self.enc(x)
      y, _, _ = self.dec((y, h, c))
      return y
    else:
      x = inputs
      h, c = self.enc(x)
      y = tf.convert_to_tensor(self.sos)
      y = tf.reshape(y, (1, 1))
      seq = tf.TensorArray(tf.int32, 64)
      for idx in tf.range(64):
        y, h, c = self.dec([y, h, c])
        y = tf.cast(tf.argmax(y, axis=-1), dtype=tf.int32)
        y = tf.reshape(y, (1, 1))
        seq = seq.write(idx, y)
        if y == self.eos:
          break
      return tf.reshape(seq.stack(), (1, 64))

In [28]:
@tf.function
def train_step(model, inputs, labels, loss_object, optimizer, train_loss, train_accuracy):
  output_labels = labels[:, 1:]
  shifted_labels = labels[:, :-1]
  with tf.GradientTape() as tape:
    predictions = model([inputs, shifted_labels], training=True)
    loss = loss_object(output_labels, predictions)
  gradients = tape.gradient(loss, model.trainable_variables)

  optimizer.apply_gradients(zip(gradients, model.trainable_variables))
  train_loss(loss)
  train_accuracy(output_labels, predictions)

@tf.function
def test_step(model, inputs):
  return model(inputs, training=False)

In [29]:
dataset_file = '/content/drive/MyDrive/Colab Notebooks/datas/chatbot_data.csv' 
okt = Okt()

with open(dataset_file, 'r') as file:
  lines = file.readlines()
  seq = [' '.join(okt.morphs(line)) for line in lines]

questions = seq[::2]
answers = ['\t ' + lines for lines in seq[1::2]]

num_sample = len(questions)

perm = list(range(num_sample))
random.seed(0)
random.shuffle(perm)

train_q = list()
train_a = list()
test_q = list()
test_a = list()

for idx, qna in enumerate(zip(questions, answers)):
  q, a = qna
  if perm[idx] > num_sample//5:
    train_q.append(q)
    train_a.append(a)
  else:
    test_q.append(q)
    test_a.append(a)
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=NUM_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~')
tokenizer.fit_on_texts(train_q + train_a)

train_q_seq = tokenizer.texts_to_sequences(train_q)
train_a_seq = tokenizer.texts_to_sequences(train_a)

test_q_seq = tokenizer.texts_to_sequences(test_q)
test_a_seq = tokenizer.texts_to_sequences(test_a)

x_train = tf.keras.preprocessing.sequence.pad_sequences(train_q_seq, value=0, padding='pre', maxlen=64)
y_train = tf.keras.preprocessing.sequence.pad_sequences(train_a_seq, value=0, padding='post', maxlen=65)

x_test = tf.keras.preprocessing.sequence.pad_sequences(test_q_seq, value=0, padding='pre', maxlen=64)
y_test = tf.keras.preprocessing.sequence.pad_sequences(test_a_seq, value=0, padding='post', maxlen=65)

train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(10000).batch(32).prefetch(1024)
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(1).prefetch(1024)

In [30]:
model = Seq2seq(sos=tokenizer.word_index['\t'], eos=tokenizer.word_index['\n'])

loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

In [31]:
for epoch in range(EPOCHS):
  for seqs, labels in train_ds:
    train_step(model, seqs, labels, loss_object, optimizer, train_loss, train_accuracy)

  template = 'Epoch {}, Loss: {}, Accuracy: {}'
  print(template.format(epoch + 1, train_loss.result(), train_accuracy.result() * 100))

  train_loss.reset_states()
  train_accuracy.reset_states()

Epoch 1, Loss: 3.094630002975464, Accuracy: 83.20801544189453
Epoch 2, Loss: 0.6022385358810425, Accuracy: 90.88346099853516
Epoch 3, Loss: 0.5638185143470764, Accuracy: 91.0753402709961
Epoch 4, Loss: 0.5472832918167114, Accuracy: 91.13408660888672
Epoch 5, Loss: 0.5439242124557495, Accuracy: 91.1536636352539
Epoch 6, Loss: 0.5402533411979675, Accuracy: 91.11058807373047
Epoch 7, Loss: 0.534101665019989, Accuracy: 91.09100341796875
Epoch 8, Loss: 0.5339129567146301, Accuracy: 91.17716217041016
Epoch 9, Loss: 0.5275033712387085, Accuracy: 91.14974975585938
Epoch 10, Loss: 0.522671103477478, Accuracy: 91.18890380859375
Epoch 11, Loss: 0.5094152092933655, Accuracy: 91.19282531738281
Epoch 12, Loss: 0.5102260112762451, Accuracy: 91.26331329345703
Epoch 13, Loss: 0.4912428855895996, Accuracy: 91.3690414428711
Epoch 14, Loss: 0.4766705334186554, Accuracy: 91.53352355957031
Epoch 15, Loss: 0.4577277898788452, Accuracy: 91.90946197509766
Epoch 16, Loss: 0.4544192850589752, Accuracy: 92.144424

In [32]:
for test_seq, test_labels in test_ds:
  prediction = test_step(model, test_seq)
  test_text = tokenizer.sequences_to_texts(test_seq.numpy())
  gt_text = tokenizer.sequences_to_texts(test_labels.numpy())
  texts = tokenizer.sequences_to_texts(prediction.numpy())
  print('-')
  print('q: ', test_text)
  print('a: ', gt_text)
  print('p: ', texts)

-
q:  ['여기 기프티콘 되죠 \n']
a:  ['\t 네 현금영수증 해드릴까 요 \n']
p:  ['여기 진동 벨 가지 고 계시다가 울리면 주문 한 음료 가져가세요 \n']
-
q:  ['네 에 테이크 아웃 도 가능한가요 \n']
a:  ['\t 네 로 오시 면 테이크 아웃 잔 에 담아 드려요 \n']
p:  ['아뇨 현재 법적 으로 금지 하고 있어요 \n']
-
q:  ['아메리카노 톨 사이즈 로 주세요 \n']
a:  ['\t 따뜻한 거 로 드릴 까요 \n']
p:  ['사이즈 는 뭘 로 드릴 까요 \n']
-
q:  ['진동 을 따로 주시나요 \n']
a:  ['\t 주 번호 로 드리겠습니다 \n']
p:  ['네 그건 시즌 한정 메뉴 라 겨울 에는 판매 하지 않습니다 \n']
-
q:  ['자리 있나요 \n']
a:  ['\t 네 있습니다 \n']
p:  ['네 영수증 드릴게요 \n']
-
q:  ['그럼 루이보스 밀크 티 하나 \n']
a:  ['\t 네 알겠습니다 \n']
p:  ['네 알겠습니다 \n']
-
q:  ['다음 에 무료 로 하고 엔 도장 찍어주세요 \n']
a:  ['\t 네 \n']
p:  ['2 개 찍어 드렸고 진동 벨 로 알려 드리겠습니다 \n']
-
q:  ['아메리카노 한 잔 에 얼마 죠 \n']
a:  ['\t 입니다 \n']
p:  ['4000원 입니다 \n']
-
q:  ['얼마나 \n']
a:  ['\t 바로 만들어 드릴게요 \n']
p:  ['진동 벨 로 알려 드리겠습니다 \n']
-
q:  ['카푸치노 는 로 주시 고 아메리카노 는 로 \n']
a:  ['\t 네 더 없으세요 \n']
p:  ['드시고 가시나요 \n']
-
q:  ['아메리카노 는 어떤 종류 가 있나요 \n']
a:  ['\t 디카 페인 과 기본 아메리카노 2 종류 있습니다 \n']
p:  ['네 초코 머핀 이랑 치즈케이크 있습니다 \n']
-
q:  ['카카오 페이 로 결제 가능한가요 \n']
a:  ['\t 네 가능합니다 \n']
p:  [