# Seq2Seq Q&A Chatbot 구현 실습

### 데이터 취득

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/songys/Chatbot_data/refs/heads/master/ChatbotData.csv')
df = df[['Q', 'A']]
df

Unnamed: 0,Q,A
0,12시 땡!,하루가 또 가네요.
1,1지망 학교 떨어졌어,위로해 드립니다.
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.
4,PPL 심하네,눈살이 찌푸려지죠.
...,...,...
11818,훔쳐보는 것도 눈치 보임.,티가 나니까 눈치가 보이는 거죠!
11819,훔쳐보는 것도 눈치 보임.,훔쳐보는 거 티나나봐요.
11820,흑기사 해주는 짝남.,설렜겠어요.
11821,힘든 연애 좋은 연애라는게 무슨 차이일까?,잘 헤어질 수 있는 사이 여부인 거 같아요.


### 데이터 전처리

In [3]:
import re

def preprocess_text(text):
    text = re.sub(r"[ ]+", " ", text)
    text = text.strip()
    return text

# 질문과 답변에 전처리 적용 
df['Q'] = df['Q'].apply(preprocess_text)
df['A'] = df['A'].apply(preprocess_text)

# 학습용 텍스트 파일 생성 (Q + A)
with open('chatbot_train_data.txt', 'w', encoding='utf-8') as f:
    for q, a in zip(df['Q'], df['A']):
        f.write(q + '\t' + a + '\n')

##### 토커나이저 학습 (sentencepiece 활용)
- 접두사, 접미사 처리

In [4]:
import sentencepiece as spm

# Sentencepiece 모델 학습
spm.SentencePieceTrainer.Train(
    '--input=chatbot_train_data.txt --model_prefix=chatbot_model --vocab_size=2000'
)

# 학습된 토크나이저 로드
sp = spm.SentencePieceProcessor()
sp.Load('chatbot_model.model')

True

##### 학습용 데이터 Q_input, A_input, A_target 생성

In [5]:
Q_input = [sp.EncodeAsIds(q) for q in df['Q']]
A_input = [sp.EncodeAsIds(a) for a in df['A']]
A_target = [ids + [sp.piece_to_id('</s')] for ids in A_input]  # 디코더 종료 토큰 추가

### 모델 생성

In [6]:
import tensorflow as tf
from tensorflow.keras import layers

In [7]:
# 하이퍼파라미터 설정
VOCAB_SIZE = 5000
EMBEDDING_DIM = 256
HIDDEN_UNITS = 512

##### 인코더 생성

In [8]:
def build_encoder():
    inputs = tf.keras.Input(shape=(None, ), dtype='int32')
    embedding = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
    outputs, h, c = layers.LSTM(HIDDEN_UNITS, return_state=True)(embedding)
    return tf.keras.Model(inputs, [outputs, h, c])

encoder = build_encoder()

##### 디코더 (teacher-forcing 모델) 생성

In [9]:
def build_decoder():
    inputs = tf.keras.Input(shape=(None, ), dtype='int32')
    encoder_outputs = tf.keras.Input(shape=(HIDDEN_UNITS, ))
    encoder_h = tf.keras.Input(shape=(HIDDEN_UNITS, ))
    encoder_c = tf.keras.Input(shape=(HIDDEN_UNITS, ))

    embedding = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
    lstm = layers.LSTM(HIDDEN_UNITS, return_sequences=True, return_state=True)
    outputs, _, _ = lstm(embedding, initial_state=[encoder_h, encoder_c])
    outputs = layers.Dense(VOCAB_SIZE, activation='softmax')(outputs)

    return tf.keras.Model([inputs, encoder_outputs, encoder_h, encoder_c], outputs)

decoder = build_decoder()

### 모델 학습

In [10]:
# 모델 학습 준비
class Seq2Seq(tf.keras.Model):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def call(self, inputs):
        encoder_inputs, decoder_inputs = inputs

        encoder_outputs, h, c = self.encoder(encoder_inputs)
        decoder_outputs = self.decoder([decoder_inputs, encoder_outputs, h, c])

        return decoder_outputs
    
model = Seq2Seq(encoder, decoder)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


##### 학습

In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 50

Q_input = pad_sequences(Q_input, maxlen=max_len, padding='post')
A_input = pad_sequences(A_input, maxlen=max_len, padding='post')
A_target = pad_sequences(A_target, maxlen=max_len, padding='post')

In [12]:
model.fit([Q_input, A_input], A_target, batch_size=64, epochs=20)

Epoch 1/20
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 524ms/step - accuracy: 0.8100 - loss: 1.9369
Epoch 2/20
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 534ms/step - accuracy: 0.8720 - loss: 0.7957
Epoch 3/20
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 532ms/step - accuracy: 0.9123 - loss: 0.5402
Epoch 4/20
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 531ms/step - accuracy: 0.9478 - loss: 0.3305
Epoch 5/20
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 535ms/step - accuracy: 0.9745 - loss: 0.1847
Epoch 6/20
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 534ms/step - accuracy: 0.9894 - loss: 0.0944
Epoch 7/20
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 536ms/step - accuracy: 0.9954 - loss: 0.0485
Epoch 8/20
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 530ms/step - accuracy: 0.9976 - loss: 0.0275
Epoch 9/20
[1m1

<keras.src.callbacks.history.History at 0x2330e630140>

##### 디코더 (추론 모델) 생성

In [13]:
def decode_sequence(input_seq):
    encoder_outputs, h, c = encoder(input_seq)
    target_seq = tf.zeros((1, 1), dtype=tf.int32)

    decoded_sentence = ''

    for _ in range(max_len):
        output_tokens = decoder([target_seq, encoder_outputs, h, c])

        # 배열이 아닌 정수로 변환
        sampled_token_index = int(tf.argmax(output_tokens, axis=-1).numpy()[0, 0])

        sampled_token = sp.IdToPiece(sampled_token_index)
        decoded_sentence += sampled_token + ' '

        if sampled_token == '</s>':
            break

        target_seq = tf.constant([[sampled_token_index]])

    return decoded_sentence

### 추론 함수
- 추론 함수 생성 및 테스트

##### 간단한 챗봇 구현

In [None]:
while True:
    user_input = input('User: ')
    if user_input == 'exit':
        break

    input_seq = pad_sequences([sp.EncodeAsIds(user_input)], maxlen=max_len, padding='post')
    response = decode_sequence(input_seq)
    print('Chatbot:', response.strip())