# 🇰🇷 Seq2Seq Q&A Chatbot (Korean, SentencePiece) — 프로젝트 노트북

이 노트북은 **한국어 Q&A Chatbot**을 위한 **Seq2Seq (Encoder–Decoder, Teacher Forcing)** 파이프라인 구현 예시입니다.  
요구사항을 반영하여 다음 단계를 모두 포함합니다.

1. **데이터 취득**: 공개 한국어 챗봇 데이터셋(`ChatbotData.csv`) 로드  
2. **데이터 전처리**: 정리, 정규화, 분리  
3. **토크나이저 학습 (SentencePiece)**  
   - `<bos>, <eos>, <pad>, <oov>` 지정  
   - `set_encode_extra_options(':')`, `bos:`, `:eos`, `bos:eos` 사용 예시  
4. **학습용 텐서 생성**: `Q_input`, `A_input`, `A_target`  
5. **모델 생성**: Encoder, Decoder(Teacher Forcing)  
6. **학습**  
7. **모델 추론**: Inference용 Decoder, Greedy Decode 함수  

In [2]:

import os, re, random
from pathlib import Path
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
import sentencepiece as spm
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, optimizers

print("TensorFlow:", tf.__version__)
SEED=42; np.random.seed(SEED); random.seed(SEED); tf.random.set_seed(SEED)

DATA_URL="https://raw.githubusercontent.com/songys/Chatbot_data/refs/heads/master/ChatbotData.csv"
DATA_DIR=Path("./data"); DATA_DIR.mkdir(exist_ok=True, parents=True)
RAW_CSV=DATA_DIR/"ChatbotData.csv"
SPM_PREF=str(DATA_DIR/"spm_kor_v6b")

VOCAB_SIZE=4000
EMB_DIM=256; HID_DIM=384
MAX_LEN_Q=28; MAX_LEN_A=28
BATCH_SIZE=32; EPOCHS=25
DROPOUT=0.2


TensorFlow: 2.16.1


In [3]:

# 1) 데이터
df=pd.read_csv(DATA_URL)[['Q','A']].dropna()
df['Q']=df['Q'].astype(str).str.strip(); df['A']=df['A'].astype(str).str.strip()
df=df[(df['Q']!='')&(df['A']!='')].drop_duplicates().reset_index(drop=True)
df.to_csv(RAW_CSV, index=False, encoding='utf-8')
print("Raw:", df.shape); df.head(3)


Raw: (11750, 2)


Unnamed: 0,Q,A
0,12시 땡!,하루가 또 가네요.
1,1지망 학교 떨어졌어,위로해 드립니다.
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.


In [4]:

# 2) 전처리
def norm_ko(t):
    t=str(t)
    t=re.sub(r"[\t]+"," ",t)
    t=re.sub(r"\s+", " ", t)
    return t.strip()

df['Q_norm']=df['Q'].apply(norm_ko)
df['A_norm']=df['A'].apply(norm_ko)
df=df[(df['Q_norm'].str.len()>0)&(df['A_norm'].str.len()>0)]
train_df, val_df = train_test_split(df[['Q_norm','A_norm']], test_size=0.1, random_state=SEED, shuffle=True)
len(train_df), len(val_df)


(10575, 1175)

In [5]:

# 3) SentencePiece (4k vocab)
corpus_path=DATA_DIR/"spm_corpus_v6b.txt"
with open(corpus_path,"w",encoding="utf-8") as f:
    for s in pd.concat([train_df['Q_norm'], train_df['A_norm']], axis=0):
        f.write(s+"\n")

spm_cmd=(f"--input={corpus_path} --model_prefix={SPM_PREF} --vocab_size={VOCAB_SIZE} "
         f"--model_type=unigram --character_coverage=0.9995 "
         f"--max_sentence_length=999999 "
         f"--pad_id=0 --pad_piece=<pad> --bos_id=1 --bos_piece=<bos> "
         f"--eos_id=2 --eos_piece=<eos> --unk_id=3 --unk_piece=<oov>")
spm.SentencePieceTrainer.train(spm_cmd)
sp=spm.SentencePieceProcessor(model_file=str(Path(SPM_PREF + ".model")))
print("Vocab:", sp.vocab_size())

def sp_encode_ids(text, extra=':'):
    sp.set_encode_extra_options(extra)
    ids=sp.encode(str(text), out_type=int)
    sp.set_encode_extra_options('')
    return ids


Vocab: 4000


sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=data/spm_corpus_v6b.txt --model_prefix=data/spm_kor_v6b --vocab_size=4000 --model_type=unigram --character_coverage=0.9995 --max_sentence_length=999999 --pad_id=0 --pad_piece=<pad> --bos_id=1 --bos_piece=<bos> --eos_id=2 --eos_piece=<eos> --unk_id=3 --unk_piece=<oov>
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: data/spm_corpus_v6b.txt
  input_format: 
  model_prefix: data/spm_kor_v6b
  model_type: UNIGRAM
  vocab_size: 4000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 999999
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  re

In [6]:

# 4) 텐서 + PAD 마스킹
PAD_ID,BOS_ID,EOS_ID,OOV_ID=0,1,2,3
VOCAB_SIZE=sp.vocab_size()

def encode_pair(q,a):
    return sp_encode_ids(q, ":"), sp_encode_ids(a, "bos:"), sp_encode_ids(a, ":eos")

def pad_to(ids, L): return ids[:L] if len(ids)>=L else ids+[PAD_ID]*(L-len(ids))

def build_tensors(frame):
    Qs,Ains,Atgts=[],[],[]
    for q,a in zip(frame['Q_norm'], frame['A_norm']):
        q_ids,a_in,a_tgt=encode_pair(q,a)
        Qs.append(pad_to(q_ids,MAX_LEN_Q))
        Ains.append(pad_to(a_in,MAX_LEN_A))
        Atgts.append(pad_to(a_tgt,MAX_LEN_A))
    Xq=np.array(Qs,np.int32); Xin=np.array(Ains,np.int32); Y=np.array(Atgts,np.int32)
    W=(Y!=PAD_ID).astype("float32")  # PAD 무시
    return Xq,Xin,Y,W

X_enc_tr,X_dec_in_tr,Y_tr,W_tr=build_tensors(train_df)
X_enc_va,X_dec_in_va,Y_va,W_va=build_tensors(val_df)

X_enc_tr.shape, X_dec_in_tr.shape, Y_tr.shape, W_tr.shape


((10575, 28), (10575, 28), (10575, 28), (10575, 28))

In [7]:

# 5) 모델 — BiLSTM Encoder + Custom Luong (No custom Dense)
embedding=layers.Embedding(VOCAB_SIZE, EMB_DIM, mask_zero=True, name="tok_emb")

enc_inputs=layers.Input(shape=(None,), name="enc_in")
dec_inputs=layers.Input(shape=(None,), name="dec_in")

# Encoder
enc_emb=embedding(enc_inputs)
enc_emb=layers.Dropout(DROPOUT)(enc_emb)
enc_bi=layers.Bidirectional(layers.LSTM(HID_DIM//2, return_sequences=True, name="enc_lstm"), name="bilstm_enc")(enc_emb)
enc_bi=layers.LayerNormalization()(enc_bi)

# Decoder
dec_emb=embedding(dec_inputs)
dec_emb=layers.Dropout(DROPOUT)(dec_emb)
dec_lstm=layers.LSTM(HID_DIM, return_sequences=True, return_state=True, name="dec_lstm")
enc_mean = layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(enc_bi)
init_h = layers.Dense(HID_DIM, activation="tanh")(enc_mean)
init_c = layers.Dense(HID_DIM, activation="tanh")(enc_mean)
dec_out, _, _ = dec_lstm(dec_emb, initial_state=[init_h, init_c])
dec_out=layers.LayerNormalization()(dec_out)

# Mask
enc_mask_bool=embedding.compute_mask(enc_inputs)
enc_mask=layers.Lambda(lambda m: tf.cast(m, tf.float32))(enc_mask_bool)
enc_mask_exp=layers.Lambda(lambda m: tf.expand_dims(m,1))(enc_mask)

# Luong
score=layers.Lambda(lambda xy: tf.matmul(xy[0], tf.transpose(xy[1],perm=[0,2,1])))([dec_out, enc_bi])
minus_inf=layers.Lambda(lambda m: (1.0-m)*-1e9)(enc_mask_exp)
score=layers.Add()([score, minus_inf])
attn_w=layers.Activation("softmax")(score)
context=layers.Lambda(lambda xy: tf.matmul(xy[0], xy[1]))([attn_w, enc_bi])
dec_cat=layers.Concatenate()([dec_out, context])

# Standard output Dense (no tied weights)
logits=layers.Dense(VOCAB_SIZE, activation=None, name="out_dense")(dec_cat)

model=models.Model([enc_inputs, dec_inputs], logits, name="seq2seq_v6b")
model.summary()

loss_fn=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
opt=optimizers.Adam(learning_rate=3e-4, clipnorm=1.0)
model.compile(optimizer=opt, loss=loss_fn, metrics=['accuracy'])


2025-08-27 03:05:44.215133: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2025-08-27 03:05:44.215163: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-08-27 03:05:44.215180: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-08-27 03:05:44.215197: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-08-27 03:05:44.215209: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [8]:

# 6) 학습
ckpt=Path("./checkpoints_v6b"); ckpt.mkdir(exist_ok=True)
cbs=[
    callbacks.ModelCheckpoint(str(ckpt/"weights.keras"), save_best_only=True, monitor="val_accuracy", mode="max"),
    callbacks.EarlyStopping(monitor="val_accuracy", patience=5, restore_best_weights=True),
    callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, min_lr=1e-5, verbose=1),
]
history=model.fit([X_enc_tr, X_dec_in_tr], Y_tr, sample_weight=W_tr,
                  validation_data=([X_enc_va, X_dec_in_va], Y_va, W_va),
                  epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1, callbacks=cbs)


Epoch 1/25


2025-08-27 03:05:45.950802: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 137ms/step - accuracy: 0.0714 - loss: 1.6733 - val_accuracy: 0.0846 - val_loss: 1.5326 - learning_rate: 3.0000e-04
Epoch 2/25
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 129ms/step - accuracy: 0.0106 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan - learning_rate: 3.0000e-04
Epoch 3/25
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step - accuracy: 0.0000e+00 - loss: nan
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0001500000071246177.
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 130ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan - learning_rate: 3.0000e-04
Epoch 4/25
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 128ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan - learning_rate: 1.5000e-04
Epoch 5/25
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━

In [9]:

# 7) 추론/채팅 (동일)
encoder_model=models.Model(enc_inputs, [enc_bi, init_h, init_c], name="enc_infer")

dec_state_h=layers.Input(shape=(HID_DIM,), name="dec_state_h")
dec_state_c=layers.Input(shape=(HID_DIM,), name="dec_state_c")
enc_seq_in=layers.Input(shape=(None,HID_DIM), name="enc_seq_in")
dec_tok_in=layers.Input(shape=(1,), name="dec_tok_in")

dec_tok_emb=embedding(dec_tok_in)
dec_step, n_h, n_c = dec_lstm(dec_tok_emb, initial_state=[dec_state_h, dec_state_c])

score_step = layers.Lambda(lambda xy: tf.matmul(xy[0], tf.transpose(xy[1], perm=[0,2,1])))([dec_step, enc_seq_in])
attn_w_step = layers.Activation("softmax")(score_step)
context_step = layers.Lambda(lambda xy: tf.matmul(xy[0], xy[1]))([attn_w_step, enc_seq_in])
dec_cat_step = layers.Concatenate()([dec_step, context_step])
logits_step = model.get_layer("out_dense")(dec_cat_step)

decoder_model=models.Model([dec_tok_in, dec_state_h, dec_state_c, enc_seq_in],
                           [logits_step, n_h, n_c], name="dec_step")

def _norm(t): return re.sub(r"[ \t]+"," ",str(t)).strip()
def _prep_q(q):
    q_ids=sp_encode_ids(_norm(q), ":")
    q_ids=q_ids[:MAX_LEN_Q]+[PAD_ID]*(MAX_LEN_Q-len(q_ids))
    arr=np.array(q_ids, np.int32)[None,:]
    enc_seq, h, c = encoder_model.predict(arr, verbose=0)
    return enc_seq, h, c

def _ban_basic(logits):
    logits[PAD_ID]=-1e9; logits[BOS_ID]=-1e9
def _ban_repeating_ngrams(logits, ids, n=3):
    if n<=1 or len(ids)<n-1: return
    grams=set(tuple(ids[i:i+n]) for i in range(len(ids)-n+1))
    prefix=tuple(ids[-(n-1):])
    V=logits.shape[-1]
    for v in range(V):
        if prefix+(v,) in grams: logits[v]=-1e9
def _cleanup(text: str) -> str:
    text=re.sub(r"(\b\S{1,6})( \1){2,}", r"\1 \1", text)
    text=re.sub(r"(.)\1{3,}", r"\1\1", text)
    text=re.sub(r"\s+", " ", text).strip()
    return text

def greedy_decode(q, max_len=28, no_repeat_ngram=3):
    enc_seq,h,c=_prep_q(q); cur=BOS_ID; out=[]
    for _ in range(max_len):
        token=np.array([[cur]], np.int32)
        logits,h,c=decoder_model.predict([token,h,c,enc_seq], verbose=0)
        logits=logits[0,-1,:]
        _ban_basic(logits); _ban_repeating_ngrams(logits, out, n=no_repeat_ngram)
        nid=int(np.argmax(logits))
        if nid in (EOS_ID, PAD_ID): break
        out.append(nid); cur=nid
    return _cleanup(sp.decode(out))

def sampling_decode(q, max_len=28, top_p=0.9, temperature=0.9, no_repeat_ngram=3):
    enc_seq,h,c=_prep_q(q); cur=BOS_ID; out=[]
    for _ in range(max_len):
        token=np.array([[cur]], np.int32)
        logits,h,c=decoder_model.predict([token,h,c,enc_seq], verbose=0)
        logits=logits[0,-1,:]/max(1e-6,temperature)
        _ban_basic(logits); _ban_repeating_ngrams(logits, out, n=no_repeat_ngram)
        probs=tf.nn.softmax(tf.convert_to_tensor(logits)).numpy()
        idx=np.argsort(-probs); probs=probs[idx]; cum=probs.cumsum()
        cut=np.searchsorted(cum, top_p)+1; idx=idx[:cut]; probs=probs[:cut]; probs=probs/probs.sum()
        nid=int(np.random.choice(idx, p=probs))
        if nid in (EOS_ID, PAD_ID): break
        out.append(nid); cur=nid
    return _cleanup(sp.decode(out))

def beam_search_decode(q, max_len=28, beam_size=5, length_norm_alpha=0.8,
                       repetition_penalty=1.15, no_repeat_ngram=3):
    enc_seq,h0,c0=_prep_q(q)
    beams=[(0.0,[BOS_ID],h0,c0)]; completed=[]
    for _ in range(max_len):
        new=[]
        for lp,ids,h,c in beams:
            cur=ids[-1]
            if cur==EOS_ID: completed.append((lp,ids)); continue
            token=np.array([[cur]], np.int32)
            logits,nh,nc=decoder_model.predict([token,h,c,enc_seq], verbose=0)
            logits=logits[0,-1,:]
            _ban_basic(logits); _ban_repeating_ngrams(logits, ids[1:], n=no_repeat_ngram)
            if repetition_penalty>1.0:
                for v in set(ids): logits[v]=logits[v]/repetition_penalty
            topk=int(beam_size*2)
            cand=np.argpartition(-logits, topk)[:topk]
            logp=tf.nn.log_softmax(tf.convert_to_tensor(logits)).numpy()
            for v in cand:
                new.append((lp+float(logp[v]), ids+[int(v)], nh, nc))
        if not new: break
        new.sort(key=lambda x:x[0], reverse=True)
        beams=new[:beam_size]
    completed.extend([(lp,ids) for lp,ids,_,_ in beams])
    def score(lp, ids):
        L=max(1,len([i for i in ids if i not in (BOS_ID, PAD_ID)]))
        return lp/(((5+L)**length_norm_alpha)/(6**length_norm_alpha))
    best=max(completed, key=lambda x: score(x[0], x[1]))
    out=[i for i in best[1] if i not in (BOS_ID, EOS_ID, PAD_ID)]
    return _cleanup(sp.decode(out))

# quick sanity (학습 후 실행 권장)
for q in ["안녕?", "오늘 기분 어때?", "취미가 뭐야?"]:
    try:
        print("Q:", q, "\nA(beam):", beam_search_decode(q))
    except Exception as e:
        print("Decode error (학습 전):", e)




Q: 안녕? 
A(beam): 잘.
Q: 오늘 기분 어때? 
A(beam): 잘. 거예요
Q: 취미가 뭐야? 
A(beam): 잘. 거예요.


In [11]:

# 8) 평가 & 콘솔형 채팅
def preview_predictions(n=10, method="beam"):
    idx=np.random.choice(len(val_df), size=min(n,len(val_df)), replace=False)
    for i in idx:
        q=val_df.iloc[i]['Q_norm']; ref=val_df.iloc[i]['A_norm']
        pred=beam_search_decode(q) if method=="beam" else greedy_decode(q)
        print(f"Q: {q}\nA*: {ref}\nÂ : {pred}\n"+"-"*80)

def chat_cli(mode="beam"):
    print("대화를 시작합니다. '종료' 입력 시 종료합니다.")
    while True:
        try:
            user=input("나: ").strip()
        except EOFError:
            print("\n입력 종료"); break
        if not user: 
            continue
        if user=="종료":
            print("챗봇: 안녕!"); break
        try:
            if mode=="beam":
                ans=beam_search_decode(user, max_len=MAX_LEN_A)
            elif mode=="sample":
                ans=sampling_decode(user, max_len=MAX_LEN_A)
            else:
                ans=greedy_decode(user, max_len=MAX_LEN_A)
        except Exception as e:
            ans=f"(디코딩 오류: {e})"
        print("챗봇:", ans)

# preview_predictions(10, method="beam")
# chat_cli("beam")


In [15]:
# 미리 위 셀(디코더 정의 등)을 모두 실행한 뒤에!
chat_cli("beam")   # 콘솔 입력 대화 시작

대화를 시작합니다. '종료' 입력 시 종료합니다.
챗봇: 잘.
챗봇: 잘. 거예요.
챗봇: . 게 거예요
챗봇: 안녕!
