In [1]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df = pd.read_csv('ChatBotData.csv')
df.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [55]:
questions = df['Q'].tolist()
answers = df['A'].tolist()

In [56]:
# 텍스트 전처리
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9가-힣\s]", "", text)
    return text

In [57]:
questions = [preprocess_text(q) for q in questions]
answers = ["<start> " + preprocess_text(a) + " <end>" for a in answers]

In [58]:
train_Q, test_Q, train_A, test_A = train_test_split(questions, answers, test_size=0.2, random_state=42)

In [59]:
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(train_Q + train_A)

In [60]:
if '<start>' not in tokenizer.word_index:
    tokenizer.word_index['<start>'] = len(tokenizer.word_index) + 1
if '<end>' not in tokenizer.word_index:
    tokenizer.word_index['<end>'] = len(tokenizer.word_index) + 1

In [61]:
train_Q_sequences = tokenizer.texts_to_sequences(train_Q)
train_A_sequences = tokenizer.texts_to_sequences(train_A)
test_Q_sequences = tokenizer.texts_to_sequences(test_Q)
test_A_sequences = tokenizer.texts_to_sequences(test_A)

In [62]:
# 패딩
max_seq_length = max(max(len(seq) for seq in train_Q_sequences), max(len(seq) for seq in train_A_sequences))
train_Q_padded = pad_sequences(train_Q_sequences, maxlen=max_seq_length, padding='post')
train_A_padded = pad_sequences(train_A_sequences, maxlen=max_seq_length, padding='post')
test_Q_padded = pad_sequences(test_Q_sequences, maxlen=max_seq_length, padding='post')
test_A_padded = pad_sequences(test_A_sequences, maxlen=max_seq_length, padding='post')

In [63]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

In [64]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 256
units = 512

In [65]:
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

In [66]:
decoder_inputs = Input(shape=(None,))
dec_emb = Embedding(vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [67]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [68]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

In [69]:
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_10 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 input_11 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, None, 256)    4660480     ['input_10[0][0]']               
                                                                                                  
 embedding_4 (Embedding)        (None, None, 256)    4660480     ['input_11[0][0]']               
                                                                                            

In [70]:
train_decoder_input_sequences = np.array([seq[:-1] for seq in train_A_padded])
train_decoder_target_sequences = np.expand_dims(np.array([seq[1:] for seq in train_A_padded]), -1)

In [71]:
model.fit([train_Q_padded, train_decoder_input_sequences], train_decoder_target_sequences, batch_size=64, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2fa3ac250>

In [72]:
encoder_model = Model(encoder_inputs, encoder_states)

In [73]:
decoder_state_input_h = Input(shape=(units,))
decoder_state_input_c = Input(shape=(units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_inputs_single = Input(shape=(1,))
dec_emb2 = Embedding(vocab_size, embedding_dim)(decoder_inputs_single)
decoder_outputs2, state_h2, state_c2 = LSTM(units, return_sequences=True, return_state=True)(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = Dense(vocab_size, activation='softmax')(decoder_outputs2)

decoder_model = Model([decoder_inputs_single] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)

In [74]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['<start>']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = tokenizer.index_word[sampled_token_index]

        decoded_sentence += ' ' + sampled_char

        if sampled_char == '<end>' or len(decoded_sentence) > max_seq_length:
            stop_condition = True

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        states_value = [h, c]

    return decoded_sentence.replace('<start>', '').replace('<end>', '').strip()

In [None]:
user_input = "안녕하세요"
input_seq = preprocess_sentence(user_input)
input_seq = tokenizer.texts_to_sequences([input_seq])
input_seq = pad_sequences(input_seq, maxlen=max_len, padding='post')
response = decode_sequence(input_seq)
print(response)

In [75]:
for seq_index in range(10):
    input_seq = test_Q_padded[seq_index:seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('질문:', test_Q[seq_index])
    print('답변:', decoded_sentence)

-
질문: 죽을거 같네
답변: 혼란스러워 혼란스러워 혼란스러워 혼란스러워
-
질문: 내일 시험이야
답변: 혼란스러워 혼란스러워 혼란스러워 혼란스러워
-
질문: 정말내 자신이 싫다
답변: 혼란스러워 혼란스러워 혼란스러워 혼란스러워
-
질문: 이별후 네달째
답변: 혼란스러워 기다리지마세요 기다리지마세요 혼란스러워
-
질문: 쌍커풀 해볼까
답변: 혼란스러워 혼란스러워 혼란스러워 혼란스러워
-
질문: 내 생각 하나만 바꾸면 편할텐데
답변: 어쨌든 기념일에 어쨌든 다가오니까 다가오니까
-
질문: 어떻게 살아가야 할까
답변: 기다리지마세요 기다리지마세요 기다리지마세요
-
질문: 발 아파
답변: 혼란스러워 혼란스러워 혼란스러워 혼란스러워
-
질문: 썸 타는 것도 귀찮아
답변: 혼란스러워 혼란스러워 혼란스러워 혼란스러워
-
질문: 좋아하는 애랑 전화하면
답변: 혼란스러워 기다리지마세요 기다리지마세요 혼란스러워
