<a href="https://colab.research.google.com/github/yym10618/Colab/blob/master/Ch04.%ED%85%8D%EC%8A%A4%ED%8A%B8%20%EB%A7%88%EC%9D%B4%EB%8B%9D%20%EC%8B%A4%EC%8A%B5/%ED%85%8D%EC%8A%A4%ED%8A%B8_%EB%8D%B0%EC%9D%B4%ED%84%B0_%EB%B6%84%EB%A5%98%EA%B2%B0%EA%B3%BC_%EB%B6%84%EC%84%9D%ED%95%98%EA%B8%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 라이브러리 선언
import numpy as np
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM

In [None]:
def load_data(file, max_lines=5000):
  """Read a CSV file and return its rows as '<end>'-terminated text lines.

  The first two fields of every row are joined with a single space and
  suffixed with '<end>\\n'. The first row of the file is treated as a header
  and dropped.

  Args:
    file: Path of the UTF-8 encoded CSV file.
    max_lines: Number of leading rows (header included) that are considered.
      Pass None to read the whole file.

  Returns:
    A list of strings shaped like 'field0 field1<end>\\n'.
  """
  import csv
  from itertools import islice

  result = []

  # newline='' lets the csv module handle line endings and quoted newlines itself.
  with open(file, 'r', encoding='utf-8', newline='') as f:
    # csv.reader honours quoting, so a field such as "hi, there" stays one
    # field instead of being cut at the embedded comma.
    reader = csv.reader(f)

    # Start at 1 to skip the header row; stop at max_lines so the header still
    # counts towards the limit.
    for row in islice(reader, 1, max_lines):
      # Blank or short rows have no second field to join; skip them.
      if len(row) < 2:
        continue
      result.append(row[0] + ' ' + row[1] + '<end>\n')

  return result

In [None]:
# Load the data
chat_dataset = load_data('/content/drive/MyDrive/파이썬 데이터 과학 실습/file/ChatbotData.csv')
# Bare expression: shown as the cell output in the notebook
chat_dataset

In [None]:
# Build a single text string from all loaded lines.
# Each entry already ends with '\n', so joining with ' ' makes every line
# after the first start with a space.
text = ' '.join(chat_dataset)
# Bare expression: shown as the cell output in the notebook
text

In [None]:
# Tokenize the text and build the word index.
# '<' and '>' are not in the filter set, so '<end>' is kept as a single token.
# The backslash is written as '\\' because '\{' is an invalid escape sequence;
# the resulting filter characters are unchanged.
tokenizer = Tokenizer(filters='~!@#$%^&*()_+|-=\\{}[];:",.?/\t\n')
tokenizer.fit_on_texts([text])
# +1 because the indices in word_index start at 1
voca_size = len(tokenizer.word_index) + 1

print('전체 토큰 갯수 :', voca_size)
print('토큰 인덱스 :', tokenizer.word_index)

전체 토큰 갯수 : 8915
토큰 인덱스 : {'<end>': 1, '거예요': 2, '너무': 3, '거': 4, '수': 5, '잘': 6, '안': 7, '싶어': 8, '많이': 9, '좋은': 10, '게': 11, '더': 12, '좀': 13, '같아': 14, '있을': 15, '나': 16, '싶다': 17, '해보세요': 18, '것': 19, '같아요': 20, '마세요': 21, '왜': 22, '없어': 23, '있어요': 24, '사람': 25, '다': 26, '드세요': 27, '먹고': 28, '사람이': 29, '오늘': 30, '저도': 31, '하고': 32, '내': 33, '다른': 34, '내가': 35, '같이': 36, '좋죠': 37, '될': 38, '있는': 39, '하세요': 40, '보세요': 41, '뭐': 42, '친구가': 43, '또': 44, '할': 45, '자꾸': 46, '있어': 47, '좋을': 48, '건': 49, '좋아': 50, '돼요': 51, '진짜': 52, '그런': 53, '어떻게': 54, '수도': 55, '그': 56, '것도': 57, '좋아요': 58, '가고': 59, '못': 60, '한': 61, '돈': 62, '없어요': 63, '걸': 64, '가보세요': 65, '제가': 66, '하는': 67, '해': 68, '나만': 69, '있을까': 70, '일이': 71, '열심히': 72, '때': 73, '하지': 74, '하면': 75, '이제': 76, '이렇게': 77, '저는': 78, '있으면': 79, '좋겠다': 80, '될까': 81, '선물': 82, '싫어': 83, '시간': 84, '많아': 85, '스트레스': 86, '마음이': 87, '주세요': 88, '조심하세요': 89, '오세요': 90, '바라요': 91, '맛있게': 92, '말고': 93, '바랍니다': 94, '할까': 95, '술': 96, '안돼': 97, '시

In [None]:
# Save the fitted tokenizer to disk with pickle
with open('/content/drive/MyDrive/파이썬 데이터 과학 실습/file/chatbot_tokenizer.pickle', 'wb') as handle:
  pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Encode each sentence (sentence -> token indices) and expand it into prefixes
sequences = []

for sentence in text.split('\n'):
  token_ids = tokenizer.texts_to_sequences([sentence])[0]
  # Collect every prefix of length 2..len(token_ids); sentences with fewer
  # than two tokens contribute nothing.
  sequences.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))

# Bare expression: shown as the cell output in the notebook
sequences

In [None]:
# Token count of the longest sequence
max_len = max(map(len, sequences))
print('max_len :', max_len)

max_len : 17


In [None]:
# Padding: bring every token sequence to the same length (max_len).
# padding='pre' puts the padding in front of each sequence.
paded_sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')
# Bare expression: shown as the cell output in the notebook
paded_sequences

array([[   0,    0,    0, ...,    0, 5436, 5437],
       [   0,    0,    0, ..., 5436, 5437, 2338],
       [   0,    0,    0, ..., 5437, 2338,   44],
       ...,
       [   0,    0,    0, ..., 1512,    5,   15],
       [   0,    0,    0, ...,    5,   15,    2],
       [   0,    0,    0, ...,   15,    2,    1]], dtype=int32)

In [None]:
# Build the training inputs and labels:
# the last column of each padded row is the label, everything before it is the input.
dataset = np.array(paded_sequences)
x_data, y_data = dataset[:, :-1], dataset[:, -1]

print('x_data :', x_data[0])
print('y_data :', y_data[0])

x_data : [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 5436]
y_data : 5437


In [None]:
# One-hot encode the labels (one class per vocabulary index).
# NOTE(review): this materialises a (num_samples x voca_size) array; integer
# labels with a sparse categorical loss would presumably avoid that, but the
# loss passed to model.compile would have to change too. Confirm before switching.
y_data = to_categorical(y_data, num_classes=voca_size)
print('y_data[0] :', y_data[0])

y_data[0] : [0. 0. 0. ... 0. 0. 0.]


In [None]:
# Model design: embedding -> LSTM -> softmax over the vocabulary
embedding_dim = 10
hidden_units = 128

model = Sequential([
  Embedding(voca_size, embedding_dim),
  LSTM(hidden_units),
  Dense(voca_size, activation='softmax'),
])

model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
  metrics=['accuracy'],
)

In [None]:
# Train the model for 50 epochs on the full data set
model.fit(x_data, y_data, epochs=50)

In [None]:
# Save the trained model (the .h5 extension selects the HDF5 file format)
model.save('/content/drive/MyDrive/파이썬 데이터 과학 실습/file/chatbot_model.h5')