In [1]:
import pandas as pd
import numpy as np
from konlpy.tag import Okt
from konlpy.tag import Kkma

import os
import shutil
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
# Load the parallel corpus, relabel its three columns, and keep an independent
# copy of the first 50,000 rows.
df = (
    pd.read_excel('./1_구어체(1).xlsx', names=["SID", "src", "tar"])
    .iloc[:50000]
    .copy()
)

In [3]:
# Drop the sentence-id column; only the src/tar text columns are used below.
# errors='ignore' keeps this cell re-runnable: a second execution is a no-op
# instead of raising KeyError because 'SID' is already gone.
df = df.drop(columns=['SID'], errors='ignore')

In [4]:
# Number of sentence pairs kept.
df.shape[0]

50000

In [None]:
# Wrap every target sentence in a start marker ('\t ') and an end marker (' \n').
# The guard makes the cell safe to re-run: a sentence that is already wrapped is
# returned unchanged instead of receiving a second pair of markers.
df['tar'] = df['tar'].apply(
    lambda x: x if x.startswith('\t ') and x.endswith(' \n') else '\t ' + x + ' \n'
)
df.sample(10)

In [5]:
# Build the character sets: every distinct character seen in each column.
src_vocab = {char for sentence in df.src for char in sentence}
tar_vocab = {char for sentence in df.tar for char in sentence}

In [6]:
# Number of distinct characters on each side.
src_vocab_size, tar_vocab_size = len(src_vocab), len(tar_vocab)

print(src_vocab_size, tar_vocab_size)

1632 123


In [7]:
# Turn each character collection into a sorted list so index assignment is
# deterministic, then peek at a slice of each.
src_vocab = sorted(src_vocab)
tar_vocab = sorted(tar_vocab)
print(src_vocab[100:150])
print(tar_vocab[45:75])

['・', 'ㄹ', 'ㅇ', 'ㅏ', '㎏', '㎛', '㎞', '㎡', '多', '情', '故', '社', '美', '가', '각', '간', '갇', '갈', '갉', '감', '갑', '값', '갓', '갔', '강', '갖', '같', '갚', '갛', '개', '객', '갠', '갤', '갭', '갯', '갱', '갸', '걀', '걔', '걘', '거', '걱', '건', '걷', '걸', '검', '겁', '것', '겉', '게']
['N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l']


In [8]:
# Assign an integer index to every character. Numbering starts at 1, so 0 is
# never used for a real character.
src_to_index = {char: idx for idx, char in enumerate(src_vocab, start=1)}
tar_to_index = {char: idx for idx, char in enumerate(tar_vocab, start=1)}

print(src_to_index)
print(tar_to_index)

{' ': 1, '!': 2, '"': 3, '$': 4, '%': 5, '&': 6, "'": 7, '*': 8, '+': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '<': 26, '=': 27, '>': 28, '?': 29, '@': 30, 'A': 31, 'B': 32, 'C': 33, 'D': 34, 'E': 35, 'F': 36, 'G': 37, 'H': 38, 'I': 39, 'J': 40, 'K': 41, 'L': 42, 'M': 43, 'N': 44, 'O': 45, 'P': 46, 'Q': 47, 'R': 48, 'S': 49, 'T': 50, 'U': 51, 'V': 52, 'W': 53, 'X': 54, 'Y': 55, 'Z': 56, '\\': 57, '_': 58, 'a': 59, 'b': 60, 'c': 61, 'd': 62, 'e': 63, 'f': 64, 'g': 65, 'h': 66, 'i': 67, 'j': 68, 'k': 69, 'l': 70, 'm': 71, 'n': 72, 'o': 73, 'p': 74, 'q': 75, 'r': 76, 's': 77, 't': 78, 'u': 79, 'v': 80, 'w': 81, 'x': 82, 'y': 83, 'z': 84, '~': 85, '²': 86, '·': 87, '˚': 88, 'Ω': 89, '‘': 90, '“': 91, '․': 92, '…': 93, '\u202c': 94, '℃': 95, 'Ⅳ': 96, '∼': 97, '□': 98, '「': 99, '」': 100, '・': 101, 'ㄹ': 102, 'ㅇ': 103, 'ㅏ': 104, '㎏': 105, '㎛': 106, '㎞': 107, '㎡': 108, '多': 109, '情': 110, '

In [10]:
# Integer-encode src: one list of character indices per source sentence.
encoder_input = [
    [src_to_index[char] for char in sentence]
    for sentence in df.src
]

print(encoder_input[:5])

[[7, 32, 67, 60, 70, 63, 1, 33, 73, 70, 73, 76, 67, 72, 65, 7, 1125, 1, 882, 163, 1130, 1, 994, 626, 400, 1095, 1, 1131, 1015, 222, 625, 1, 1295, 1559, 1, 1540, 1, 910, 1, 1140, 385, 1, 1349, 568, 638, 1, 1013, 1138, 392, 400, 12], [987, 1464, 1125, 1553, 1041, 872, 1, 1134, 1537, 883, 1087, 29], [1515, 631, 1436, 1130, 1, 755, 932, 1459, 886, 568, 385, 1, 1546, 1085, 1041, 872, 1, 1138, 896, 698, 641, 1123, 596, 1, 18, 1265, 1, 1078, 1474, 1126, 1, 222, 597, 1537, 1058, 400, 12], [15, 15, 1154, 1041, 872, 385, 1, 1061, 910, 396, 1131, 1, 1131, 748, 1043, 1, 304, 849, 596, 625, 1, 695, 426, 1041, 872, 1, 787, 568, 319, 1025, 1, 1192, 1125, 1, 1145, 1, 114, 1095, 431, 872, 1, 852, 631, 893, 936, 392, 400, 12], [20, 12, 19, 10, 1, 21, 10, 1, 22, 1, 849, 1131, 1205, 114, 1, 683, 1, 130, 304, 1, 421, 1, 1156, 1138, 167, 1, 455, 1212, 1, 1172, 150, 1, 999, 586, 1191, 939, 680, 1, 120, 849, 1537, 154, 936, 392, 400, 12]]


In [12]:
# Integer-encode tar: one list of character indices per target sentence.
decoder_input = [
    [tar_to_index[char] for char in sentence]
    for sentence in df.tar
]
print(decoder_input[:5])

[[34, 72, 65, 75, 68, 1, 35, 78, 75, 78, 81, 72, 77, 70, 7, 1, 72, 82, 1, 64, 1, 66, 78, 75, 78, 81, 72, 77, 70, 1, 64, 79, 79, 75, 72, 66, 64, 83, 72, 78, 77, 1, 83, 71, 64, 83, 1, 64, 75, 75, 78, 86, 82, 1, 88, 78, 84, 1, 83, 78, 1, 68, 87, 79, 68, 81, 72, 68, 77, 66, 68, 1, 65, 68, 64, 84, 83, 72, 69, 84, 75, 1, 82, 83, 78, 81, 72, 68, 82, 1, 72, 77, 1, 83, 71, 68, 1, 34, 72, 65, 75, 68, 14], [36, 78, 1, 88, 78, 84, 1, 86, 78, 81, 74, 1, 64, 83, 1, 64, 1, 35, 72, 83, 88, 1, 65, 64, 77, 74, 31], [48, 53, 50, 41, 52, 47, 7, 82, 1, 65, 68, 82, 83, 82, 68, 75, 75, 68, 81, 12, 1, 86, 71, 72, 66, 71, 1, 81, 68, 66, 78, 81, 67, 68, 67, 1, 20, 83, 71, 1, 81, 78, 84, 70, 71, 1, 13, 66, 84, 83, 82, 1, 65, 88, 1, 86, 78, 81, 67, 82, 1, 78, 69, 1, 76, 78, 84, 83, 71, 1, 69, 81, 78, 76, 1, 64, 65, 81, 78, 64, 67, 14], [41, 77, 1, 35, 71, 64, 79, 83, 68, 81, 1, 17, 17, 1, 42, 68, 82, 84, 82, 1, 66, 64, 75, 75, 68, 67, 1, 44, 64, 89, 64, 81, 84, 82, 1, 69, 81, 78, 76, 1, 83, 71, 68, 1, 83, 78, 76,

In [13]:
# Decoder targets: the target sentence with its first character removed, so
# position t of the target lines up with position t + 1 of the decoder input.
decoder_target = [
    [tar_to_index[char] for char in sentence[1:]]
    for sentence in df.tar
]
print('tar encoding: ', decoder_target[:5])

tar encoding:  [[72, 65, 75, 68, 1, 35, 78, 75, 78, 81, 72, 77, 70, 7, 1, 72, 82, 1, 64, 1, 66, 78, 75, 78, 81, 72, 77, 70, 1, 64, 79, 79, 75, 72, 66, 64, 83, 72, 78, 77, 1, 83, 71, 64, 83, 1, 64, 75, 75, 78, 86, 82, 1, 88, 78, 84, 1, 83, 78, 1, 68, 87, 79, 68, 81, 72, 68, 77, 66, 68, 1, 65, 68, 64, 84, 83, 72, 69, 84, 75, 1, 82, 83, 78, 81, 72, 68, 82, 1, 72, 77, 1, 83, 71, 68, 1, 34, 72, 65, 75, 68, 14], [78, 1, 88, 78, 84, 1, 86, 78, 81, 74, 1, 64, 83, 1, 64, 1, 35, 72, 83, 88, 1, 65, 64, 77, 74, 31], [53, 50, 41, 52, 47, 7, 82, 1, 65, 68, 82, 83, 82, 68, 75, 75, 68, 81, 12, 1, 86, 71, 72, 66, 71, 1, 81, 68, 66, 78, 81, 67, 68, 67, 1, 20, 83, 71, 1, 81, 78, 84, 70, 71, 1, 13, 66, 84, 83, 82, 1, 65, 88, 1, 86, 78, 81, 67, 82, 1, 78, 69, 1, 76, 78, 84, 83, 71, 1, 69, 81, 78, 76, 1, 64, 65, 81, 78, 64, 67, 14], [77, 1, 35, 71, 64, 79, 83, 68, 81, 1, 17, 17, 1, 42, 68, 82, 84, 82, 1, 66, 64, 75, 75, 68, 67, 1, 44, 64, 89, 64, 81, 84, 82, 1, 69, 81, 78, 76, 1, 83, 71, 68, 1, 83, 78, 76, 

In [14]:
# Longest sentence (in characters) on each side; used as the padding length.
max_src_len = max(map(len, df.src))
max_tar_len = max(map(len, df.tar))
print(max_src_len)
print(max_tar_len)

86
229


In [16]:
# Pad at the end ('post') so every sequence on a side has the same length.
encoder_input = pad_sequences(encoder_input, padding='post', maxlen=max_src_len)

# Both decoder tensors share the target-side length.
decoder_input, decoder_target = (
    pad_sequences(sequences, padding='post', maxlen=max_tar_len)
    for sequences in (decoder_input, decoder_target)
)

In [17]:
# One-hot encode every index sequence. Character indices start at 1, so each
# vector needs vocab_size + 1 slots (slot 0 is left for the padded positions).
#
# num_classes is passed explicitly: when it is omitted, to_categorical infers
# the width from the largest index present in that particular array. Because
# decoder_target drops the first character of every sentence, its inferred
# width is not guaranteed to equal decoder_input's.
encoder_input = to_categorical(encoder_input, num_classes=src_vocab_size + 1)
decoder_input = to_categorical(decoder_input, num_classes=tar_vocab_size + 1)
decoder_target = to_categorical(decoder_target, num_classes=tar_vocab_size + 1)

# Seq2Seq

In [19]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense 
from tensorflow.keras.models import Model
import numpy as np

### Encoder

In [39]:
# Shapes of the three training tensors.
tuple(tensor.shape for tensor in (encoder_input, decoder_input, decoder_target))

<KerasTensor: shape=(None, None, 1632) dtype=float32 (created by layer 'input_8')>

In [48]:
# Encoder: reads the one-hot source characters and exposes its final LSTM states.
# The feature width comes from the training tensor instead of a hard-coded 1633,
# so the input layer keeps matching the data when the vocabulary changes.
encoder_inputs = Input(shape=(None, encoder_input.shape[-1]))
encoder_lstm = LSTM(units=256, return_state=True)

# With return_state=True the layer returns (output, hidden state, cell state).
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)

# Hidden state and cell state, kept together for use as the decoder's initial state.
encoder_states = [state_h, state_c]

### Decoder

In [49]:
# Decoder: reads the one-hot target characters and predicts a character
# distribution at every timestep.
# Widths are taken from the training tensors instead of a hard-coded 124, so the
# model keeps matching the data when the target vocabulary changes (for example
# once the '\t' / '\n' markers are part of it).
decoder_inputs = Input(shape=(None, decoder_input.shape[-1]))
decoder_lstm = LSTM(units=256, return_sequences=True, return_state=True)

# Start the decoder from the encoder's hidden state and cell state; the
# decoder's own returned states are not needed for training.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# One softmax unit per slot of the one-hot target vectors.
decoder_sotfmax_layer = Dense(decoder_target.shape[-1], activation='softmax')
decoder_outputs = decoder_sotfmax_layer(decoder_outputs)

# Training model: (source sequence, decoder input sequence) -> per-step distribution.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [None]:
# Train on (encoder input, decoder input) -> decoder target, holding out the
# last 20% of the samples for validation.
model.fit(
    [encoder_input, decoder_input],
    decoder_target,
    validation_split=0.2,
    epochs=40,
    batch_size=64,
)

Epoch 1/40
Epoch 2/40
130/625 [=====>........................] - ETA: 8:15 - loss: 0.5400