# 임의의 입력 생성

In [1]:
import numpy as np
import tensorflow as tf

In [2]:
# Word-vector dimension: 5, sentence length: 4
# i.e. there are 4 timesteps and each timestep carries a 5-dimensional word vector

train_X = [[[0.1, 4.2, 1.5, 1.1, 2.8], [1.0, 3.1, 2.5, 0.7, 1.1], [0.3, 2.1, 1.5, 2.1, 0.1], [2.2, 1.4, 0.5, 0.9, 1.1]]]

# The nested list above already includes the outer batch axis, so this call only
# converts it to a float32 ndarray of shape (1, 4, 5) = (batch, timesteps, features).
train_X = np.array(train_X, dtype=np.float32)

# RNN

In [3]:
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.models import Sequential

In [4]:
# One SimpleRNN layer with 3 hidden units over inputs of 2 timesteps x 10 features.
# Same as SimpleRNN(3, input_length=2, input_dim=10).
model = Sequential([SimpleRNN(3, input_shape=(2, 10))])

# Variant that fixes the batch size up front:
# model.add(SimpleRNN(3, batch_input_shape=(8,2,10), return_sequences=True))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 3)                 42        
                                                                 
Total params: 42
Trainable params: 42
Non-trainable params: 0
_________________________________________________________________


In [5]:
# return_sequences=True yields the hidden state at every timestep, and
# return_state=True additionally yields the final hidden state, hence two outputs.
rnn = SimpleRNN(3, return_sequences=True, return_state=True)
hidden_states, last_state = rnn(train_X)

# The last row of `hidden_states` is the same vector as `last_state`.
print('hidden states : {}, shape: {}'.format(hidden_states, hidden_states.shape))
print('last hidden state : {}, shape: {}'.format(last_state, last_state.shape))

hidden states : [[[ 0.5390469   0.2607186   0.99959356]
  [ 0.99135005 -0.8097063   0.99538106]
  [ 0.9851441  -0.34994292  0.9947161 ]
  [ 0.9347638  -0.7116422   0.9995795 ]]], shape: (1, 4, 3)
last hidden state : [[ 0.9347638 -0.7116422  0.9995795]], shape: (1, 3)


# RNN으로 텍스트 생성하기

## 데이터 전처리

In [6]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [7]:
# Toy corpus of three sentences. Each line ends with an escaped "\n" AND a physical
# line break, so splitting on '\n' later also yields empty strings between sentences.
text = """경마장에 있는 말이 뛰고 있다\n
그의 말이 법이다\n
가는 말이 고와야 오는 말이 곱다\n"""

In [8]:
# Build the word -> integer index from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# word_index starts at 1; the +1 leaves room for index 0, which is used for padding.
vocab_size = len(tokenizer.word_index) + 1

print('단어 집합의 크기 : %d' % vocab_size)
print(tokenizer.word_index)

단어 집합의 크기 : 12
{'말이': 1, '경마장에': 2, '있는': 3, '뛰고': 4, '있다': 5, '그의': 6, '법이다': 7, '가는': 8, '고와야': 9, '오는': 10, '곱다': 11}


In [9]:
# Build the training samples

sequences = list()
for line in text.split('\n'): # sentence-tokenize on the newline character
    encoded = tokenizer.texts_to_sequences([line])[0]
    # Emit every prefix of length >= 2: the last id of each prefix becomes the label later.
    # Empty lines encode to [] and therefore contribute no samples.
    for i in range(1, len(encoded)):
        sequence = encoded[:i+1]
        sequences.append(sequence)
        
print(sequences)

# Left-pad every sample with 0 up to the length of the longest sample
max_len = max(len(l) for l in sequences) 
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

print(sequences)

[[2, 3], [2, 3, 1], [2, 3, 1, 4], [2, 3, 1, 4, 5], [6, 1], [6, 1, 7], [8, 1], [8, 1, 9], [8, 1, 9, 10], [8, 1, 9, 10, 1], [8, 1, 9, 10, 1, 11]]
[[ 0  0  0  0  2  3]
 [ 0  0  0  2  3  1]
 [ 0  0  2  3  1  4]
 [ 0  2  3  1  4  5]
 [ 0  0  0  0  6  1]
 [ 0  0  0  6  1  7]
 [ 0  0  0  0  8  1]
 [ 0  0  0  8  1  9]
 [ 0  0  8  1  9 10]
 [ 0  8  1  9 10  1]
 [ 8  1  9 10  1 11]]


In [10]:
# Separate inputs from labels

sequences = np.array(sequences)
X = sequences[:,:-1]   # every column except the last -> input word ids
y = sequences[:,-1]    # last column -> id of the word to predict

# One-hot encode the labels over the whole vocabulary (index 0 included)
y = to_categorical(y, num_classes=vocab_size)
print(y)

[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


## 모델 설계하기

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN

embedding_dim = 10
hidden_units = 32

# Embedding -> SimpleRNN -> softmax over the vocabulary: predicts the next word id
# from the (left-padded) ids of the preceding words.
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    SimpleRNN(hidden_units),
    Dense(vocab_size, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=200, verbose=2)

Epoch 1/200
1/1 - 2s - loss: 2.5104 - accuracy: 0.0000e+00 - 2s/epoch - 2s/step
Epoch 2/200
1/1 - 0s - loss: 2.4980 - accuracy: 0.0000e+00 - 9ms/epoch - 9ms/step
Epoch 3/200
1/1 - 0s - loss: 2.4860 - accuracy: 0.0000e+00 - 9ms/epoch - 9ms/step
Epoch 4/200
1/1 - 0s - loss: 2.4742 - accuracy: 0.0000e+00 - 8ms/epoch - 8ms/step
Epoch 5/200
1/1 - 0s - loss: 2.4624 - accuracy: 0.0000e+00 - 10ms/epoch - 10ms/step
Epoch 6/200
1/1 - 0s - loss: 2.4507 - accuracy: 0.0909 - 13ms/epoch - 13ms/step
Epoch 7/200
1/1 - 0s - loss: 2.4390 - accuracy: 0.1818 - 16ms/epoch - 16ms/step
Epoch 8/200
1/1 - 0s - loss: 2.4270 - accuracy: 0.2727 - 21ms/epoch - 21ms/step
Epoch 9/200
1/1 - 0s - loss: 2.4148 - accuracy: 0.4545 - 13ms/epoch - 13ms/step
Epoch 10/200
1/1 - 0s - loss: 2.4023 - accuracy: 0.3636 - 12ms/epoch - 12ms/step
Epoch 11/200
1/1 - 0s - loss: 2.3893 - accuracy: 0.3636 - 11ms/epoch - 11ms/step
Epoch 12/200
1/1 - 0s - loss: 2.3759 - accuracy: 0.3636 - 12ms/epoch - 12ms/step
Epoch 13/200
1/1 - 0s - los

Epoch 103/200
1/1 - 0s - loss: 0.7297 - accuracy: 0.8182 - 10ms/epoch - 10ms/step
Epoch 104/200
1/1 - 0s - loss: 0.7145 - accuracy: 0.8182 - 8ms/epoch - 8ms/step
Epoch 105/200
1/1 - 0s - loss: 0.6996 - accuracy: 0.8182 - 7ms/epoch - 7ms/step
Epoch 106/200
1/1 - 0s - loss: 0.6850 - accuracy: 0.8182 - 8ms/epoch - 8ms/step
Epoch 107/200
1/1 - 0s - loss: 0.6707 - accuracy: 0.8182 - 9ms/epoch - 9ms/step
Epoch 108/200
1/1 - 0s - loss: 0.6566 - accuracy: 0.9091 - 10ms/epoch - 10ms/step
Epoch 109/200
1/1 - 0s - loss: 0.6428 - accuracy: 0.9091 - 8ms/epoch - 8ms/step
Epoch 110/200
1/1 - 0s - loss: 0.6292 - accuracy: 0.9091 - 8ms/epoch - 8ms/step
Epoch 111/200
1/1 - 0s - loss: 0.6159 - accuracy: 0.9091 - 8ms/epoch - 8ms/step
Epoch 112/200
1/1 - 0s - loss: 0.6029 - accuracy: 0.9091 - 7ms/epoch - 7ms/step
Epoch 113/200
1/1 - 0s - loss: 0.5901 - accuracy: 0.9091 - 7ms/epoch - 7ms/step
Epoch 114/200
1/1 - 0s - loss: 0.5775 - accuracy: 0.9091 - 10ms/epoch - 10ms/step
Epoch 115/200
1/1 - 0s - loss: 0.5

<keras.callbacks.History at 0x25446d03a60>

In [12]:
def sentence_generation(model, tokenizer, current_word, n, max_input_len=5):
    """Greedily extend `current_word` by up to `n` predicted words.

    Args:
        model: trained next-word model exposing `predict(batch, verbose=0)` and
            returning one score per vocabulary index.
        tokenizer: fitted Keras `Tokenizer` (uses `texts_to_sequences` and `index_word`).
        current_word: seed text the sentence starts with.
        n: maximum number of words to append.
        max_input_len: length the encoded context is left-padded/truncated to before
            prediction. Defaults to 5, the value this function previously hard-coded.

    Returns:
        The seed followed by the generated words, separated by single spaces.
    """
    context = current_word
    generated = []

    for _ in range(n):
        # Integer-encode the running sentence and left-pad it to the model's input length.
        encoded = tokenizer.texts_to_sequences([context])[0]
        encoded = pad_sequences([encoded], maxlen=max_input_len, padding='pre')

        # Greedy decoding: take the highest-scoring vocabulary index as a plain int.
        scores = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(scores, axis=1)[0])

        # Direct index -> word lookup. Index 0 is the padding slot and has no word;
        # stop instead of appending an unrelated word.
        word = tokenizer.index_word.get(predicted_index)
        if word is None:
            break

        # The predicted word becomes part of the context for the next step.
        context = context + ' ' + word
        generated.append(word)

    return ' '.join([current_word] + generated)

In [13]:
# Seed with the first word of the first training sentence and generate 4 more words.
print(sentence_generation(model, tokenizer, '경마장에', 4))

경마장에 있는 말이 뛰고 있다


In [14]:
# Seed with the first word of the second training sentence and generate 2 more words.
print(sentence_generation(model, tokenizer, '그의', 2))

그의 말이 법이다


In [15]:
# Seed with the first word of the third training sentence and generate 5 more words.
print(sentence_generation(model, tokenizer, '가는', 5))

가는 말이 고와야 오는 말이 곱다


# LSTM 

## 데이터 전처리

In [16]:
from tensorflow.keras.layers import LSTM

# With return_state=True an LSTM returns two final states (hidden and cell) in addition
# to the per-timestep hidden states requested by return_sequences=True.
lstm = LSTM(3, return_sequences=True, return_state=True)
hidden_states, last_hidden_state, last_cell_state = lstm(train_X)

print('hidden states : {}, shape: {}'.format(hidden_states, hidden_states.shape))
print('last hidden state : {}, shape: {}'.format(last_hidden_state, last_hidden_state.shape))
print('last cell state : {}, shape: {}'.format(last_cell_state, last_cell_state.shape))

hidden states : [[[-0.02149745 -0.11039421  0.00422487]
  [ 0.02829784 -0.1395757   0.02499247]
  [-0.026241   -0.12349994  0.09546734]
  [-0.03320558 -0.17440006  0.14722748]]], shape: (1, 4, 3)
last hidden state : [[-0.03320558 -0.17440006  0.14722748]], shape: (1, 3)
last cell state : [[-0.04348563 -0.2547642   0.28488302]], shape: (1, 3)


In [17]:
import pandas as pd
from string import punctuation

# Load the articles CSV; the file is expected in the notebook's working directory.
df = pd.read_csv('ArticlesApril2018.csv')
print(df.columns)

Index(['articleID', 'articleWordCount', 'byline', 'documentType', 'headline',
       'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')


In [18]:
# Collect the headline column as a list, dropping the "Unknown" placeholder entries.
headline = [title for title in df.headline.values if title != "Unknown"]

In [19]:
# 구두점 제거, 소문자화

def repreprocessing(raw_sentence):
    """Normalize a headline: drop non-ASCII characters, strip punctuation, lowercase.

    Args:
        raw_sentence: the headline text as a `str`.

    Returns:
        The cleaned, lower-cased string.
    """
    # Round-trip through bytes so every character outside ASCII is discarded.
    ascii_only = raw_sentence.encode("utf8").decode("ascii", 'ignore')
    # Deletion table: every character in string.punctuation maps to nothing.
    strip_punct = str.maketrans('', '', punctuation)
    return ascii_only.translate(strip_punct).lower()

# Apply the cleaning function to every headline.
preprocessed_headline = [repreprocessing(x) for x in headline]

In [20]:
# Re-fit a fresh tokenizer on the headlines; this rebinds `tokenizer` and `vocab_size`
# that earlier cells defined for the toy corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_headline)
# +1 leaves room for index 0, which the padding step below uses as filler.
vocab_size = len(tokenizer.word_index) + 1

In [21]:
sequences = list()

for sentence in preprocessed_headline:

    # Integer-encode each headline
    encoded = tokenizer.texts_to_sequences([sentence])[0] 
    # Emit every prefix of length >= 2; the last id of each prefix is the label later.
    for i in range(1, len(encoded)):
        sequence = encoded[:i+1]
        sequences.append(sequence)

# Show the first 11 samples.
sequences[:11]

[[99, 269],
 [99, 269, 371],
 [99, 269, 371, 1115],
 [99, 269, 371, 1115, 582],
 [99, 269, 371, 1115, 582, 52],
 [99, 269, 371, 1115, 582, 52, 7],
 [99, 269, 371, 1115, 582, 52, 7, 2],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116],
 [100, 3]]

In [22]:
# Look up which word a given integer stands for

index_to_word = {}
for key, value in tokenizer.word_index.items(): # invert word -> index into index -> word
    index_to_word[value] = key
    
print('빈도수 상위 582번 단어 : {}'.format(index_to_word[582]))

빈도수 상위 582번 단어 : offer


In [23]:
# Left-pad every sample with 0 up to the length of the longest sample
max_len = max(len(l) for l in sequences)
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

# Separate inputs from labels: the last column is the word to predict
sequences = np.array(sequences)
X = sequences[:,:-1]
y = sequences[:,-1]

# One-hot encode the labels over the whole vocabulary
y = to_categorical(y, num_classes=vocab_size)

## 모델 설계하기

In [24]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM

embedding_dim = 10
hidden_units = 128

# Embedding -> LSTM -> softmax over the vocabulary: next-word prediction on headlines.
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    LSTM(hidden_units),
    Dense(vocab_size, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=200, verbose=2)

Epoch 1/200
244/244 - 8s - loss: 7.6514 - accuracy: 0.0268 - 8s/epoch - 35ms/step
Epoch 2/200
244/244 - 7s - loss: 7.1111 - accuracy: 0.0292 - 7s/epoch - 28ms/step
Epoch 3/200
244/244 - 7s - loss: 6.9748 - accuracy: 0.0358 - 7s/epoch - 29ms/step
Epoch 4/200
244/244 - 6s - loss: 6.8481 - accuracy: 0.0450 - 6s/epoch - 25ms/step
Epoch 5/200
244/244 - 6s - loss: 6.6959 - accuracy: 0.0455 - 6s/epoch - 23ms/step
Epoch 6/200
244/244 - 6s - loss: 6.5216 - accuracy: 0.0501 - 6s/epoch - 24ms/step
Epoch 7/200
244/244 - 6s - loss: 6.3307 - accuracy: 0.0541 - 6s/epoch - 23ms/step
Epoch 8/200
244/244 - 6s - loss: 6.1336 - accuracy: 0.0590 - 6s/epoch - 23ms/step
Epoch 9/200
244/244 - 6s - loss: 5.9433 - accuracy: 0.0642 - 6s/epoch - 24ms/step
Epoch 10/200
244/244 - 5s - loss: 5.7591 - accuracy: 0.0665 - 5s/epoch - 21ms/step
Epoch 11/200
244/244 - 5s - loss: 5.5793 - accuracy: 0.0748 - 5s/epoch - 21ms/step
Epoch 12/200
244/244 - 5s - loss: 5.4129 - accuracy: 0.0759 - 5s/epoch - 21ms/step
Epoch 13/200


Epoch 100/200
244/244 - 5s - loss: 0.6855 - accuracy: 0.8644 - 5s/epoch - 22ms/step
Epoch 101/200
244/244 - 5s - loss: 0.6703 - accuracy: 0.8703 - 5s/epoch - 22ms/step
Epoch 102/200
244/244 - 5s - loss: 0.6571 - accuracy: 0.8715 - 5s/epoch - 22ms/step
Epoch 103/200
244/244 - 5s - loss: 0.6412 - accuracy: 0.8721 - 5s/epoch - 22ms/step
Epoch 104/200
244/244 - 5s - loss: 0.6272 - accuracy: 0.8748 - 5s/epoch - 22ms/step
Epoch 105/200
244/244 - 5s - loss: 0.6157 - accuracy: 0.8802 - 5s/epoch - 22ms/step
Epoch 106/200
244/244 - 5s - loss: 0.6011 - accuracy: 0.8784 - 5s/epoch - 22ms/step
Epoch 107/200
244/244 - 5s - loss: 0.5882 - accuracy: 0.8826 - 5s/epoch - 22ms/step
Epoch 108/200
244/244 - 5s - loss: 0.5758 - accuracy: 0.8841 - 5s/epoch - 22ms/step
Epoch 109/200
244/244 - 6s - loss: 0.5638 - accuracy: 0.8857 - 6s/epoch - 23ms/step
Epoch 110/200
244/244 - 6s - loss: 0.5539 - accuracy: 0.8868 - 6s/epoch - 23ms/step
Epoch 111/200
244/244 - 6s - loss: 0.5440 - accuracy: 0.8884 - 6s/epoch - 23

Epoch 198/200
244/244 - 7s - loss: 0.2655 - accuracy: 0.9163 - 7s/epoch - 28ms/step
Epoch 199/200
244/244 - 7s - loss: 0.2754 - accuracy: 0.9132 - 7s/epoch - 27ms/step
Epoch 200/200
244/244 - 6s - loss: 0.2750 - accuracy: 0.9166 - 6s/epoch - 26ms/step


<keras.callbacks.History at 0x2544a2821d0>

In [25]:
def sentence_generation(model, tokenizer, current_word, n, max_input_len=None):
    """Greedily extend `current_word` by up to `n` predicted words.

    Args:
        model: trained next-word model exposing `predict(batch, verbose=0)` and
            returning one score per vocabulary index.
        tokenizer: fitted Keras `Tokenizer` (uses `texts_to_sequences` and `index_word`).
        current_word: seed text the sentence starts with.
        n: maximum number of words to append.
        max_input_len: length the encoded context is left-padded/truncated to before
            prediction. When omitted, falls back to the notebook-level `max_len - 1`
            (the width of the training inputs), as this function did before.

    Returns:
        The seed followed by the generated words, separated by single spaces.
    """
    if max_input_len is None:
        # Training inputs are the padded samples minus their label column.
        max_input_len = max_len - 1

    context = current_word
    generated = []

    for _ in range(n):
        # Integer-encode the running sentence and left-pad it to the model's input length.
        encoded = tokenizer.texts_to_sequences([context])[0]
        encoded = pad_sequences([encoded], maxlen=max_input_len, padding='pre')

        # Greedy decoding: take the highest-scoring vocabulary index as a plain int.
        scores = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(scores, axis=1)[0])

        # Direct index -> word lookup. Index 0 is the padding slot and has no word;
        # stop instead of appending an unrelated word.
        word = tokenizer.index_word.get(predicted_index)
        if word is None:
            break

        # The predicted word becomes part of the context for the next step.
        context = context + ' ' + word
        generated.append(word)

    return ' '.join([current_word] + generated)

In [26]:
# Seed with 'i' and generate 10 more words.
print(sentence_generation(model, tokenizer, 'i', 10))

i disapprove of school vouchers can i still apply for them


In [27]:
# Seed with 'how' and generate 10 more words.
print(sentence_generation(model, tokenizer, 'how', 10))

how to make facebook more accountable on immigration immigrants risks to


# 양방향 순환 신경망(Bidirectional Recurrent Neural Network)

In [28]:
from tensorflow.keras.layers import Bidirectional

timesteps = 10
input_dim = 5
hidden_units = 8

# Four stacked bidirectional SimpleRNN layers; each returns its full sequence so the
# next recurrent layer receives one vector per timestep. Only the first layer
# declares the input shape.
model = Sequential(
    [Bidirectional(SimpleRNN(hidden_units, return_sequences=True), input_shape=(timesteps, input_dim))]
    + [Bidirectional(SimpleRNN(hidden_units, return_sequences=True)) for _ in range(3)]
)

In [29]:
# Constant initializers make the demo outputs below deterministic.
k_init = tf.keras.initializers.Constant(value=0.1)  # input kernel weights
b_init = tf.keras.initializers.Constant(value=0)    # biases
r_init = tf.keras.initializers.Constant(value=0.1)  # recurrent kernel weights

In [30]:
# Bidirectional LSTM with return_state=True returns five tensors: the layer output,
# then the (hidden, cell) states of the forward LSTM and of the backward LSTM.
bilstm = Bidirectional(LSTM(3, return_sequences=False, return_state=True,
                            kernel_initializer=k_init, bias_initializer=b_init, recurrent_initializer=r_init))
hidden_states, forward_h, forward_c, backward_h, backward_c = bilstm(train_X)

# With return_sequences=False, `hidden_states` is a single vector per sample:
# the forward and backward final hidden states concatenated.
print('hidden states : {}, shape: {}'.format(hidden_states, hidden_states.shape))
print('forward state : {}, shape: {}'.format(forward_h, forward_h.shape))
print('backward state : {}, shape: {}'.format(backward_h, backward_h.shape))

hidden states : [[0.6303138 0.6303138 0.6303138 0.7038734 0.7038734 0.7038734]], shape: (1, 6)
forward state : [[0.6303138 0.6303138 0.6303138]], shape: (1, 3)
backward state : [[0.7038734 0.7038734 0.7038734]], shape: (1, 3)


In [31]:
# Same layer, but with return_sequences=True

bilstm = Bidirectional(LSTM(3, return_sequences=True, return_state=True,
                            kernel_initializer=k_init, bias_initializer=b_init, recurrent_initializer=r_init))
hidden_states, forward_h, forward_c, backward_h, backward_c = bilstm(train_X)

# `hidden_states` now has one row per timestep. The forward final state appears in the
# LAST row's first half, while the backward final state appears in the FIRST row's
# second half (the backward LSTM finishes at the first timestep).
print('hidden states : {}, shape: {}'.format(hidden_states, hidden_states.shape))
print('forward state : {}, shape: {}'.format(forward_h, forward_h.shape))
print('backward state : {}, shape: {}'.format(backward_h, backward_h.shape))

hidden states : [[[0.35906473 0.35906473 0.35906473 0.7038734  0.7038734  0.7038734 ]
  [0.55111325 0.55111325 0.55111325 0.58863586 0.58863586 0.58863586]
  [0.59115744 0.59115744 0.59115744 0.3951699  0.3951699  0.3951699 ]
  [0.6303138  0.6303138  0.6303138  0.21942244 0.21942244 0.21942244]]], shape: (1, 4, 6)
forward state : [[0.6303138 0.6303138 0.6303138]], shape: (1, 3)
backward state : [[0.7038734 0.7038734 0.7038734]], shape: (1, 3)


# GRU(Gated Recurrent Unit)

In [32]:
from tensorflow.keras.layers import GRU

hidden_size = 12

# Single GRU layer; `timesteps` and `input_dim` come from the bidirectional-RNN cell above.
model = Sequential([GRU(hidden_size, input_shape=(timesteps, input_dim))])

# 문자 단위 RNN

## 데이터 전처리

In [33]:
import numpy as np
import urllib.request
from tensorflow.keras.utils import to_categorical

# Download the corpus into the working directory.
urllib.request.urlretrieve("http://www.gutenberg.org/files/11/11-0.txt", filename="11-0.txt")

# Read the file as bytes, one line at a time. The context manager guarantees the
# handle is closed even if decoding raises part-way through.
sentences = []
with open('11-0.txt', 'rb') as f:
    for sentence in f:
        sentence = sentence.strip()  # drop surrounding whitespace, including \r and \n
        sentence = sentence.lower()  # lowercase
        sentence = sentence.decode('ascii', 'ignore')  # drop non-ASCII byte sequences such as \xe2\x80\x99
        if len(sentence) > 0:  # skip lines that are empty after cleaning
            sentences.append(sentence)

# Join all lines into one long string separated by single spaces.
total_data = ' '.join(sentences)

In [34]:
# Character vocabulary: every distinct character in the corpus, sorted
char_vocab = sorted(list(set(total_data)))
# No +1 here: indices start at 0 and no padding index is reserved.
vocab_size = len(char_vocab)

# Assign an integer to each character
char_to_index = dict((char, index) for index, char in enumerate(char_vocab))

# Reverse mapping: integer -> character
index_to_char = {}
for key, value in char_to_index.items():
    index_to_char[value] = key

In [35]:
# Build the samples

seq_length = 60
# The "- 1" guarantees the label window, which is shifted right by one character,
# still fits inside total_data for the last sample.
n_samples = int(np.floor((len(total_data) - 1) / seq_length))


train_X = []
train_y = []

for i in range(n_samples):
    # Walk through 0:60 -> 60:120 -> 120:180 ..., picking one non-overlapping window per step.
    X_sample = total_data[i * seq_length: (i + 1) * seq_length]
    
    X_encoded = [char_to_index[c] for c in X_sample]
    train_X.append(X_encoded)

    # Label window: the same span shifted right by one character
    y_sample = total_data[i * seq_length + 1: (i + 1) * seq_length + 1]
    y_encoded = [char_to_index[c] for c in y_sample]
    train_y.append(y_encoded)

In [36]:
# Inspect one sample: train_y[1] is train_X[1] shifted left by one position,
# with one new character id appended at the end.
print(train_X[1])
print(train_y[1])

[43, 33, 10, 0, 31, 54, 0, 41, 34, 52, 38, 48, 0, 32, 30, 47, 47, 44, 41, 41, 0, 49, 37, 38, 48, 0, 34, 31, 44, 44, 40, 0, 38, 48, 0, 35, 44, 47, 0, 49, 37, 34, 0, 50, 48, 34, 0, 44, 35, 0, 30, 43, 54, 44, 43, 34, 0, 30, 43, 54]
[33, 10, 0, 31, 54, 0, 41, 34, 52, 38, 48, 0, 32, 30, 47, 47, 44, 41, 41, 0, 49, 37, 38, 48, 0, 34, 31, 44, 44, 40, 0, 38, 48, 0, 35, 44, 47, 0, 49, 37, 34, 0, 50, 48, 34, 0, 44, 35, 0, 30, 43, 54, 44, 43, 34, 0, 30, 43, 54, 52]


In [37]:
# One-hot encode inputs and labels.
# num_classes is pinned to vocab_size so both arrays always get the same depth as the
# model's output layer; without it the depth is inferred from the largest index that
# happens to occur in each array separately.
# NOTE: this rebinds train_X / train_y, so the cell must not be run twice in a row.
train_X = to_categorical(train_X, num_classes=vocab_size)
train_y = to_categorical(train_y, num_classes=vocab_size)

print('train_X의 크기(shape) : {}'.format(train_X.shape))
print('train_y의 크기(shape) : {}'.format(train_y.shape))

train_X의 크기(shape) : (2658, 60, 56)
train_y의 크기(shape) : (2658, 60, 56)


## 모델 설계하기

In [38]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, TimeDistributed

hidden_units = 256

# Two stacked LSTMs that both return full sequences, followed by a per-timestep
# softmax, so the model predicts the next character at every position.
# The time dimension is left as None so inputs of any length are accepted.
model = Sequential([
    LSTM(hidden_units, input_shape=(None, train_X.shape[2]), return_sequences=True),
    LSTM(hidden_units, return_sequences=True),
    TimeDistributed(Dense(vocab_size, activation='softmax')),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(train_X, train_y, epochs=80, verbose=2)

Epoch 1/80
84/84 - 29s - loss: 3.0745 - accuracy: 0.1799 - 29s/epoch - 349ms/step
Epoch 2/80
84/84 - 28s - loss: 2.7248 - accuracy: 0.2498 - 28s/epoch - 334ms/step
Epoch 3/80
84/84 - 25s - loss: 2.3749 - accuracy: 0.3347 - 25s/epoch - 296ms/step
Epoch 4/80
84/84 - 25s - loss: 2.2303 - accuracy: 0.3675 - 25s/epoch - 296ms/step
Epoch 5/80
84/84 - 26s - loss: 2.1208 - accuracy: 0.3942 - 26s/epoch - 310ms/step
Epoch 6/80
84/84 - 31s - loss: 2.0312 - accuracy: 0.4148 - 31s/epoch - 371ms/step
Epoch 7/80
84/84 - 30s - loss: 1.9617 - accuracy: 0.4343 - 30s/epoch - 360ms/step
Epoch 8/80
84/84 - 29s - loss: 1.8961 - accuracy: 0.4519 - 29s/epoch - 349ms/step
Epoch 9/80
84/84 - 28s - loss: 1.8412 - accuracy: 0.4660 - 28s/epoch - 339ms/step
Epoch 10/80
84/84 - 28s - loss: 1.7884 - accuracy: 0.4797 - 28s/epoch - 337ms/step
Epoch 11/80
84/84 - 28s - loss: 1.7388 - accuracy: 0.4933 - 28s/epoch - 337ms/step
Epoch 12/80
84/84 - 29s - loss: 1.6979 - accuracy: 0.5055 - 29s/epoch - 341ms/step
Epoch 13/80
8

<keras.callbacks.History at 0x25455656530>

In [39]:
def sentence_generation(model, length):
    """Generate text character by character, starting from a random character.

    Relies on the notebook-level `vocab_size` and `index_to_char`. Each character is
    echoed to stdout as it is fed to the model.

    Args:
        model: trained character-level model; `predict` must return per-timestep
            scores over the character vocabulary for a one-hot input batch.
        length: number of prediction steps to run.

    Returns:
        A string of `length + 1` characters: the random start character followed by
        one predicted character per step.
    """
    # Pick a random character index as the starting point.
    current_ix = np.random.randint(vocab_size)

    # Characters produced so far, beginning with the random start character.
    generated = [index_to_char[current_ix]]
    print(current_ix,'번 문자',generated[-1],'로 예측을 시작!')

    # One-hot input buffer of shape (1, length, vocab_size), filled in step by step.
    X = np.zeros((1, length, vocab_size))

    for i in range(length):
        # Write the latest character into position i, then echo it.
        X[0, i, current_ix] = 1
        print(index_to_char[current_ix], end="")

        # Feed the prefix built so far; verbose=0 keeps Keras progress output from
        # interleaving with the echoed characters.
        step_scores = model.predict(X[:, :i+1, :], verbose=0)[0]

        # Only the prediction at the last timestep is used as the next character.
        current_ix = int(np.argmax(step_scores[-1]))
        generated.append(index_to_char[current_ix])

    return ''.join(generated)

In [40]:
# Run 100 prediction steps with the character-level model, then print the returned string.
result = sentence_generation(model, 100)
print(result)

54 번 문자 y 로 예측을 시작!
y the rabbit came up to the door, and tried to open it; but, if the wooldation por omenes, couldnt ge
