In [1]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [0]:
# Load the corpus as raw bytes and normalise it line by line.
# The file is read in binary mode so that non-ASCII byte sequences can be
# dropped explicitly during decoding instead of raising a decode error.
lines = []
with open('alice.txt', 'rb') as f:  # context manager closes the file even if a line fails to process
    for line in f:  # read the data one line at a time
        line = line.strip()  # strip() removes surrounding whitespace such as \r and \n
        line = line.lower()  # lowercase everything
        line = line.decode('ascii', 'ignore')  # drop non-ASCII byte sequences such as \xe2\x80\x99
        if len(line) > 0:  # keep only non-empty lines
            lines.append(line)

In [5]:
# Display the first five cleaned lines as a quick sanity check of the preprocessing.
lines[:5]

['project gutenbergs alices adventures in wonderland, by lewis carroll',
 'this ebook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever.  you may copy it, give it away or',
 're-use it under the terms of the project gutenberg license included',
 'with this ebook or online at www.gutenberg.org']

In [6]:
# Join all cleaned lines into one long string, separated by single spaces.
text = ' '.join(lines)
# Report the length of the string, i.e. the total number of characters.
print('문자열의 길이 또는 총 글자의 개수: %d' % len(text))

문자열의 길이 또는 총 글자의 개수: 158783


In [7]:
# Show the first 200 characters of the joined text.
print(text[:200])

project gutenbergs alices adventures in wonderland, by lewis carroll this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever.  you may copy it, give it away 


In [8]:
# Character vocabulary: every distinct character of the corpus, in sorted order.
char_vocab = sorted(set(text))
vocab_size = len(char_vocab)
# Report the size of the character set.
print('글자 집합의 크기 : {}'.format(vocab_size))

글자 집합의 크기 : 55


In [9]:
# Give every character a unique integer index: its position in the sorted vocabulary.
char_to_index = {ch: idx for idx, ch in enumerate(char_vocab)}
print(char_to_index)

{' ': 0, '!': 1, '#': 2, '$': 3, '%': 4, '(': 5, ')': 6, '*': 7, ',': 8, '-': 9, '.': 10, '/': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, ':': 22, ';': 23, '?': 24, '@': 25, '[': 26, ']': 27, '_': 28, 'a': 29, 'b': 30, 'c': 31, 'd': 32, 'e': 33, 'f': 34, 'g': 35, 'h': 36, 'i': 37, 'j': 38, 'k': 39, 'l': 40, 'm': 41, 'n': 42, 'o': 43, 'p': 44, 'q': 45, 'r': 46, 's': 47, 't': 48, 'u': 49, 'v': 50, 'w': 51, 'x': 52, 'y': 53, 'z': 54}


In [0]:
# Reverse lookup table (integer index -> character), used to decode predictions.
index_to_char = {idx: ch for ch, idx in char_to_index.items()}

In [11]:
seq_length = 60  # length of every training sequence, in characters
# Number of complete seq_length-character windows in the text. One character is
# held back (len(text) - 1) so that the target window, which is shifted right by
# one position, still fits inside the text.
n_samples = (len(text) - 1) // seq_length
# Report the number of sequence samples.
print('문장 샘플의 수 : {}'.format(n_samples))

문장 샘플의 수 : 2646


In [0]:
# Cut the text into consecutive, non-overlapping windows of seq_length characters
# (0:60, 60:120, 120:180, ...) and integer-encode each window character by character.
train_X = [
    [char_to_index[ch] for ch in text[start:start + seq_length]]
    for start in range(0, n_samples * seq_length, seq_length)
]

# Targets are the same windows shifted one character to the right, so position t
# of a target holds the character that follows position t of its input.
train_y = [
    [char_to_index[ch] for ch in text[start + 1:start + seq_length + 1]]
    for start in range(0, n_samples * seq_length, seq_length)
]

In [0]:
# One-hot encode the integer sequences.
# num_classes is pinned to vocab_size: without it to_categorical infers the depth
# from the largest index that happens to be present, so a corpus whose
# highest-indexed character is missing from the windows would produce arrays
# narrower than the Dense(vocab_size) output layer (and possibly different widths
# for train_X and train_y).
# NOTE: this cell overwrites train_X / train_y in place, so it must run exactly
# once after the integer-encoding cell; re-running it would one-hot encode the
# already encoded arrays.
train_X = to_categorical(train_X, num_classes=vocab_size)
train_y = to_categorical(train_y, num_classes=vocab_size)

In [14]:
print('train_X의 크기(shape) : {}'.format(train_X.shape)) # shape of the one-hot encoded inputs
print('train_y의 크기(shape) : {}'.format(train_y.shape)) # shape of the one-hot encoded targets

train_X의 크기(shape) : (2646, 60, 55)
train_y의 크기(shape) : (2646, 60, 55)


In [16]:
from tensorflow.keras.layers import Dense, LSTM, TimeDistributed
from tensorflow.keras.models import Sequential

# Character-level language model: two stacked LSTM layers that both return the
# full sequence, followed by a softmax over the vocabulary applied at every
# time step. The time dimension is left as None so that inputs of any length
# can be fed in; the feature dimension is the one-hot width of train_X.
model = Sequential([
    LSTM(256, input_shape=(None, train_X.shape[2]), return_sequences=True),
    LSTM(256, return_sequences=True),
    TimeDistributed(Dense(vocab_size, activation='softmax')),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Train for 20 epochs, logging one line per epoch (verbose=2).
model.fit(train_X, train_y, epochs=20, verbose=2)

Train on 2646 samples
Epoch 1/20
2646/2646 - 17s - loss: 3.0796 - acc: 0.1790
Epoch 2/20
2646/2646 - 16s - loss: 2.7310 - acc: 0.2456
Epoch 3/20
2646/2646 - 17s - loss: 2.3627 - acc: 0.3343
Epoch 4/20
2646/2646 - 17s - loss: 2.2093 - acc: 0.3718
Epoch 5/20
2646/2646 - 17s - loss: 2.0961 - acc: 0.3991
Epoch 6/20
2646/2646 - 16s - loss: 2.0019 - acc: 0.4231
Epoch 7/20
2646/2646 - 17s - loss: 1.9184 - acc: 0.4442
Epoch 8/20
2646/2646 - 17s - loss: 1.8459 - acc: 0.4653
Epoch 9/20
2646/2646 - 17s - loss: 1.7830 - acc: 0.4822
Epoch 10/20
2646/2646 - 17s - loss: 1.7263 - acc: 0.4967
Epoch 11/20
2646/2646 - 17s - loss: 1.6753 - acc: 0.5101
Epoch 12/20
2646/2646 - 17s - loss: 1.6260 - acc: 0.5230
Epoch 13/20
2646/2646 - 17s - loss: 1.5815 - acc: 0.5348
Epoch 14/20
2646/2646 - 17s - loss: 1.5409 - acc: 0.5456
Epoch 15/20
2646/2646 - 16s - loss: 1.5006 - acc: 0.5549
Epoch 16/20
2646/2646 - 16s - loss: 1.4634 - acc: 0.5658
Epoch 17/20
2646/2646 - 16s - loss: 1.4272 - acc: 0.5750
Epoch 18/20
2646/2

<tensorflow.python.keras.callbacks.History at 0x7f54d029dd30>

In [18]:
def sentence_generation(model, length):
    """Generate text greedily, one character at a time, from a random start character.

    Relies on the notebook-level globals ``vocab_size`` and ``index_to_char``.

    Args:
        model: object exposing ``predict(X, verbose=0)`` that returns, for an
            input of shape (1, t, vocab_size), per-time-step scores indexable
            as ``preds[0][-1]`` (the scores for the character following step t).
        length: number of generation steps; also the number of characters
            echoed to stdout while generating.

    Returns:
        A string of ``length + 1`` characters: the random start character
        followed by the ``length`` predicted characters (the final prediction
        is included in the return value but is not echoed to stdout).
    """
    current_ix = np.random.randint(vocab_size)  # random index for the start character
    y_char = [index_to_char[current_ix]]  # character for that random index
    print(current_ix, '번 글자', y_char[-1], '로 예측을 시작!')
    # Input buffer of shape (1, length, vocab_size); filled in one step at a time.
    X = np.zeros((1, length, vocab_size))

    for i in range(length):
        # One-hot encode the latest character at position i so it becomes part
        # of the input prefix for the next prediction.
        X[0, i, current_ix] = 1
        print(index_to_char[current_ix], end="")
        # verbose=0 keeps Keras from printing a progress bar for every single
        # character, which would be interleaved with the text echoed above.
        preds = model.predict(X[:, :i + 1, :], verbose=0)
        # Greedy decoding: take the most probable class at the last time step.
        current_ix = int(np.argmax(preds[0][-1]))
        y_char.append(index_to_char[current_ix])
    return ''.join(y_char)
# Run 100 generation steps with the trained model; the returned string is shown as the cell output.
sentence_generation(model, 100)

11 번 글자 / 로 예측을 시작!
/or a little things and said to herself, and the treather was a long to do a little things and said 

'/or a little things and said to herself, and the treather was a long to do a little things and said t'