# 1. 글자 단위 RNN 언어 모델(Char RNNLM)
RNN의 입출력을 글자 단위로 구현해 임베딩층을 사용하지 않음
![Char RNN](https://wikidocs.net/images/page/48649/char_rnn1.PNG "Char RNN")

> **1) 데이터에 대한 이해와 전처리**

In [1]:
import numpy as np
import urllib.request
from tensorflow.keras.utils import to_categorical

In [2]:
# Download the plain-text book from Project Gutenberg into the working directory.
urllib.request.urlretrieve("http://www.gutenberg.org/files/11/11-0.txt", filename="11-0.txt")

lines = []
# Read as bytes so non-ASCII byte sequences can be dropped explicitly below.
# The context manager closes the file even if an exception is raised mid-loop.
with open('11-0.txt', 'rb') as f:
    for line in f:
        line = line.strip()  # remove \r, \n and surrounding whitespace
        line = line.lower()
        line = line.decode('ascii', 'ignore')  # drop byte sequences such as \xe2\x80\x99
        if line:  # skip lines that are empty after cleaning
            lines.append(line)

lines[:5]

['the project gutenberg ebook of alices adventures in wonderland, by lewis carroll',
 'this ebook is for the use of anyone anywhere in the united states and most',
 'other parts of the world at no cost and with almost no restrictions',
 'whatsoever.  you may copy it, give it away or re-use it under the terms of',
 'the project gutenberg license included with this ebook or online at']

In [3]:
# Merge all cleaned lines into a single string, separated by single spaces.
text = ' '.join(lines)
print('문자열 길이 또는 총 글자 개수: %d' % len(text))

# Preview the first 200 characters of the merged text.
print(text[:200])

문자열 길이 또는 총 글자 개수: 159821
the project gutenberg ebook of alices adventures in wonderland, by lewis carroll this ebook is for the use of anyone anywhere in the united states and most other parts of the world at no cost and with


In [4]:
# Build the character vocabulary: every distinct character in the text, sorted.
char_vocab = sorted(set(text))
vocab_size = len(char_vocab)
print('글자 집합 크기: {}'.format(vocab_size))

# Map each character to its position in the sorted vocabulary.
char_to_index = {char: index for index, char in enumerate(char_vocab)}
print(char_to_index)

글자 집합 크기: 57
{' ': 0, '!': 1, '"': 2, '#': 3, '$': 4, '%': 5, "'": 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, '@': 27, '[': 28, ']': 29, '_': 30, 'a': 31, 'b': 32, 'c': 33, 'd': 34, 'e': 35, 'f': 36, 'g': 37, 'h': 38, 'i': 39, 'j': 40, 'k': 41, 'l': 42, 'm': 43, 'n': 44, 'o': 45, 'p': 46, 'q': 47, 'r': 48, 's': 49, 't': 50, 'u': 51, 'v': 52, 'w': 53, 'x': 54, 'y': 55, 'z': 56}


In [5]:
# Reverse lookup table: integer index -> character.
index_to_char = {index: char for char, index in char_to_index.items()}

In [6]:
# Split the text into consecutive, non-overlapping samples of seq_length characters.
seq_length = 60
# One character is held back (the -1) so that the target window of the last
# sample, which is shifted by one position, still lies inside the text.
n_samples = (len(text) - 1) // seq_length
print('문장 샘플 수: {}'.format(n_samples))

# Integer-encode the whole text once; every sample is then a slice of this list.
encoded_text = [char_to_index[ch] for ch in text]

train_X = []
train_y = []

for start in range(0, n_samples * seq_length, seq_length):
    stop = start + seq_length
    train_X.append(encoded_text[start:stop])
    # The target is the same window moved one character to the right.
    train_y.append(encoded_text[start + 1:stop + 1])

print(train_X[0])
print(train_y[0])  # train_X[0] shifted one position to the right

문장 샘플 수: 2663
[50, 38, 35, 0, 46, 48, 45, 40, 35, 33, 50, 0, 37, 51, 50, 35, 44, 32, 35, 48, 37, 0, 35, 32, 45, 45, 41, 0, 45, 36, 0, 31, 42, 39, 33, 35, 49, 0, 31, 34, 52, 35, 44, 50, 51, 48, 35, 49, 0, 39, 44, 0, 53, 45, 44, 34, 35, 48, 42, 31]
[38, 35, 0, 46, 48, 45, 40, 35, 33, 50, 0, 37, 51, 50, 35, 44, 32, 35, 48, 37, 0, 35, 32, 45, 45, 41, 0, 45, 36, 0, 31, 42, 39, 33, 35, 49, 0, 31, 34, 52, 35, 44, 50, 51, 48, 35, 49, 0, 39, 44, 0, 53, 45, 44, 34, 35, 48, 42, 31, 44]


In [7]:
# One-hot encode the training data. num_classes is passed explicitly so the
# encoded width always equals vocab_size instead of being inferred from the
# largest index that happens to occur in each array.
train_X = to_categorical(train_X, num_classes=vocab_size)
train_y = to_categorical(train_y, num_classes=vocab_size)

print('train_X의 크기: {}'.format(train_X.shape))
print('train_y의 크기: {}'.format(train_y.shape))

train_X의 크기: (2663, 60, 57)
train_y의 크기: (2663, 60, 57)


> **2) 모델 설계하기**

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, TimeDistributed

In [9]:
# Two stacked LSTM layers that return an output for every time step, followed
# by a softmax over the character vocabulary applied at each time step.
# The time dimension is left as None so inputs of any length are accepted.
model = Sequential([
    LSTM(256, input_shape=(None, train_X.shape[2]), return_sequences=True),
    LSTM(256, return_sequences=True),
    TimeDistributed(Dense(vocab_size, activation='softmax')),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_X, train_y, epochs=80, verbose=2)

Epoch 1/80
84/84 - 43s - loss: 3.0792 - accuracy: 0.1820
Epoch 2/80
84/84 - 39s - loss: 2.6886 - accuracy: 0.2592
Epoch 3/80
84/84 - 37s - loss: 2.3621 - accuracy: 0.3341
Epoch 4/80
84/84 - 39s - loss: 2.2198 - accuracy: 0.3713
Epoch 5/80
84/84 - 40s - loss: 2.1107 - accuracy: 0.3977
Epoch 6/80
84/84 - 43s - loss: 2.0177 - accuracy: 0.4204
Epoch 7/80
84/84 - 44s - loss: 1.9489 - accuracy: 0.4365
Epoch 8/80
84/84 - 42s - loss: 1.8828 - accuracy: 0.4538
Epoch 9/80
84/84 - 42s - loss: 1.8252 - accuracy: 0.4705
Epoch 10/80
84/84 - 41s - loss: 1.7770 - accuracy: 0.4841
Epoch 11/80
84/84 - 41s - loss: 1.7320 - accuracy: 0.4962
Epoch 12/80
84/84 - 44s - loss: 1.6893 - accuracy: 0.5074
Epoch 13/80
84/84 - 47s - loss: 1.6508 - accuracy: 0.5177
Epoch 14/80
84/84 - 47s - loss: 1.6127 - accuracy: 0.5270
Epoch 15/80
84/84 - 49s - loss: 1.5756 - accuracy: 0.5364
Epoch 16/80
84/84 - 43s - loss: 1.5420 - accuracy: 0.5443
Epoch 17/80
84/84 - 42s - loss: 1.5076 - accuracy: 0.5547
Epoch 18/80
84/84 - 45s

<tensorflow.python.keras.callbacks.History at 0x298790ffee0>

In [10]:
def sentence_generation(model, length):
    """Generate text greedily, starting from a randomly chosen character.

    Relies on the module-level ``vocab_size`` and ``index_to_char``.

    Args:
        model: Object whose ``predict`` accepts a one-hot array of shape
            (1, steps, vocab_size) and returns per-step scores over the
            vocabulary for that batch.
        length: Number of prediction steps to run.

    Returns:
        The start character followed by the ``length`` predicted characters.
    """
    current = np.random.randint(vocab_size)
    generated = [index_to_char[current]]
    print(current, '번 글자', generated[-1], '로 예측을 시작!')

    # One-hot buffer for the whole sequence; it is filled in one step at a time.
    one_hot = np.zeros((1, length, vocab_size))

    for step in range(length):
        one_hot[0, step, current] = 1
        print(index_to_char[current], end="")
        # Feed everything generated so far and keep the prediction made at the
        # most recent time step.
        step_scores = model.predict(one_hot[:, :step + 1, :])[0]
        current = np.argmax(step_scores, 1)[-1]
        generated.append(index_to_char[current])

    return ''.join(generated)

In [11]:
# Run 100 prediction steps with the trained model, starting from a random character.
sentence_generation(model, 100)

1 번 글자 ! 로 예측을 시작!
! its always six oclock now uponitions. so for the reason of tank, and well my tel what? was your ea

'! its always six oclock now uponitions. so for the reason of tank, and well my tel what? was your eas'

# 2. 글자 단위 RNN(Char RNN)으로 텍스트 생성하기

> **1) 데이터에 대한 이해와 전처리**

In [12]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [13]:
# Small toy corpus used as the training text for the second model.
# NOTE(review): 'comtemplate' looks like a typo for 'contemplate'; it is left
# unchanged here because the string is training data and editing it would
# change the generated samples.
text = '''
I get on with life as a programmer,
I like to comtemplate beer.
But when I start to daydream,
My mind turns straight to wine.

Do I love wine more than beer?

I like to use words about beer.
But when I stop my talking,
My mind turns straight to wine.

I hate bugs and errors.
But I just think back to wine,
And I'm happy once again.

I like to hang out with programming and deep learning.
But when left alone,
My mind turns straight to wine.
'''

In [14]:
# Collapse every run of whitespace (including newlines) into a single space so
# the corpus becomes one continuous string.
text = ' '.join(text.split())
print(text)

I get on with life as a programmer, I like to comtemplate beer. But when I start to daydream, My mind turns straight to wine. Do I love wine more than beer? I like to use words about beer. But when I stop my talking, My mind turns straight to wine. I hate bugs and errors. But I just think back to wine, And I'm happy once again. I like to hang out with programming and deep learning. But when left alone, My mind turns straight to wine.


In [15]:
# Build the character vocabulary: every distinct character in the text, sorted.
char_vocab = sorted(set(text))
print(char_vocab)

vocab_size = len(char_vocab)
print('글자 집합 크기: {}'.format(vocab_size))

[' ', "'", ',', '.', '?', 'A', 'B', 'D', 'I', 'M', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y']
글자 집합 크기: 33


In [16]:
# Assign each character the integer index of its position in the sorted vocabulary.
char_to_index = {char: index for index, char in enumerate(char_vocab)}
print(char_to_index)

{' ': 0, "'": 1, ',': 2, '.': 3, '?': 4, 'A': 5, 'B': 6, 'D': 7, 'I': 8, 'M': 9, 'a': 10, 'b': 11, 'c': 12, 'd': 13, 'e': 14, 'f': 15, 'g': 16, 'h': 17, 'i': 18, 'j': 19, 'k': 20, 'l': 21, 'm': 22, 'n': 23, 'o': 24, 'p': 25, 'r': 26, 's': 27, 't': 28, 'u': 29, 'v': 30, 'w': 31, 'y': 32}


In [17]:
# Build samples of identical length by sliding a window of `length` characters
# over the text, one character at a time.
length = 11
# NOTE(review): the range ends at len(text) - 1, so the window that ends on the
# final character of the text is not produced — confirm whether that is intended.
sequences = [text[end - length:end] for end in range(length, len(text))]
print('총 훈련 샘플 수: %d' % len(sequences))

sequences[:10]

총 훈련 샘플 수: 426


['I get on wi',
 ' get on wit',
 'get on with',
 'et on with ',
 't on with l',
 ' on with li',
 'on with lif',
 'n with life',
 ' with life ',
 'with life a']

In [18]:
# Integer-encode every sample, character by character.
X = [[char_to_index[ch] for ch in sample] for sample in sequences]

# Show the first few encoded samples.
for encoded_sample in X[:5]:
    print(encoded_sample)

[8, 0, 16, 14, 28, 0, 24, 23, 0, 31, 18]
[0, 16, 14, 28, 0, 24, 23, 0, 31, 18, 28]
[16, 14, 28, 0, 24, 23, 0, 31, 18, 28, 17]
[14, 28, 0, 24, 23, 0, 31, 18, 28, 17, 0]
[28, 0, 24, 23, 0, 31, 18, 28, 17, 0, 21]


In [19]:
# Separate each sample into inputs (all but the last index) and the label
# (the last index).
sequences = np.array(X)
X, y = sequences[:, :-1], sequences[:, -1]

for row in X[:5]:
    print(row)
print(y[:5])

[ 8  0 16 14 28  0 24 23  0 31]
[ 0 16 14 28  0 24 23  0 31 18]
[16 14 28  0 24 23  0 31 18 28]
[14 28  0 24 23  0 31 18 28 17]
[28  0 24 23  0 31 18 28 17  0]
[18 28 17  0 21]


In [20]:
# One-hot encode the labels and every input sample over the full vocabulary.
y = to_categorical(y, num_classes=vocab_size)
sequences = [to_categorical(sample, num_classes=vocab_size) for sample in X]
X = np.array(sequences)

print(X.shape)

(426, 10, 33)


> **2) 모델 설계하기**

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [22]:
# A single LSTM with a hidden state of size 80, followed by a softmax output
# layer with one neuron per character in the vocabulary.
model = Sequential([
    LSTM(80, input_shape=(X.shape[1], X.shape[2])),
    Dense(vocab_size, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=100, verbose=2)

Epoch 1/100
14/14 - 4s - loss: 3.4732 - accuracy: 0.1103
Epoch 2/100
14/14 - 0s - loss: 3.3513 - accuracy: 0.1995
Epoch 3/100
14/14 - 0s - loss: 3.0942 - accuracy: 0.1972
Epoch 4/100
14/14 - 0s - loss: 2.9854 - accuracy: 0.1972
Epoch 5/100
14/14 - 0s - loss: 2.9619 - accuracy: 0.1972
Epoch 6/100
14/14 - 0s - loss: 2.9363 - accuracy: 0.1972
Epoch 7/100
14/14 - 0s - loss: 2.9211 - accuracy: 0.1972
Epoch 8/100
14/14 - 0s - loss: 2.9027 - accuracy: 0.1972
Epoch 9/100
14/14 - 0s - loss: 2.8724 - accuracy: 0.1972
Epoch 10/100
14/14 - 0s - loss: 2.8466 - accuracy: 0.1972
Epoch 11/100
14/14 - 0s - loss: 2.8220 - accuracy: 0.1972
Epoch 12/100
14/14 - 0s - loss: 2.7793 - accuracy: 0.2089
Epoch 13/100
14/14 - 0s - loss: 2.7466 - accuracy: 0.1972
Epoch 14/100
14/14 - 0s - loss: 2.6960 - accuracy: 0.2254
Epoch 15/100
14/14 - 0s - loss: 2.6507 - accuracy: 0.2300
Epoch 16/100
14/14 - 0s - loss: 2.5922 - accuracy: 0.2653
Epoch 17/100
14/14 - 0s - loss: 2.5583 - accuracy: 0.2535
Epoch 18/100
14/14 - 0s

<tensorflow.python.keras.callbacks.History at 0x29804843730>

In [23]:
def sentence_generation(model, char_to_index, seq_length, seed_text, n):
    """Extend ``seed_text`` by ``n`` characters predicted one at a time.

    At every step the text so far is integer-encoded, left-padded or
    left-truncated to ``seq_length``, one-hot encoded and passed to
    ``model.predict``; the highest-scoring character is appended.

    Args:
        model: Trained Keras model whose ``predict`` returns one score per
            character index for the single sample in the batch.
        char_to_index: Mapping from character to integer index.
        seq_length: Number of context characters fed to the model per step.
        seed_text: Initial text; every character must be a key of
            ``char_to_index``.
        n: Number of characters to generate.

    Returns:
        ``seed_text`` followed by the ``n`` generated characters.

    Raises:
        KeyError: If the text contains a character missing from
            ``char_to_index``, or the model predicts an index that has no
            character assigned.
    """
    # Invert the mapping once instead of scanning char_to_index on every step.
    index_to_char = {index: char for char, index in char_to_index.items()}
    num_classes = len(char_to_index)

    context = seed_text
    generated = []

    for _ in range(n):
        encoded = [char_to_index[char] for char in context]
        # Pad (or truncate) on the left so the model always sees seq_length steps.
        encoded = pad_sequences([encoded], maxlen=seq_length, padding='pre')
        encoded = to_categorical(encoded, num_classes=num_classes)
        # Sequential.predict_classes was removed in TensorFlow 2.6; taking the
        # argmax of the predicted scores is the documented replacement.
        scores = model.predict(encoded, verbose=0)
        predicted = int(np.argmax(scores, axis=-1)[0])

        char = index_to_char[predicted]
        context += char
        generated.append(char)

    return seed_text + ''.join(generated)

In [24]:
# Generate 80 characters after the seed 'I get on w', using 10 characters of context per step.
print(sentence_generation(model, char_to_index, 10, 'I get on w', 80))



I get on with life as a programmer, I like to hang out with programming and deep learning.
