In [1]:
import numpy as np
import urllib.request
from tensorflow.keras.utils import to_categorical

In [2]:
urllib.request.urlretrieve("http://www.gutenberg.org/files/11/11-0.txt", filename = '11-0.txt')
f = open('11-0.txt', 'rb')
lines = []
for line in f:
    line = line.strip()
    line = line.lower()
    line = line.decode('ascii', 'ignore')
    if len(line)>0:
        lines.append(line)
f.close()

In [3]:
lines[:5]

['the project gutenberg ebook of alices adventures in wonderland, by lewis carroll',
 'this ebook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever.  you may copy it, give it away or',
 're-use it under the terms of the project gutenberg license included',
 'with this ebook or online at www.gutenberg.org']

In [4]:
text = ' '.join(lines)
print('문자열의 길이는 %d' %len(text))

문자열의 길이는 159612


In [5]:
print(text[:200])

the project gutenberg ebook of alices adventures in wonderland, by lewis carroll this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever.  you may copy it, g


In [6]:
char_vocab = sorted(list(set(text)))
vocab_size = len(char_vocab)
print('글자 집합의 크기는 {}'.format(vocab_size))

글자 집합의 크기는 57


In [7]:
char_to_index = dict((c,i) for i,c in enumerate(char_vocab))
print(char_to_index)

{' ': 0, '!': 1, '"': 2, '#': 3, '$': 4, '%': 5, "'": 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, '@': 27, '[': 28, ']': 29, '_': 30, 'a': 31, 'b': 32, 'c': 33, 'd': 34, 'e': 35, 'f': 36, 'g': 37, 'h': 38, 'i': 39, 'j': 40, 'k': 41, 'l': 42, 'm': 43, 'n': 44, 'o': 45, 'p': 46, 'q': 47, 'r': 48, 's': 49, 't': 50, 'u': 51, 'v': 52, 'w': 53, 'x': 54, 'y': 55, 'z': 56}


In [8]:
index_to_char = {}
for key, value in char_to_index.items():
    index_to_char[value] = key

In [9]:
seq_length = 60
n_samples = int(np.floor((len(text)-1)/seq_length))
print('문장 샘플의 수 : {}'.format(n_samples))

문장 샘플의 수 : 2660


In [10]:
train_X = []
train_y = []

for i in range(n_samples):
    X_sample = text[i*seq_length:(i+1)*seq_length]
    X_encoded = [char_to_index[c] for c in X_sample]
    train_X.append(X_encoded)
    
    y_sample = text[i*seq_length+1:(i+1)*seq_length+1]
    y_encoded = [char_to_index[c] for c in y_sample]
    train_y.append(y_encoded)

In [11]:
print(train_X[0])

[50, 38, 35, 0, 46, 48, 45, 40, 35, 33, 50, 0, 37, 51, 50, 35, 44, 32, 35, 48, 37, 0, 35, 32, 45, 45, 41, 0, 45, 36, 0, 31, 42, 39, 33, 35, 49, 0, 31, 34, 52, 35, 44, 50, 51, 48, 35, 49, 0, 39, 44, 0, 53, 45, 44, 34, 35, 48, 42, 31]


In [12]:
print(train_y[0])

[38, 35, 0, 46, 48, 45, 40, 35, 33, 50, 0, 37, 51, 50, 35, 44, 32, 35, 48, 37, 0, 35, 32, 45, 45, 41, 0, 45, 36, 0, 31, 42, 39, 33, 35, 49, 0, 31, 34, 52, 35, 44, 50, 51, 48, 35, 49, 0, 39, 44, 0, 53, 45, 44, 34, 35, 48, 42, 31, 44]


In [13]:
train_X = to_categorical(train_X)
train_y = to_categorical(train_y)

In [15]:
print('train_X의 크기 : {}'.format(train_X.shape))
print('train_y의 크기 : {}'.format(train_y.shape))

train_X의 크기 : (2660, 60, 57)
train_y의 크기 : (2660, 60, 57)


In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, TimeDistributed

In [17]:
model = Sequential()
model.add(LSTM(256, input_shape = (None, train_X.shape[2]), return_sequences = True))
model.add(LSTM(256, return_sequences = True))
model.add(TimeDistributed(Dense(vocab_size, activation = 'softmax')))

In [19]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.fit(train_X,train_y, epochs = 80, verbose = 2)

Train on 2660 samples
Epoch 1/80
2660/2660 - 27s - loss: 3.0768 - accuracy: 0.1825
Epoch 2/80
2660/2660 - 24s - loss: 2.7110 - accuracy: 0.2541
Epoch 3/80
2660/2660 - 26s - loss: 2.3722 - accuracy: 0.3339
Epoch 4/80
2660/2660 - 26s - loss: 2.2254 - accuracy: 0.3684
Epoch 5/80
2660/2660 - 27s - loss: 2.1224 - accuracy: 0.3923
Epoch 6/80
2660/2660 - 26s - loss: 2.0397 - accuracy: 0.4128
Epoch 7/80
2660/2660 - 26s - loss: 1.9720 - accuracy: 0.4308
Epoch 8/80
2660/2660 - 27s - loss: 1.9098 - accuracy: 0.4489
Epoch 9/80
2660/2660 - 27s - loss: 1.8557 - accuracy: 0.4626
Epoch 10/80
2660/2660 - 28s - loss: 1.8054 - accuracy: 0.4762
Epoch 11/80
2660/2660 - 27s - loss: 1.7610 - accuracy: 0.4885
Epoch 12/80
2660/2660 - 28s - loss: 1.7145 - accuracy: 0.5009
Epoch 13/80
2660/2660 - 29s - loss: 1.6754 - accuracy: 0.5113
Epoch 14/80
2660/2660 - 29s - loss: 1.6365 - accuracy: 0.5211
Epoch 15/80
2660/2660 - 28s - loss: 1.6006 - accuracy: 0.5307
Epoch 16/80
2660/2660 - 28s - loss: 1.5647 - accuracy: 0.

<tensorflow.python.keras.callbacks.History at 0x260cd128048>

In [72]:
def sentence_generation(model, length):
    ix = [np.random.randint(vocab_size)] 
    y_char = [index_to_char[ix[-1]]]
    print(ix[-1],'번 글자',y_char[-1],'로 예측을 시작!')
    X = np.zeros((1, length, vocab_size))

    for i in range(length):
        X[0][i][ix[-1]] = 1 
        print(index_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(index_to_char[ix[-1]])
    return ('').join(y_char)

In [73]:
sentence_generation(model, 100)

10 번 글자 [','] 로 예측을 시작!
,

ValueError: Error when checking input: expected lstm_2_input to have shape (10, 33) but got array with shape (1, 33)

In [24]:
text='''
I get on with life as a programmer,
I like to contemplate beer.
But when I start to daydream,
My mind turns straight to wine.

Do I love wine more than beer?

I like to use words about beer.
But when I stop my talking,
My mind turns straight to wine.

I hate bugs and errors.
But I just think back to wine,
And I'm happy once again.

I like to hang out with programming and deep learning.
But when left alone,
My mind turns straight to wine.
'''

In [25]:
tokens = text.split()
text = ' '.join(tokens)
print(text)

I get on with life as a programmer, I like to contemplate beer. But when I start to daydream, My mind turns straight to wine. Do I love wine more than beer? I like to use words about beer. But when I stop my talking, My mind turns straight to wine. I hate bugs and errors. But I just think back to wine, And I'm happy once again. I like to hang out with programming and deep learning. But when left alone, My mind turns straight to wine.


In [26]:
char_vocab = sorted(list(set(text)))
print(char_vocab)

[' ', "'", ',', '.', '?', 'A', 'B', 'D', 'I', 'M', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y']


In [27]:
vocab_size = len(char_vocab)
print('글자 집합의 크기 : {}'.format(vocab_size))

글자 집합의 크기 : 33


In [28]:
char_to_index = dict((c,i) for i,c in enumerate(char_vocab))
print(char_to_index)

{' ': 0, "'": 1, ',': 2, '.': 3, '?': 4, 'A': 5, 'B': 6, 'D': 7, 'I': 8, 'M': 9, 'a': 10, 'b': 11, 'c': 12, 'd': 13, 'e': 14, 'f': 15, 'g': 16, 'h': 17, 'i': 18, 'j': 19, 'k': 20, 'l': 21, 'm': 22, 'n': 23, 'o': 24, 'p': 25, 'r': 26, 's': 27, 't': 28, 'u': 29, 'v': 30, 'w': 31, 'y': 32}


In [34]:
length = 11
sequences = []
for i in range(length, len(text)):
    seq = text[i-length:i]
    sequences.append(seq)
print('총 훈련샘플의 수 : %d'%len(sequences))

총 훈련샘플의 수 : 426


In [35]:
sequences[:10]

['I get on wi',
 ' get on wit',
 'get on with',
 'et on with ',
 't on with l',
 ' on with li',
 'on with lif',
 'n with life',
 ' with life ',
 'with life a']

In [36]:
X = []
for line in sequences:
    temp_X = [char_to_index[char] for char in line]
    X.append(temp_X)

In [37]:
for line in X[:5]:
    print(line)

[8, 0, 16, 14, 28, 0, 24, 23, 0, 31, 18]
[0, 16, 14, 28, 0, 24, 23, 0, 31, 18, 28]
[16, 14, 28, 0, 24, 23, 0, 31, 18, 28, 17]
[14, 28, 0, 24, 23, 0, 31, 18, 28, 17, 0]
[28, 0, 24, 23, 0, 31, 18, 28, 17, 0, 21]


In [39]:
sequences = np.array(X)
X = sequences[:,:-1]
y = sequences[:,-1]

In [40]:
print(y[:5])

[18 28 17  0 21]


In [41]:
sequences = [to_categorical(x, num_classes = vocab_size) for x in X]
X = np.array(sequences)
y = to_categorical(y, num_classes = vocab_size)

In [42]:
print(X.shape)

(426, 10, 33)


In [43]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [44]:
model = Sequential()
model.add(LSTM(80, input_shape = (X.shape[1], X.shape[2])))
model.add(Dense(vocab_size, activation = 'softmax'))

In [45]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.fit(X,y, epochs = 100, verbose = 2)

Train on 426 samples
Epoch 1/100
426/426 - 1s - loss: 3.4688 - accuracy: 0.1174
Epoch 2/100
426/426 - 0s - loss: 3.3255 - accuracy: 0.1972
Epoch 3/100
426/426 - 0s - loss: 3.0531 - accuracy: 0.1972
Epoch 4/100
426/426 - 0s - loss: 3.0010 - accuracy: 0.1972
Epoch 5/100
426/426 - 0s - loss: 2.9517 - accuracy: 0.1972
Epoch 6/100
426/426 - 0s - loss: 2.9309 - accuracy: 0.1972
Epoch 7/100
426/426 - 0s - loss: 2.9135 - accuracy: 0.1972
Epoch 8/100
426/426 - 0s - loss: 2.8980 - accuracy: 0.1972
Epoch 9/100
426/426 - 0s - loss: 2.8687 - accuracy: 0.1972
Epoch 10/100
426/426 - 0s - loss: 2.8380 - accuracy: 0.1972
Epoch 11/100
426/426 - 0s - loss: 2.8109 - accuracy: 0.2042
Epoch 12/100
426/426 - 0s - loss: 2.7658 - accuracy: 0.2113
Epoch 13/100
426/426 - 0s - loss: 2.7380 - accuracy: 0.2089
Epoch 14/100
426/426 - 0s - loss: 2.7039 - accuracy: 0.2160
Epoch 15/100
426/426 - 0s - loss: 2.6594 - accuracy: 0.2488
Epoch 16/100
426/426 - 0s - loss: 2.6033 - accuracy: 0.2300
Epoch 17/100
426/426 - 0s - 

<tensorflow.python.keras.callbacks.History at 0x26098cc4088>

In [46]:
def sentence_generation(model, char_to_index, seq_length, seed_text, n):
    init_text = seed_text
    sentence = ''
    
    for _ in range(n):
        encoded = [char_to_index[char] for char in seed_text]
        encoded = pad_sequences([encoded], maxlen = seq_length, padding = 'pre')
        encoded = to_categorical(encoded, num_classes = len(char_to_index))
        result = model.predict_classes(encoded, verbose = 0)
        
        for char, index in char_to_index.items():
            if index == result:
                break
        seed_text = seed_text+char
        sentence = sentence+char
        
    sentence = init_text + sentence
    return sentence

In [47]:
print(sentence_generation(model, char_to_index, 10, 'I get on w', 80))

I get on with life as a programmer, I like to use words about beer. But when I stop my tal
