In [1]:
import keras
import numpy as np

path = keras.utils.get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
text = open(path).read().lower()
print('Corpus length:', len(text))

Using TensorFlow backend.


Corpus length: 600893


In [2]:
# 提取60个字符组成的序列
maxlen = 60

# 每3个字符采样一个新序列
step = 3

# 保存所提取的序列
sentences = []

# 保存目标（即下一个字符）
next_chars = []

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('Number of sequences:', len(sentences))

# 语料中唯一字符组成的列表
chars = sorted(list(set(text)))
print('Unique characters:', len(chars))
# 一个字典，将唯一字符映射为它在列表chars中的索引
char_indices = dict((char, chars.index(char)) for char in chars)

# 将字符one-hot编码为二进制数组
print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Number of sequences: 200278
Unique characters: 58
Vectorization...


In [3]:
from keras import layers

model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(maxlen, len(chars))))
model.add(layers.Dense(len(chars), activation='softmax'))

In [4]:
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [None]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
import random
import sys

for epoch in range(1, 60):
    print('epoch', epoch)
    # 将模型在数据上拟合一次
    model.fit(x, y,
              batch_size=128,
              epochs=1)
    # 随机选择一个文本种子
    start_index = random.randint(0, len(text) - maxlen - 1)
    generated_text = text[start_index: start_index + maxlen]
    print('--- Generating with seed: "' + generated_text + '"')

    # 尝试一系列不同的采样温度
    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print('------ temperature:', temperature)
        sys.stdout.write(generated_text)

        # 从种子文本开始，生成400个字符
        for i in range(400):
            # 对目前生产的字符编码进行one-hot编码
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1.

            # 对下一个字符进行采样
            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]

            generated_text += next_char
            generated_text = generated_text[1:]

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()
        

epoch 1
Epoch 1/1
--- Generating with seed: " unfortunately, otherwise:
for there is no eternal justice.
"
------ temperature: 0.2
 unfortunately, otherwise:
for there is no eternal justice.


12. the sone the sould the self-moral and the solless of the sain the sones and the sone the moral of the solled the sollection of the spirit of the have not and the profore the sonsting of the sone of the sould the spirit the sain the contemm the self-moral the self-conteple of the longer the self-mankind the self-mankind the spirit of the spirit of the sould the power the contemm the sain the 
------ temperature: 0.5
 the spirit of the sould the power the contemm the sain the philosophan
the sonselves and the for the for the would be a would an a whole the presided and soch of the facture, the called the power of even and stronger the suffices the sentions for his moral one as the intermsticism the far an inceals of the profor self-mone and moral the was for mankind not worth the manking restin