# 下载并解析原始数据文本

In [7]:
import keras
import numpy as np

# 设置随机种子
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

path = keras.utils.get_file("nietzsche.txt", origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path, encoding='utf-8').read().lower()
print(f'文本长度:{len(text)}个字符')

文本长度:600893个字符


# 将字符序列向量化

In [8]:
maxlen = 60 # 每个序列的长度
step = 3 # 每隔三个字符采样一个序列
sentences = [] # 用来保存提取的序列
next_chars = [] # 用来保存序列对应的下一个字符

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])

print(f"序列数量:{len(sentences)}, 下一个字符数量:{len(next_chars)}")

chars = sorted(list(set(text))) # sorted()函数对字符进行排序 set()函数去除重复字符
print(f"唯一字符数量:{len(chars)}")
char_indices = dict((char, chars.index(char)) for char in chars) # 将唯一字符映射到它chars中的索引位置
print('正在向量化')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

序列数量:200278, 下一个字符数量:200278
唯一字符数量:57
正在向量化


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = np.zeros((len(sentences), len(chars)), dtype=np.bool)


# 构建网络

In [9]:
from keras.models import Sequential
from keras import layers

model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape= (maxlen, len(chars))))
model.add(layers.Dense(len(chars), activation='softmax'))
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer= optimizer)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 128)               95232     
                                                                 
 dense_2 (Dense)             (None, 57)                7353      
                                                                 
Total params: 102,585
Trainable params: 102,585
Non-trainable params: 0
_________________________________________________________________


# 给定模型预测, 采样下一个字符的函数

In [10]:

def sample(preds, temperature=1.0):

    """
    根据预测概率和温度参数进行采样，返回最可能的字符索引

    参数:
        preds: 模型输出的预测概率数组
        temperature: 温度参数，用于控制采样的随机性，默认值为1.0

    返回:
        根据采样概率返回最可能的字符索引
    """
    preds = np.asarray(preds).astype('float64')  # 将输入转换为float64类型的numpy数组
    preds = np.log(preds) / temperature  # 对预测概率取对数并除以温度参数
    exp_preds = np.exp(preds)  # 对处理后的值取指数
    preds = exp_preds / np.sum(exp_preds)  # 归一化，使概率总和为1
    probas = np.random.multinomial(1, preds, 1)  # 进行多项式分布采样
    return np.argmax(probas) # 返回概率最大的字符索引 argmax()函数返回最大值的索引

# 文本生成循环

In [11]:
import random
import sys

for epoch in range(1, 60):
    print(f'开始第{epoch}轮')
    model.fit(x, y, batch_size=128, epochs=1) # 将模型在数据上拟合一次
    start_index = random.randint(0, len(text) - maxlen -1) # 随机选择一个起始点
    generated_text = text[start_index: start_index + maxlen] # 从起始点开始生成文本
    print(f'---生成文本---\n {generated_text}', end='')
    # 采用一系列不同的温度值来生成文本 对比在不同温度下的文本差异以及连贯性
    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print(f'---在温度{temperature}下生成文本---')
        sys.stdout.write(generated_text)

        for i in range(400):
            # 对当前生成的文本进行向量化
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1.

            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]
            generated_text += next_char
            generated_text = generated_text[1:]
            sys.stdout.write(next_char)

开始第1轮
---生成文本---
  the lofty independent
spirituality, the will to stand alone---在温度0.2下生成文本---
 the lofty independent
spirituality, the will to stand alone the present in the some the sensies of the same the some to the serience of the same and the same the distury the such a still the same the sense of the same the sensies the some the sense of the same the selfice of the present and the sensies of the same the consequents the may and the madity of the sense of the sense of the same to the serience of the section of the same the constraing to the s---在温度0.5下生成文本---
 serience of the section of the same the constraing to the sen the est such it with the same suit a compirious and the some the its most soue one a compartious are souch of the seed to the same of one one to the dignest that the enviluation of the incertaint, which all the self are meant to fartion of disture the world as a to in the sectiation of the person the sertion in the moral on fulther the contrad become some can m

KeyboardInterrupt: 