In [1]:
import random
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation
from tensorflow.keras.optimizers import RMSprop

In [87]:
# 加载诗歌数据集
poems_df = pd.read_csv("Poetry.csv").head(500)

joined_text = " ".join(poems_df["Poem"].values)

In [88]:
# 处理文本数据
joined_text = " ".join(poems_df["Poem"].values)

tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(joined_text.lower())
unique_tokens, counts = np.unique(tokens, return_counts=True)
# 仅保留出现次数最多的前5000个词汇
top_tokens = unique_tokens[np.argsort(counts)][::-1][:5000]
unique_token_index = {token: index for index, token in enumerate(top_tokens)}

In [89]:
# 设置参数
n_words = 10
input_words = []
next_word = []


In [91]:
# 准备输入和输出数据
for i in range(len(tokens) - n_words):
    input_words.append(tokens[i:i + n_words])
    next_word.append(tokens[i + n_words])

# 只取前 2000 个样本
 # input_words = input_words[:2000]
# next_word = next_word[:2000]


X = np.zeros((len(input_words), n_words, len(unique_tokens)), dtype=bool)
y = np.zeros((len(next_word), len(unique_tokens)), dtype=bool)

for i, words in enumerate(input_words):
    for j, word in enumerate(words):
        X[i, j, unique_token_index[word]] = 1
    y[i, unique_token_index[next_word[i]]] = 1


MemoryError: Unable to allocate 33.2 GiB for an array with shape (238984, 10, 14927) and data type bool

In [72]:
# 创建模型
model = Sequential()
model.add(LSTM(128, input_shape=(n_words, len(unique_tokens)), return_sequences=True))
model.add(LSTM(128))
model.add(Dense(len(unique_tokens)))
model.add(Activation('softmax'))
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

  super().__init__(**kwargs)


In [73]:
# 训练模型
history = model.fit(X, y, batch_size=64, epochs=10, shuffle=True).history
model.save("poetry_gen_model.h5")

Epoch 1/10
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 66ms/step - accuracy: 0.0619 - loss: 7.0314
Epoch 2/10
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 70ms/step - accuracy: 0.0748 - loss: 6.6352
Epoch 3/10
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 79ms/step - accuracy: 0.0812 - loss: 6.3987
Epoch 4/10
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 75ms/step - accuracy: 0.0890 - loss: 6.1754
Epoch 5/10
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 76ms/step - accuracy: 0.0968 - loss: 5.9562
Epoch 6/10
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 74ms/step - accuracy: 0.1206 - loss: 5.7084
Epoch 7/10
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 72ms/step - accuracy: 0.1360 - loss: 5.4510
Epoch 8/10
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 71ms/step - accuracy: 0.1644 - loss: 5.1704
Epoch 9/10
[1m308/308[



In [74]:
# 加载模型
model = load_model("poetry_gen_model.h5")



In [78]:
# 预测下一个单词，并排除数字
def predict_next_word(input_text, n_best):
    input_text = input_text.lower().split()
    X = np.zeros((1, n_words, len(unique_tokens)))
    for i, word in enumerate(input_text[:n_words]):
        if word in unique_token_index:
            X[0, i, unique_token_index[word]] = 1
    predictions = model.predict(X)[0]
    candidates = np.argpartition(predictions, -n_best)[-n_best:]
    filtered_candidates = [c for c in candidates if not unique_tokens[c].isdigit()]
    return filtered_candidates


In [81]:
# 生成诗句
def generate_poem(input_word, num_words, creativity=3):
    word_sequence = [input_word]
    for _ in range(num_words):
        sub_sequence = " ".join(word_sequence[-n_words:])
        try:
            candidates = predict_next_word(sub_sequence, creativity)
            choice = None
            for candidate in candidates:
                candidate_word = unique_tokens[candidate]
                if candidate_word != word_sequence[-1]:  # 避免选择最近生成的单词
                    choice = candidate_word
                    break
            if choice is None:  # 如果所有候选词都相同，则选择第一个
                choice = unique_tokens[candidates[0]]
        except:
            choice = random.choice(unique_tokens)
        word_sequence.append(choice)
    return " ".join(word_sequence)



In [82]:
# 示例生成诗句
print(generate_poem("love", 20, 5))
print(generate_poem("nature", 20, 5))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23