In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
import re

In [14]:
# 读取数据
with open('data.txt', 'r') as f:
    text = f.read().lower()

# 数据预处理
def preprocess(text):
    text = re.sub(r'[^a-z\s]', '', text)  # 只保留字母和空格
    words = text.split()
    return words

words = preprocess(text)
vocab = sorted(set(words))
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for i, word in enumerate(vocab)}
n_vocab = len(vocab)

In [16]:
vocab[:10]

['a', 'again', 'ago', 'all', 'along', 'and', 'as', 'away', 'back', 'before']

In [19]:
word_to_ix

{'a': 0,
 'again': 1,
 'ago': 2,
 'all': 3,
 'along': 4,
 'and': 5,
 'as': 6,
 'away': 7,
 'back': 8,
 'before': 9,
 'best': 10,
 'breaking': 11,
 'but': 12,
 'by': 13,
 'can': 14,
 'changed': 15,
 'clearly': 16,
 'come': 17,
 'cry': 18,
 'do': 19,
 'each': 20,
 'even': 21,
 'every': 22,
 'favorite': 23,
 'fine': 24,
 'for': 25,
 'friend': 26,
 'get': 27,
 'gone': 28,
 'good': 29,
 'had': 30,
 'happy': 31,
 'has': 32,
 'heart': 33,
 'her': 34,
 'hes': 35,
 'how': 36,
 'i': 37,
 'id': 38,
 'in': 39,
 'it': 40,
 'its': 41,
 'just': 42,
 'lang': 43,
 'like': 44,
 'listen': 45,
 'long': 46,
 'looking': 47,
 'lost': 48,
 'love': 49,
 'made': 50,
 'make': 51,
 'makes': 52,
 'me': 53,
 'melodies': 54,
 'melt': 55,
 'memories': 56,
 'memorize': 57,
 'more': 58,
 'much': 59,
 'my': 60,
 'not': 61,
 'of': 62,
 'old': 63,
 'on': 64,
 'once': 65,
 'part': 66,
 'played': 67,
 'radio': 68,
 'rather': 69,
 'really': 70,
 'sad': 71,
 'seem': 72,
 'shalalala': 73,
 'shines': 74,
 'shingalingaling': 75,

In [20]:
ix_to_word

{0: 'a',
 1: 'again',
 2: 'ago',
 3: 'all',
 4: 'along',
 5: 'and',
 6: 'as',
 7: 'away',
 8: 'back',
 9: 'before',
 10: 'best',
 11: 'breaking',
 12: 'but',
 13: 'by',
 14: 'can',
 15: 'changed',
 16: 'clearly',
 17: 'come',
 18: 'cry',
 19: 'do',
 20: 'each',
 21: 'even',
 22: 'every',
 23: 'favorite',
 24: 'fine',
 25: 'for',
 26: 'friend',
 27: 'get',
 28: 'gone',
 29: 'good',
 30: 'had',
 31: 'happy',
 32: 'has',
 33: 'heart',
 34: 'her',
 35: 'hes',
 36: 'how',
 37: 'i',
 38: 'id',
 39: 'in',
 40: 'it',
 41: 'its',
 42: 'just',
 43: 'lang',
 44: 'like',
 45: 'listen',
 46: 'long',
 47: 'looking',
 48: 'lost',
 49: 'love',
 50: 'made',
 51: 'make',
 52: 'makes',
 53: 'me',
 54: 'melodies',
 55: 'melt',
 56: 'memories',
 57: 'memorize',
 58: 'more',
 59: 'much',
 60: 'my',
 61: 'not',
 62: 'of',
 63: 'old',
 64: 'on',
 65: 'once',
 66: 'part',
 67: 'played',
 68: 'radio',
 69: 'rather',
 70: 'really',
 71: 'sad',
 72: 'seem',
 73: 'shalalala',
 74: 'shines',
 75: 'shingalingaling',

In [21]:

# 创建训练数据
def create_sequences(words, seq_length):
    sequences = []
    for i in range(len(words) - seq_length):
        seq = words[i:i + seq_length]
        label = words[i + seq_length]
        sequences.append((seq, label))
    return sequences

seq_length = 5
sequences = create_sequences(words, seq_length)

In [23]:
sequences[:5]  #5个词一组,label为下一个词

[(['yesterday', 'once', 'more', 'when', 'i'], 'was'),
 (['once', 'more', 'when', 'i', 'was'], 'young'),
 (['more', 'when', 'i', 'was', 'young'], 'id'),
 (['when', 'i', 'was', 'young', 'id'], 'listen'),
 (['i', 'was', 'young', 'id', 'listen'], 'to')]

In [24]:
class RNNLanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(RNNLanguageModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embeddings(x)
        out, _ = self.rnn(x)  #输出：out:每个时间步的预测值，_: 隐藏状态向量h
        out = self.fc(out[:, -1, :])  # 只取最后一个时间步的输出
        return out

In [25]:
# 超参数
embedding_dim = 50
hidden_dim = 100
num_epochs = 100
learning_rate = 0.001

# 数据集和数据加载器
class TextDataset(Dataset):
    def __init__(self, sequences, word_to_ix):
        self.sequences = sequences
        self.word_to_ix = word_to_ix

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq, label = self.sequences[idx]
        seq_tensor = torch.tensor([self.word_to_ix[word] for word in seq], dtype=torch.long)
        label_tensor = torch.tensor(self.word_to_ix[label], dtype=torch.long)
        return seq_tensor, label_tensor

In [28]:
dataset = TextDataset(sequences, word_to_ix)   #样本索引化
data_loader = DataLoader(dataset, batch_size=16, shuffle=True)  #batch_size划分
list(dataset)[:10]

[(tensor([107,  65,  58, 100,  37]), tensor(97)),
 (tensor([ 65,  58, 100,  37,  97]), tensor(108)),
 (tensor([ 58, 100,  37,  97, 108]), tensor(38)),
 (tensor([100,  37,  97, 108,  38]), tensor(45)),
 (tensor([ 37,  97, 108,  38,  45]), tensor(94)),
 (tensor([ 97, 108,  38,  45,  94]), tensor(87)),
 (tensor([108,  38,  45,  94,  87]), tensor(68)),
 (tensor([38, 45, 94, 87, 68]), tensor(96)),
 (tensor([45, 94, 87, 68, 96]), tensor(25)),
 (tensor([94, 87, 68, 96, 25]), tensor(60))]

In [29]:
# 模型、损失函数和优化器
model = RNNLanguageModel(n_vocab, embedding_dim, hidden_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# 训练过程
for epoch in range(num_epochs):
    for seq_tensor, label_tensor in data_loader:
        optimizer.zero_grad()
        output = model(seq_tensor)
        loss = criterion(output, label_tensor)
        loss.backward()
        optimizer.step()
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [10/100], Loss: 1.8607
Epoch [20/100], Loss: 0.3517
Epoch [30/100], Loss: 0.0936
Epoch [40/100], Loss: 0.0725
Epoch [50/100], Loss: 0.0393
Epoch [60/100], Loss: 0.1511
Epoch [70/100], Loss: 0.1534
Epoch [80/100], Loss: 0.0179
Epoch [90/100], Loss: 0.0127
Epoch [100/100], Loss: 0.1553


In [12]:
def generate_sentence(model, start_word, length):
    model.eval()
    words = [start_word]
    input_seq = torch.tensor([[word_to_ix[start_word]]], dtype=torch.long) #起始词索引化

    for _ in range(length):  #指定长度的句子
        with torch.no_grad():
            output = model(input_seq)
            _, predicted = torch.max(output, dim=1) #第二个维度上的最大值及其索引
            next_word = ix_to_word[predicted.item()]  #索引对应的单词
            words.append(next_word) #串入words

            # 更新输入序列
            input_seq = torch.cat([input_seq, predicted.view(1, 1)], dim=1)

    return ' '.join(words)

# 生成句子示例
start_word = 'it'  # 你可以选择任何在词汇表中的单词,已全部小写
sentence_length = 10
generated_sentence = generate_sentence(model, start_word, sentence_length)
print(generated_sentence)

it was in years gone by and the good times that
