In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

# ======================= 1️⃣  数据加载 =======================
def load_text_data(file_paths):
    """从多个文本文件加载训练数据"""
    text_data = ""
    for file in file_paths:
        with open(file, 'r', encoding='utf-8') as f:
            text_data += f.read().lower()  # 统一小写
    return text_data

# 你可以使用多个数据集文件
file_paths = ["data1.txt", "data2.txt"]  # 这里换成你的数据文件
text = load_text_data(file_paths)

# ======================= 2️⃣  训练 BPE Tokenizer =======================
tokenizer = Tokenizer(models.BPE())  # 训练 BPE 词表
trainer = trainers.BpeTrainer(special_tokens=["<pad>", "<bos>", "<eos>"])
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.train(file_paths, trainer)

# 测试 Tokenizer
encoded = tokenizer.encode("hello world")
print("编码:", encoded.tokens)
print("解码:", tokenizer.decode(encoded.ids))

编码: ['<bos>', 'he', 'll', 'o', '<eos>']
解码: he ll o


In [7]:
# ======================= 3️⃣  构建 Transformer 解码器 =======================
class TransformerDecoder(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, num_layers, dim_feedforward, max_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, d_model))
        
        decoder_layer = nn.TransformerDecoderLayer(d_model, num_heads, dim_feedforward)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, input_seq, memory):
        seq_len = input_seq.size(1)
        embedded = self.embedding(input_seq) + self.positional_encoding[:, :seq_len, :]
        memory = self.embedding(memory) + self.positional_encoding[:, :memory.size(1), :]
        output = self.transformer_decoder(embedded, memory)
        return self.fc_out(output)

In [8]:
# 设置超参数
vocab_size = tokenizer.get_vocab_size()
d_model = 64  
num_heads = 4  
num_layers = 3  
dim_feedforward = 128  
max_len = 200  

model = TransformerDecoder(vocab_size, d_model, num_heads, num_layers, dim_feedforward, max_len)

# ======================= 4️⃣  训练数据准备 =======================
def create_training_data(text, tokenizer, seq_length=50):
    token_ids = tokenizer.encode(text).ids
    inputs, targets = [], []

    for i in range(len(token_ids) - seq_length):
        inputs.append(token_ids[i:i+seq_length])
        targets.append(token_ids[i+1:i+seq_length+1])

    return torch.tensor(inputs), torch.tensor(targets)

# 生成训练数据
inputs, targets = create_training_data(text, tokenizer)
print("训练样本:", inputs.shape, targets.shape)

训练样本: torch.Size([603, 50]) torch.Size([603, 50])


In [9]:
# 创建 PyTorch 数据加载器
dataset = TensorDataset(inputs, targets)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# ======================= 5️⃣  训练模型 =======================
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.95)

def train(model, dataloader, epochs=20):
    for epoch in range(epochs):
        for batch_inputs, batch_targets in dataloader:
            optimizer.zero_grad()
            output = model(batch_inputs, batch_inputs)
            loss = nn.CrossEntropyLoss()(output.view(-1, vocab_size), batch_targets.view(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # 避免梯度爆炸
            optimizer.step()
        scheduler.step()
        print(f"Epoch {epoch}, Loss: {loss.item()}")

# 开始训练
train(model, dataloader, epochs=20)

Epoch 0, Loss: 5.30748176574707
Epoch 1, Loss: 4.808955669403076
Epoch 2, Loss: 3.4255547523498535
Epoch 3, Loss: 2.2905361652374268
Epoch 4, Loss: 1.5883567333221436
Epoch 5, Loss: 1.3537038564682007
Epoch 6, Loss: 1.3528802394866943
Epoch 7, Loss: 1.20084810256958
Epoch 8, Loss: 1.205197811126709
Epoch 9, Loss: 1.1740862131118774
Epoch 10, Loss: 1.1421250104904175
Epoch 11, Loss: 1.1442739963531494
Epoch 12, Loss: 1.1199126243591309
Epoch 13, Loss: 1.1381272077560425
Epoch 14, Loss: 1.1888636350631714
Epoch 15, Loss: 1.0965999364852905
Epoch 16, Loss: 1.2028051614761353
Epoch 17, Loss: 1.1394095420837402
Epoch 18, Loss: 1.1190690994262695
Epoch 19, Loss: 1.1053941249847412


In [None]:
# ======================= 6️⃣  文本生成（Greedy 解码） =======================
def generate_text(model, tokenizer, start_text, max_length=100):
    model.eval()
    generated = tokenizer.encode(start_text).ids
    input_tensor = torch.tensor([generated])

    with torch.no_grad():
        for _ in range(max_length):
            output = model(input_tensor, input_tensor)
            next_token = output.argmax(dim=-1)[:, -1].item()
            generated.append(next_token)
            input_tensor = torch.tensor([generated])

    return tokenizer.decode(generated)

# 测试生成
print("生成文本:", generate_text(model, tokenizer, "hello"))

生成文本: he ll o the president t , the president that in the president that the president that the president that the president that in the president that the president that the president said . “ that the president that the president that in on the u k rain e u k a t , so that the u this is . u dz ha , with with the general , with that in u , that the president and told t a m had with u , that in the region in the r that r ussi an s , and m and
