In [3]:
# LSTM下的名字分类

In [5]:
import torch

# 选择cpu or gpu
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')

torch.set_default_device(device)
print(f"Using device = {torch.get_default_device()}")

Using device = cuda:0


In [6]:
import string
import unicodedata

# Alphabet the model understands: ASCII letters, a few punctuation marks, and
# a trailing "_" that stands in for any character outside the alphabet.
allowed_characters = "".join([string.ascii_letters, " .,;'", "_"])
n_letters = len(allowed_characters)

# Convert a Unicode string to plain ASCII.
def unicodeToAscii(s):
    """Strip accents from `s` and drop characters outside `allowed_characters`.

    The string is NFD-normalized so that accented letters decompose into a
    base letter followed by combining marks. Combining marks (Unicode
    category 'Mn') are discarded, and so is every remaining character that is
    not part of `allowed_characters`.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue
        if ch in allowed_characters:
            kept.append(ch)
    return ''.join(kept)

In [7]:
# Look up the index of a character in the alphabet.
def letterToIndex(letter):
    """Return the position of `letter` within `allowed_characters`.

    A character that is not part of the alphabet maps to the index of "_".
    """
    position = allowed_characters.find(letter)
    if position == -1:
        position = allowed_characters.find("_")
    return position
# Turn an input string into a one-hot tensor.
def lineToTensor(line):
    """One-hot encode `line`, one row per character.

    Returns a tensor of shape (len(line), 1, n_letters) that is zero
    everywhere except at the alphabet index of each character (as given by
    `letterToIndex`), where it is 1.
    """
    one_hot = torch.zeros(len(line), 1, n_letters)
    for position in range(len(line)):
        one_hot[position, 0, letterToIndex(line[position])] = 1
    return one_hot

In [8]:
from io import open
import glob
import os
import time

import torch
from torch.utils.data import Dataset

In [9]:
class NamesDataset(Dataset):
    """Dataset of (name, label) pairs read from ``<data_dir>/*.txt``.

    Each ``.txt`` file contributes one label (the file name without its
    extension) and one sample per non-empty line. Only the raw strings are
    kept in memory; the one-hot tensors are built on demand in
    ``__getitem__``.

    Attributes:
        data_dir: Directory the samples were read from (provenance).
        load_time: ``time.localtime()`` captured when the dataset was built.
        samples: List of ``(name, label)`` string tuples.
        label_to_idx: Mapping from label string to integer class index.
        idx_to_label: Sorted list of labels; position equals class index.
    """

    def __init__(self, data_dir):
        self.data_dir = data_dir  # kept to record where the data came from
        self.load_time = time.localtime()  # kept to record when it was loaded

        # Store only lightweight (name, label) string pairs, never tensors.
        self.samples = []

        # glob returns matches in arbitrary, filesystem-dependent order.
        # Sorting keeps sample indices (and therefore any seeded split of
        # this dataset) identical across machines and runs.
        text_files = sorted(glob.glob(os.path.join(data_dir, '*.txt')))

        for filename in text_files:
            label = os.path.splitext(os.path.basename(filename))[0]
            with open(filename, encoding='utf-8') as f:
                for line in f:
                    name = line.strip()
                    if name:  # skip blank lines
                        self.samples.append((name, label))

        # Build the label <-> index mappings once, in sorted label order.
        unique_labels = sorted(set(label for _, label in self.samples))
        self.label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
        self.idx_to_label = unique_labels  # reverse lookup: index -> label

    def __len__(self):
        """Return the number of (name, label) samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(label_tensor, data_tensor, label, name)`` for sample `idx`.

        ``data_tensor`` is produced by ``lineToTensor(name)`` at access time;
        ``label_tensor`` is a 1-element int64 tensor holding the class index.
        """
        name, label = self.samples[idx]
        data_tensor = lineToTensor(name)  # computed lazily, per access
        label_tensor = torch.tensor([self.label_to_idx[label]], dtype=torch.long)
        return label_tensor, data_tensor, label, name

In [10]:
# Build the dataset from the *.txt files under data/names
# (one label per file, one name per line).
alldata = NamesDataset("data/names")

In [11]:
# Split the data into a training set (85%) and a test set (15%).
# The generator is seeded so the split is repeatable, and it is created on
# `device` to match the default device used for tensor creation.
train_set, test_set = torch.utils.data.random_split(alldata, [.85, .15], generator=torch.Generator(device=device).manual_seed(2024))

# NOTE: the message below labels `test_set` as "validation examples".
print(f"train examples = {len(train_set)}, validation examples = {len(test_set)}")

train examples = 17063, validation examples = 3011


In [12]:
import torch.nn as nn
import torch.nn.functional as F

class CharLSTM(nn.Module):
    """Character-level LSTM classifier.

    A single-layer LSTM reads a (seq_len, batch, input_size) sequence, and the
    final hidden state is projected to `output_size` log-probabilities.

    Args:
        input_size: Size of each input vector (alphabet size for one-hot input).
        hidden_size: Width of the LSTM hidden state.
        output_size: Number of target classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Recurrent layer; inputs are time-major: (seq_len, batch, input_size).
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=False)

        # Output layer: final hidden state -> class scores.
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_seq, lengths=None):
        """Classify a batch of sequences.

        Args:
            input_seq: Tensor of shape (seq_len, batch, input_size), zero-padded
                along the time axis when sequences differ in length.
            lengths: Optional true length of every sequence in the batch
                (tensor or sequence of ints, in batch order). When given, the
                padded steps are skipped via sequence packing.

        Returns:
            Tensor of shape (batch, output_size) with log-probabilities, in the
            same batch order as `input_seq`.
        """
        if lengths is not None:
            # Imported here so the class does not depend on an import that
            # lives in a later notebook cell.
            from torch.nn.utils.rnn import pack_padded_sequence

            # pack_padded_sequence requires the lengths as a CPU int64 tensor,
            # regardless of where the input data lives.
            lengths = torch.as_tensor(lengths, dtype=torch.long, device="cpu")

            # enforce_sorted=False lets PyTorch sort by length internally and
            # hand the final hidden state back in the original batch order, so
            # no manual permutation bookkeeping is needed.
            packed_input = pack_padded_sequence(
                input_seq, lengths, batch_first=False, enforce_sorted=False
            )
            _, (hidden, _) = self.lstm(packed_input)
        else:
            # No packing: every sequence is run for the full seq_len steps.
            _, (hidden, _) = self.lstm(input_seq)

        # Hidden state of the last layer: (batch, hidden_size).
        last_hidden = hidden[-1]
        return self.softmax(self.h2o(last_hidden))

    def init_hidden(self, batch_size=1):
        """Return zeroed (hidden, cell) states of shape (1, batch_size, hidden_size).

        The states are allocated with the dtype and device of the model's
        parameters so they can be passed straight to the LSTM.
        """
        reference = self.h2o.weight
        shape = (1, batch_size, self.hidden_size)
        return reference.new_zeros(shape), reference.new_zeros(shape)

In [13]:
# Model size: hidden-state width, and one output per label in the dataset.
n_hidden = 128
n_categories = len(alldata.label_to_idx)
rnn = CharLSTM(
    input_size=n_letters,
    hidden_size=n_hidden,
    output_size=n_categories,
)
print(rnn)

CharLSTM(
  (lstm): LSTM(58, 128)
  (h2o): Linear(in_features=128, out_features=18, bias=True)
  (softmax): LogSoftmax(dim=1)
)


In [23]:
import torch
import torch.nn as nn
import random
import time
import numpy as np
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

def train_lstm(rnn, training_data, n_epoch=10, n_batch_size=64, report_every=50,
               learning_rate=0.01, criterion=nn.NLLLoss(), save_path="best_name_classify.pth"):
    """Train a sequence classifier on variable-length samples.

    Every epoch the sample order is shuffled, samples are grouped into
    mini-batches, zero-padded to a common length, and fed to the model
    together with their true lengths. The model's ``state_dict`` is written to
    `save_path` each time the epoch's average training loss reaches a new
    minimum.

    Args:
        rnn: Model called as ``rnn(padded_seqs, lengths)`` and returning
            per-class scores of shape (batch, n_classes).
        training_data: Indexable collection with ``len()``; each item is a
            ``(label_tensor, text_tensor, label, text)`` tuple where
            ``text_tensor`` has shape (seq_len, 1, input_size).
        n_epoch: Number of passes over the data.
        n_batch_size: Maximum number of samples per mini-batch.
        report_every: Print a progress line every this many epochs.
        learning_rate: Learning rate for the Adam optimizer.
        criterion: Loss applied to ``(output, labels)``.
        save_path: File the best ``state_dict`` is saved to.

    Returns:
        List with the average batch loss of every epoch.

    Raises:
        ValueError: If `training_data` is empty.
    """
    n_samples = len(training_data)
    if n_samples == 0:
        # Without this guard the per-epoch average would divide by zero.
        raise ValueError("training_data is empty; nothing to train on")

    rnn.train()
    optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
    all_losses = []

    print(f"Training on dataset with n = {len(training_data)}")
    start = time.time()

    best_loss = float('inf')
    for epoch in range(1, n_epoch + 1):
        # Visit the samples in a fresh random order every epoch.
        indices = list(range(n_samples))
        random.shuffle(indices)

        total_loss = 0.0
        num_batches = 0

        for i in range(0, n_samples, n_batch_size):
            batch_indices = indices[i:i + n_batch_size]

            batch_texts = []
            batch_labels = []
            batch_lengths = []

            for idx in batch_indices:
                label_tensor, text_tensor, _label, _text = training_data[idx]
                # (seq_len, 1, input_size) -> (seq_len, input_size)
                batch_texts.append(text_tensor.squeeze(1))
                batch_labels.append(label_tensor.reshape(-1))
                batch_lengths.append(text_tensor.size(0))

            # Concatenating already yields shape (batch,). Do not squeeze:
            # that would turn a one-sample batch into a 0-dim tensor, which
            # the loss rejects.
            labels = torch.cat(batch_labels)

            # Zero-pad to a common length: (max_seq_len, batch, input_size).
            padded_seqs = pad_sequence(batch_texts, batch_first=False, padding_value=0)

            # Sequence packing needs the lengths on the CPU even when the
            # default device is a GPU.
            lengths_tensor = torch.tensor(batch_lengths, dtype=torch.long, device="cpu")

            optimizer.zero_grad()
            output = rnn(padded_seqs, lengths_tensor)

            loss = criterion(output, labels.to(output.device))
            loss.backward()

            # Clip gradients to guard against exploding gradients.
            nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5.0)

            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        avg_loss = total_loss / num_batches
        all_losses.append(avg_loss)

        # Checkpoint whenever the average training loss improves.
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save(rnn.state_dict(), save_path)
            print(f"New best model saved at epoch {epoch} (loss={avg_loss:.4f})")

        if epoch % report_every == 0:
            elapsed = time.time() - start
            print(f"Epoch {epoch} ({epoch/n_epoch:.0%}): "
                  f"Average Loss = {avg_loss:.4f}, Time = {elapsed:.1f}s")

    return all_losses

In [25]:
# Run training and measure the total wall-clock time.
start = time.time()
all_losses = train_lstm(rnn, train_set, n_epoch=6, learning_rate=0.05, report_every=5)
end = time.time()
print(f"training took {end-start}s")

Training on dataset with n = 17063
New best model saved at epoch 1 (loss=1.0716)
New best model saved at epoch 2 (loss=0.6916)
New best model saved at epoch 3 (loss=0.5907)
New best model saved at epoch 4 (loss=0.5497)
New best model saved at epoch 5 (loss=0.5254)
Epoch 5 (83%): Average Loss = 0.5254, Time = 63.3s
New best model saved at epoch 6 (loss=0.4992)
training took 78.36218023300171s


In [26]:
# Load the trained parameters from the checkpoint file; the name matches
# train_lstm's default `save_path`.
rnn.load_state_dict(torch.load('best_name_classify.pth', weights_only=True))
rnn.eval();  # switch to evaluation mode (trailing ';' hides the cell output)

In [27]:
def predict_name(model, name, dataset):
    """Predict the label of a single name.

    Args:
        model: Classifier called as ``model(input_batch, lengths)`` and
            returning scores of shape (1, n_classes).
        name: Non-empty string to classify.
        dataset: Object whose ``idx_to_label`` maps a class index to its label.

    Returns:
        The label whose score is highest for `name`.

    Raises:
        ValueError: If `name` is empty.
    """
    if not name:
        raise ValueError("name must be a non-empty string")

    model.eval()
    with torch.no_grad():
        # lineToTensor already returns a batch of one, shaped
        # (seq_len, 1, n_letters). Adding another axis would create a 4-D
        # tensor, which is not a valid LSTM input, so only add the batch
        # axis when the encoder returned a 2-D (seq_len, n_letters) tensor.
        input_batch = lineToTensor(name)
        if input_batch.dim() == 2:
            input_batch = input_batch.unsqueeze(1)

        # Sequence length taken from the tensor itself; kept on the CPU
        # because sequence packing requires CPU lengths.
        lengths = torch.tensor([input_batch.size(0)], dtype=torch.long, device='cpu')

        # One forward pass over the whole name (not character by character).
        output = model(input_batch, lengths)

        predicted_idx = output.argmax(dim=1).item()
        return dataset.idx_to_label[predicted_idx]

In [28]:
# 3. Predict the label of a single example name.
result = predict_name(rnn, "Albert", alldata)
print(result)  # expected output: "English"

English
