In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from collections import Counter

# 1. 加载数据
data = pd.read_csv('data/train.txt', sep='\t', header=None)  # 请替换为你的数据文件路径
data.columns = ['label', 'text']  # 设置列名
texts = data['text'].values.tolist()  # 文本列
labels = data['label'].values.tolist()  # 标签列

# 2. 数据预处理
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# 3. 编码标签
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)  # 转换标签为数字

# 4. 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# 5. 文本向量化
tokenizer = lambda x: x.split()  # 简单的分词
all_tokens = [token for text in X_train for token in tokenizer(text)]
counter = Counter(all_tokens)

# 创建词汇表，限制词汇表大小
vocab = {token: idx + 1 for idx, (token, _) in enumerate(counter.most_common(5000))}  # 从1开始索引
vocab['<PAD>'] = 0  # 添加填充标记

def encode_text(text):
    return torch.tensor([vocab.get(token, 0) for token in tokenizer(text)], dtype=torch.long)  # 未知的词汇映射到0

X_train_encoded = [encode_text(text) for text in X_train]
X_test_encoded = [encode_text(text) for text in X_test]

train_dataset = TextDataset(X_train_encoded, torch.tensor(y_train, dtype=torch.long))
test_dataset = TextDataset(X_test_encoded, torch.tensor(y_test, dtype=torch.long))

# 6. 创建数据加载器
def collate_fn(batch):
    texts, labels = zip(*batch)  # 解压批次中的文本和标签
    texts = pad_sequence(texts, batch_first=True, padding_value=vocab['<PAD>'])  # 填充文本
    labels = torch.tensor(labels, dtype=torch.long)  # 转换标签为 Tensor
    return texts, labels

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

# 7. 搭建 BiLSTM 模型
class BiLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(BiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # 因为是双向，乘以 2

    def forward(self, x):
        x = self.embedding(x)  # 输入嵌入
        x, (hn, cn) = self.lstm(x)  # LSTM 输出
        
        # 从最后一层提取最后时间步的隐藏状态
        x = torch.cat((hn[-1], hn[-2]), dim=1)  # 连接最后一层的两个方向的隐藏状态
        
        x = self.fc(x)  # 使用最后连接的隐藏状态进行分类
        return x

In [2]:
# 8. 初始化模型
model = BiLSTMModel(vocab_size=len(vocab), embedding_dim=128, hidden_dim=128, output_dim=2)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 9. 训练模型
for epoch in range(10):
    model.train()
    for batch_texts, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_texts)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch + 1}, Loss: {loss.item():.4f}')

Epoch 1, Loss: 0.0738
Epoch 2, Loss: 0.0139
Epoch 3, Loss: 0.0870
Epoch 4, Loss: 0.0012
Epoch 5, Loss: 0.0004
Epoch 6, Loss: 0.0006
Epoch 7, Loss: 0.0075
Epoch 8, Loss: 0.0002
Epoch 9, Loss: 0.0001
Epoch 10, Loss: 0.0000


In [3]:
# 10. 评估模型
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch_texts, batch_labels in test_loader:
        outputs = model(batch_texts)
        _, predicted = torch.max(outputs, 1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy:.4f}')

# 11. 做预测
model.eval()
predictions = []

with torch.no_grad():
    for batch_texts, _ in test_loader:  # 只提取文本
        outputs = model(batch_texts)  # batch_texts 是 Tensor
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.numpy())

# 可以使用 sklearn 来计算更多评估指标
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

Test Accuracy: 0.9740
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       580
           1       0.97      0.99      0.98       804

    accuracy                           0.97      1384
   macro avg       0.98      0.97      0.97      1384
weighted avg       0.97      0.97      0.97      1384

