In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm
import gensim.downloader as api
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

file_path = "data/noemoticon_preprocessed.csv"
df = pd.read_csv(file_path)
label_mapping = {0: 0, 4: 1}  # 0/4 映射为 0/1
df["polarity"] = df["polarity"].map(label_mapping)

df["tokens"] = df["text"].apply(word_tokenize) #base on the experience, directly use nftk's word_tokenize

Using device: cuda


In [3]:
counter = Counter()
for tokens in df["tokens"]:  
    counter.update(tokens)

# max_vocab_size = 50000
max_vocab_size = 100000
# 限制词汇表大小，防止低频词占用太多空间 
# based on 
#   1. Words appearing > 5 times: 50289 
#   2. Percentage of low-frequency words: 88.30%
word2idx = {"<PAD>": 0, "<UNK>": 1}  # 预留特殊标记
for i, (word, freq) in enumerate(counter.most_common(max_vocab_size - 2), start=2):
    word2idx[word] = i

# **将文本转换为索引**
def tokens_to_ids(tokens, word2idx):
    return [word2idx.get(token, word2idx["<UNK>"]) for token in tokens]

df["token_ids"] = df["tokens"].apply(lambda x: tokens_to_ids(x, word2idx))


In [4]:
# **训练/测试数据集划分**
train_x, temp_x, train_y, temp_y = train_test_split(df["token_ids"].tolist(), df["polarity"].tolist(),test_size=0.7, random_state=34, stratify=df["polarity"])
val_x, test_x, val_y, test_y = train_test_split(temp_x, temp_y, test_size=0.5, random_state=34, stratify=temp_y)

In [5]:
# 数据集类
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = [torch.tensor(t, dtype=torch.long) for t in texts]
        self.labels = torch.tensor(labels, dtype=torch.long)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

def collate_fn(batch):
    texts, labels = zip(*batch)
    lengths = torch.tensor([len(t) for t in texts], dtype=torch.long)
    texts_padded = pad_sequence(texts, batch_first=True, padding_value=0)
    return texts_padded, lengths, torch.tensor(labels, dtype=torch.long)

In [6]:
# **创建 DataLoader**
batch_size = 128
train_dataset = TextDataset(train_x, train_y)
val_dataset = TextDataset(val_x, val_y)
test_dataset = TextDataset(test_x, test_y)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


In [7]:
# 加载 GloVe 预训练词向量
glove_model = api.load("glove-twitter-200")
embedding_dim = 200
embedding_matrix = np.zeros((max_vocab_size, embedding_dim))
for word, idx in word2idx.items():
    if word in glove_model:
        embedding_matrix[idx] = glove_model[word]
embedding_tensor = torch.tensor(embedding_matrix, dtype=torch.float32)

In [8]:
# 构建 RNN 模型
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=2):
        super(RNNClassifier, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_tensor, freeze=False)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, 1)  # 输出维度调整为 1
        self.dropout = nn.Dropout(0.5)

    def forward(self, x, lengths):
        x = self.embedding(x)
        lengths, perm_idx = lengths.sort(0, descending=True)
        x = x[perm_idx]
        x = pack_padded_sequence(x, lengths.cpu(), batch_first=True)
        
        # 获取隐藏状态
        packed_output, (h_n, c_n) = self.lstm(x)
        
        # 提取双向最终隐藏状态
        # h_n 形状: (num_layers * 2, batch_size, hidden_size)
        h_n = h_n.view(self.lstm.num_layers, 2, -1, self.lstm.hidden_size)
        forward_final = h_n[-1, 0, :, :]  # 最后一层的前向隐藏状态
        backward_final = h_n[-1, 1, :, :]  # 最后一层的后向隐藏状态
        x = torch.cat([forward_final, backward_final], dim=1)
        
        x = self.dropout(x)
        x = torch.sigmoid(self.fc(x)).squeeze(-1)
        
        # 还原输入顺序
        _, unperm_idx = perm_idx.sort(0)
        x = x[unperm_idx]
        return x


In [9]:
# 超参数
hidden_dim = 128
num_classes = 1
num_epochs = 10

# 初始化模型
model = RNNClassifier(max_vocab_size, embedding_dim, hidden_dim, num_classes).to(device)

# 损失函数 & 优化器 & 学习率衰减
criterion = nn.BCELoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)


# 确保标签转换为 float
def prepare_labels(labels):
    return labels.float()

def get_predictions(outputs):
    return (outputs > 0.5).long()

In [10]:
# 训练循环
best_acc = 0
patience = 3

# device = torch.device("cpu")

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    # torch.cuda.empty_cache() 
    
    for batch_texts, batch_lengths, batch_labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        batch_texts, batch_labels, batch_lengths = batch_texts.to(device), batch_labels.to(device), batch_lengths.to(device)

        # # 🚀 **检查 batch_labels 是否超出范围**
        # print("Unique batch labels:", batch_labels.unique())
        # assert batch_labels.max() < num_classes, f"类别索引超出范围: {batch_labels.unique()}"

        # # 🚀 **确保 batch 长度非零**
        # if batch_lengths.max() == 0:
        #     continue  # 🟢 跳过空 batch

        optimizer.zero_grad()
        outputs = model(batch_texts, batch_lengths)
        # loss = criterion(outputs, batch_labels)
        loss = criterion(outputs, batch_labels.float())  # 转换为 float
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    scheduler.step()
    
    # 评估模型
    correct, total = 0, 0
    model.eval()
    with torch.no_grad():
        for batch_texts, batch_lengths, batch_labels in val_loader:
            batch_texts, batch_labels, batch_lengths = batch_texts.to(device), batch_labels.to(device), batch_lengths.to(device)
            outputs = model(batch_texts, batch_lengths)
            preds = get_predictions(outputs)
            correct += (preds == batch_labels).sum().item()
            total += batch_labels.size(0)
    acc = correct / total
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}, Val Accuracy: {acc:.4f}")
    if acc > best_acc:
        best_acc = acc
        patience = 3
    else:
        patience -= 1
        if patience == 0:
            print("Early stopping 触发")
            break

print("训练完成，最佳验证集准确率:", best_acc)

Epoch 1/10: 100%|██████████| 3744/3744 [01:24<00:00, 44.24it/s]


Epoch 1, Loss: 0.4488, Val Accuracy: 0.8103


Epoch 2/10: 100%|██████████| 3744/3744 [01:22<00:00, 45.53it/s]


Epoch 2, Loss: 0.3990, Val Accuracy: 0.8197


Epoch 3/10: 100%|██████████| 3744/3744 [01:23<00:00, 44.87it/s]


Epoch 3, Loss: 0.3773, Val Accuracy: 0.8263


Epoch 4/10: 100%|██████████| 3744/3744 [01:22<00:00, 45.21it/s]


Epoch 4, Loss: 0.3601, Val Accuracy: 0.8302


Epoch 5/10: 100%|██████████| 3744/3744 [05:24<00:00, 11.55it/s]


Epoch 5, Loss: 0.3461, Val Accuracy: 0.8287


Epoch 6/10: 100%|██████████| 3744/3744 [03:50<00:00, 16.22it/s]


Epoch 6, Loss: 0.3301, Val Accuracy: 0.8323


Epoch 7/10: 100%|██████████| 3744/3744 [01:20<00:00, 46.29it/s]


Epoch 7, Loss: 0.3229, Val Accuracy: 0.8320


Epoch 8/10: 100%|██████████| 3744/3744 [01:20<00:00, 46.31it/s]


Epoch 8, Loss: 0.3163, Val Accuracy: 0.8324


Epoch 9/10: 100%|██████████| 3744/3744 [01:21<00:00, 46.20it/s]


Epoch 9, Loss: 0.3101, Val Accuracy: 0.8312


Epoch 10/10: 100%|██████████| 3744/3744 [02:48<00:00, 22.26it/s]


Epoch 10, Loss: 0.3040, Val Accuracy: 0.8297
训练完成，最佳验证集准确率: 0.8324136068245198


In [11]:
def evaluate_test(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch_texts, batch_lengths, batch_labels in tqdm(test_loader, desc="Testing"):
            batch_texts = batch_texts.to(device)
            batch_labels = batch_labels.to(device)
            
            outputs = model(batch_texts, batch_lengths.to(device))
            preds = (outputs > 0.5).long()
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())
    
    # 计算指标
    accuracy = (np.array(all_preds) == np.array(all_labels)).mean()
    print(f"\nTest Accuracy: {accuracy:.4f}")
    print(classification_report(all_labels, all_preds, target_names=["Negative", "Positive"]))
    return accuracy, all_preds, all_labels

# 3. 执行测试
test_acc, all_preds, all_labels = evaluate_test(model, test_loader, device)



Testing: 100%|██████████| 4368/4368 [00:53<00:00, 82.40it/s]



Test Accuracy: 0.8290
              precision    recall  f1-score   support

    Negative       0.81      0.85      0.83    279479
    Positive       0.85      0.80      0.82    279565

    accuracy                           0.83    559044
   macro avg       0.83      0.83      0.83    559044
weighted avg       0.83      0.83      0.83    559044



###even support transfer learning, no improvment.