In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
file_path = "data/noemoticon_preprocessed.csv"
df = pd.read_csv(file_path)
print(df.info())

##already do following steps
##removing HTML and brackets.
# df["text"] = df["text"].apply(helper.denoise_text)
#Expands abbreviations, e.g. "can't" → "cannot", "I'm" → "I am"
# df["text"] = df["text"].apply(helper.expand_contractions)
#Remove @someone and email addresses from text
# df["text"] = df["text"].apply(helper.remove_mentions_and_emails)
#apply futher
# df["text"] = df["text"].apply(helper.remove_non_ascii_in_text)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1597267 entries, 0 to 1597266
Data columns (total 2 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   polarity  1597267 non-null  int64 
 1   text      1597267 non-null  object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB
None


In [4]:
label_mapping = {0: 0, 4: 1} #把原始的 0/4 映射成 0/1，实现 二分类任务
df["polarity"] = df["polarity"].map(label_mapping)

In [5]:
df["tokens"] = df["text"].apply(word_tokenize) #base on the experience, directly use nftk's word_tokenize

In [6]:
counter = Counter()
for tokens in df["tokens"]:  
    counter.update(tokens)

In [7]:
max_vocab_size = 50000  
# 限制词汇表大小，防止低频词占用太多空间 
# based on 
#   1. Words appearing > 5 times: 50289 
#   2. Percentage of low-frequency words: 88.30%
word2idx = {"<PAD>": 0, "<UNK>": 1}  # 预留特殊标记
for i, (word, freq) in enumerate(counter.most_common(max_vocab_size - 2), start=2):
    word2idx[word] = i

In [8]:
# **将文本转换为索引**
def tokens_to_ids(tokens, word2idx):
    return [word2idx.get(token, word2idx["<UNK>"]) for token in tokens]

df["token_ids"] = df["tokens"].apply(lambda x: tokens_to_ids(x, word2idx))

In [9]:
# **训练/测试数据集划分**
train_x, test_x, train_y, test_y = train_test_split(
    df["token_ids"].tolist(), df["polarity"].tolist(),
    test_size=0.2, random_state=34, stratify=df["polarity"]
)

In [10]:
# **PyTorch Dataset**
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = [torch.tensor(t, dtype=torch.long) for t in texts]
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

In [11]:
# **Padding Collate Function**
def collate_fn(batch):
    texts, labels = zip(*batch)
    texts_padded = pad_sequence(texts, batch_first=True, padding_value=word2idx['<PAD>'])
    labels = torch.tensor(labels, dtype=torch.long)
    return texts_padded.to(device), labels.to(device)

In [12]:
# **创建 DataLoader**
batch_size = 128
train_dataset = TextDataset(train_x, train_y)
test_dataset = TextDataset(test_x, test_y)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [13]:
# **构建 RNN 模型**
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super(RNNClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=word2idx['<PAD>'])
        # self.gru = nn.GRU(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers=2, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.gru(x)
        x = self.dropout(x[:, -1, :])  # 取最后一个时间步的输出
        x = self.fc(x)
        return x

In [14]:
# **超参数**
embed_dim = 128
# hidden_dim = 64
hidden_dim = 128
num_classes = 2

In [15]:
# **初始化模型**
model = RNNClassifier(vocab_size=len(word2idx), embed_dim=embed_dim, hidden_dim=hidden_dim, num_classes=num_classes)
model.to(device)

# **损失函数 & 优化器**
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [16]:
# **训练模型**
# num_epochs = 5
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_texts, batch_labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)

        optimizer.zero_grad()
        outputs = model(batch_texts)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

Epoch 1/10:   0%|          | 0/9983 [00:00<?, ?it/s]

Epoch 1/10: 100%|██████████| 9983/9983 [01:41<00:00, 98.68it/s] 


Epoch 1, Loss: 0.4123


Epoch 2/10: 100%|██████████| 9983/9983 [01:31<00:00, 108.63it/s]


Epoch 2, Loss: 0.3509


Epoch 3/10: 100%|██████████| 9983/9983 [01:30<00:00, 110.38it/s]


Epoch 3, Loss: 0.3239


Epoch 4/10: 100%|██████████| 9983/9983 [01:27<00:00, 114.56it/s]


Epoch 4, Loss: 0.3001


Epoch 5/10: 100%|██████████| 9983/9983 [01:26<00:00, 115.02it/s]


Epoch 5, Loss: 0.2785


Epoch 6/10: 100%|██████████| 9983/9983 [01:26<00:00, 115.19it/s]


Epoch 6, Loss: 0.2601


Epoch 7/10: 100%|██████████| 9983/9983 [01:26<00:00, 115.16it/s]


Epoch 7, Loss: 0.2447


Epoch 8/10: 100%|██████████| 9983/9983 [01:26<00:00, 115.01it/s]


Epoch 8, Loss: 0.2327


Epoch 9/10: 100%|██████████| 9983/9983 [01:26<00:00, 115.30it/s]


Epoch 9, Loss: 0.2230


Epoch 10/10: 100%|██████████| 9983/9983 [01:26<00:00, 115.28it/s]

Epoch 10, Loss: 0.2171





In [18]:
# **评估模型**
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch_texts, batch_labels in test_loader:
        batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)

        outputs = model(batch_texts)
        _, preds = torch.max(outputs, 1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

# **计算 Precision, Recall, F1-score**
print(classification_report(all_labels, all_preds, target_names=["Negative", "Positive"]))

              precision    recall  f1-score   support

    Negative       0.83      0.83      0.83    159702
    Positive       0.83      0.83      0.83    159752

    accuracy                           0.83    319454
   macro avg       0.83      0.83      0.83    319454
weighted avg       0.83      0.83      0.83    319454



### Try to improve accuracy, check 4.1-RNN-improved.ipynb