Data Preparation

In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# 读取数据
df = pd.read_csv("labeled_news_comments.csv")

# 去掉 URL & 多余空格
df['body'] = df['body'].apply(lambda x: re.sub(r'http\S+|www.\S+', '', str(x)))
df['body'] = df['body'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

# 训练集 / 验证集
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['body'], df['label'],
    test_size=0.2, stratify=df['label'], random_state=42
)


In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
import numpy as np
import os

# -------------------------
# 1. EarlyStopping 类
# -------------------------
class EarlyStopping:
    def __init__(self, patience=2, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss, model, path):
        if self.best_loss is None or val_loss < self.best_loss - self.delta:
            self.best_loss = val_loss
            self.counter = 0
            torch.save(model.state_dict(), path)  # 保存最佳模型
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

# -------------------------
# 2. 数据集类
# -------------------------
class HateDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# -------------------------
# 3. 模型 & Tokenizer
# -------------------------
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# -------------------------


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

HateDataset

In [3]:
# 4. DataLoader
# -------------------------
BATCH_SIZE = 32

train_dataset = HateDataset(train_texts, train_labels, tokenizer)
val_dataset = HateDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# -------------------------
# 5. 优化器、调度器、早停
# -------------------------
EPOCHS = 3
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

early_stopping = EarlyStopping(patience=2)
writer = SummaryWriter(log_dir="runs/hate_speech")




In [4]:
# -------------------------
# 6. 训练循环
# -------------------------
for epoch in range(EPOCHS):
    # ---- Train ----
    model.train()
    train_loss = 0
    train_preds, train_labels_all = [], []

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Train]"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        train_preds.extend(preds)
        train_labels_all.extend(labels.cpu().numpy())

    train_acc = accuracy_score(train_labels_all, train_preds)
    writer.add_scalar("Loss/train", train_loss / len(train_loader), epoch)
    writer.add_scalar("Accuracy/train", train_acc, epoch)

    # ---- Validate ----
    model.eval()
    val_loss = 0
    val_preds, val_labels_all = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Val]"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            val_loss += loss.item()
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            val_preds.extend(preds)
            val_labels_all.extend(labels.cpu().numpy())

    val_acc = accuracy_score(val_labels_all, val_preds)
    writer.add_scalar("Loss/val", val_loss / len(val_loader), epoch)
    writer.add_scalar("Accuracy/val", val_acc, epoch)

    print(f"Epoch {epoch+1}: Train Acc={train_acc:.4f}, Val Acc={val_acc:.4f}")

    # ---- Early Stopping ----
    early_stopping(val_loss / len(val_loader), model, "best_model.pth")
    if early_stopping.early_stop:
        print("Early stopping triggered.")
        break

writer.close()

  return forward_call(*args, **kwargs)
Epoch 1/3 [Train]: 100%|██████████| 3742/3742 [24:10<00:00,  2.58it/s]
  return forward_call(*args, **kwargs)
Epoch 1/3 [Val]: 100%|██████████| 936/936 [01:56<00:00,  8.01it/s]


Epoch 1: Train Acc=0.9030, Val Acc=0.9210


  return forward_call(*args, **kwargs)
Epoch 2/3 [Train]: 100%|██████████| 3742/3742 [24:12<00:00,  2.58it/s]
  return forward_call(*args, **kwargs)
Epoch 2/3 [Val]: 100%|██████████| 936/936 [01:56<00:00,  8.03it/s]


Epoch 2: Train Acc=0.9521, Val Acc=0.9274


  return forward_call(*args, **kwargs)
Epoch 3/3 [Train]: 100%|██████████| 3742/3742 [24:14<00:00,  2.57it/s]
  return forward_call(*args, **kwargs)
Epoch 3/3 [Val]: 100%|██████████| 936/936 [01:56<00:00,  8.01it/s]

Epoch 3: Train Acc=0.9826, Val Acc=0.9268





In [6]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
sia = SentimentIntensityAnalyzer()

class HateDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, sentiment_analyzer, max_len=128):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.sentiment_analyzer = sentiment_analyzer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        # 情感分数
        sentiment = self.sentiment_analyzer.polarity_scores(text)
        sentiment_vector = torch.tensor([
            sentiment['neg'], sentiment['neu'], sentiment['pos'], sentiment['compound']
        ], dtype=torch.float)

        # BERT 编码
        encoding = self.tokenizer(
            text, truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'sentiment': sentiment_vector,
            'labels': torch.tensor(label, dtype=torch.long)
        }


[nltk_data] Downloading package vader_lexicon to C:\Users\Tang
[nltk_data]     Heqiang\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [7]:
from torch.utils.data import DataLoader

BATCH_SIZE = 32

train_dataset = HateDataset(train_texts, train_labels, tokenizer, sia, max_len=128)
val_dataset = HateDataset(val_texts, val_labels, tokenizer, sia, max_len=128)

# 注意：num_workers=0 防止 Windows + Jupyter 报错
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, num_workers=0)


In [8]:
from transformers import AutoModel
import torch.nn as nn

class HateSpeechModel(nn.Module):
    def __init__(self, bert_model_name, num_labels):
        super(HateSpeechModel, self).__init__()
        self.bert = AutoModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.3)
        # BERT [CLS] 输出 768 + 情感特征 4
        self.fc = nn.Linear(768 + 4, num_labels)

    def forward(self, input_ids, attention_mask, sentiment):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS]
        combined = torch.cat((cls_output, sentiment), dim=1)
        x = self.dropout(combined)
        return self.fc(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = HateSpeechModel("bert-base-uncased", num_labels=len(set(df['label']))).to(device)


In [9]:
from torch.utils.tensorboard import SummaryWriter
from torch.optim import AdamW
from tqdm import tqdm
import torch.nn.functional as F

from torch.utils.tensorboard import SummaryWriter
import datetime

# 创建日志文件夹
log_dir = f"runs/hate_speech_{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}"
writer = SummaryWriter(log_dir=log_dir)

# ======== 训练循环 ========
EPOCHS = 10
best_val_loss = float('inf')

for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")

    # ---- Train ----
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc="Training"):
        optimizer.zero_grad()
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**inputs)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = train_loss / len(train_loader)

    # ---- Validation ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)

    print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

    # ======== 写入 TensorBoard ========
    writer.add_scalar("Loss/Train", avg_train_loss, epoch)
    writer.add_scalar("Loss/Validation", avg_val_loss, epoch)
    writer.add_scalar("LR", optimizer.param_groups[0]['lr'], epoch)

    # ======== 学习率调度器更新 ========
    scheduler.step(avg_val_loss)

    # ======== 保存最佳模型 ========
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "best_model.pt")
        print("✅ Model saved.")

    # ======== Early stopping ========
    early_stopping(avg_val_loss)
    if early_stopping.early_stop:
        print("⛔ Early stopping triggered.")
        break

writer.close()


  return forward_call(*args, **kwargs)
Epoch 1/3 [Train]: 100%|██████████| 3742/3742 [24:29<00:00,  2.55it/s]
Epoch 1/3 [Val]: 100%|██████████| 936/936 [01:59<00:00,  7.82it/s]


Epoch 1/3 | Train Acc: 0.9099 | Val Acc: 0.9228


Epoch 2/3 [Train]: 100%|██████████| 3742/3742 [24:28<00:00,  2.55it/s]
Epoch 2/3 [Val]: 100%|██████████| 936/936 [01:58<00:00,  7.87it/s]


Epoch 2/3 | Train Acc: 0.9516 | Val Acc: 0.9203


Epoch 3/3 [Train]: 100%|██████████| 3742/3742 [24:24<00:00,  2.56it/s]
Epoch 3/3 [Val]: 100%|██████████| 936/936 [01:58<00:00,  7.87it/s]

Epoch 3/3 | Train Acc: 0.9804 | Val Acc: 0.9160





In [12]:
!tensorboard --logdir runs

^C


In [5]:
%load_ext tensorboard
%tensorboard --logdir runs/hate_speech


Reusing TensorBoard on port 6008 (pid 36768), started 16:13:20 ago. (Use '!kill 36768' to kill it.)

In [6]:
!kill 36768

'kill' 不是内部或外部命令，也不是可运行的程序
或批处理文件。


In [7]:
torch.save(model.state_dict(), "best_model.pt")


In [None]:
model.load_state_dict(torch.load("best_model.pt"))

In [15]:
def predict_hate_speech(text, threshold=0.5):
    """
    预测文本是否为仇恨言论

    参数:
        text (str): 要分类的文本
        threshold (float): 判定为仇恨言论的概率阈值

    返回:
        dict: 包含预测结果和置信度
    """
    # 文本编码
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)

    # 预测
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)

    # 获取概率
    prob_hate = probs[0][1].item()  # 假设索引1对应"仇恨言论"类别

    # 判定结果
    is_hate = prob_hate >= threshold
    confidence = prob_hate if is_hate else 1 - prob_hate

    return {
        "text": text,
        "is_hate_speech": bool(is_hate),
        "confidence": float(confidence),
        "hate_probability": float(prob_hate),
        "threshold": float(threshold)
    }

In [17]:
# 测试语句
test_texts = [
    "Asians are ugly!",
    "You people are disgusting and should leave this country",
    "This is just my opinion, but the policy seems unfair"
]

# 批量预测
for text in test_texts:
    result = predict_hate_speech(text)
    print(f"文本: {result['text']}")
    print(f"是否为仇恨言论: {result['is_hate_speech']} (置信度: {result['confidence']:.2f})")
    print(f"仇恨概率: {result['hate_probability']:.4f}")
    print("-" * 50)

TypeError: HateSpeechModel.forward() got an unexpected keyword argument 'token_type_ids'

In [18]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
import numpy as np
import os

# -------------------------
# 1. EarlyStopping 类
# -------------------------
class EarlyStopping:
    def __init__(self, patience=2, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss, model, path):
        if self.best_loss is None or val_loss < self.best_loss - self.delta:
            self.best_loss = val_loss
            self.counter = 0
            torch.save(model.state_dict(), path)  # 保存最佳模型
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

# -------------------------
# 2. 数据集类
# -------------------------
class HateDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# -------------------------
# 3. 模型 & Tokenizer
# -------------------------
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# -------------------------
# 4. DataLoader
# -------------------------
BATCH_SIZE = 32

train_dataset = HateDataset(train_texts, train_labels, tokenizer)
val_dataset = HateDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# -------------------------
# 5. 优化器、调度器、早停
# -------------------------
EPOCHS = 3
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

early_stopping = EarlyStopping(patience=2)
writer = SummaryWriter(log_dir="runs/hate_speech")

# -------------------------
# 6. 训练循环


OSError: 页面文件太小，无法完成操作。 (os error 1455)

In [None]:
# -------------------------
for epoch in range(EPOCHS):
    # ---- Train ----
    model.train()
    train_loss = 0
    train_preds, train_labels_all = [], []

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Train]"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        train_preds.extend(preds)
        train_labels_all.extend(labels.cpu().numpy())

    train_acc = accuracy_score(train_labels_all, train_preds)
    writer.add_scalar("Loss/train", train_loss / len(train_loader), epoch)
    writer.add_scalar("Accuracy/train", train_acc, epoch)

    # ---- Validate ----
    model.eval()
    val_loss = 0
    val_preds, val_labels_all = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Val]"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            val_loss += loss.item()
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            val_preds.extend(preds)
            val_labels_all.extend(labels.cpu().numpy())

    val_acc = accuracy_score(val_labels_all, val_preds)
    writer.add_scalar("Loss/val", val_loss / len(val_loader), epoch)
    writer.add_scalar("Accuracy/val", val_acc, epoch)

    print(f"Epoch {epoch+1}: Train Acc={train_acc:.4f}, Val Acc={val_acc:.4f}")

    # ---- Early Stopping ----
    early_stopping(val_loss / len(val_loader), model, "best_model.pth")
    if early_stopping.early_stop:
        print("Early stopping triggered.")
        break

writer.close()
