In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
# 检查 CUDA 是否可用，并设置设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 定义模型列表（Hugging Face 模型 + 本地模型）
model_names = [
    "facebook/roberta-hate-speech-dynabench-r4-target",
    "tomh/toxigen_roberta",
    "./hate_speech_model"  # 本地模型路径
]

# 初始化存储模型、tokenizer 和 pipeline 的列表
models = []
tokenizers = []
pipelines = []

# 加载所有模型
for name in model_names:
    try:
        print(f"Loading model: {name}")

        # 加载 tokenizer 和 model
        tokenizer = AutoTokenizer.from_pretrained(name)
        model = AutoModelForSequenceClassification.from_pretrained(name)

        # 将模型移动到 GPU（如果可用）
        model = model.to(device)

        # 创建 pipeline
        clf = pipeline(
            "text-classification",
            model=model,
            tokenizer=tokenizer,
            device=0 if device.type == "cuda" else -1  # 0=GPU, -1=CPU
        )

        # 存储到列表
        tokenizers.append(tokenizer)
        models.append(model)
        pipelines.append(clf)

        print(f"Successfully loaded: {name}")
    except Exception as e:
        print(f"Failed to load {name}: {str(e)}")


Using device: cuda
Loading model: facebook/roberta-hate-speech-dynabench-r4-target
Successfully loaded: facebook/roberta-hate-speech-dynabench-r4-target
Loading model: tomh/toxigen_roberta
Successfully loaded: tomh/toxigen_roberta
Loading model: ./hate_speech_model
Successfully loaded: ./hate_speech_model


In [11]:
from collections import Counter

def detect_hate_speech(text, pipelines, weights=None , threshold=0.5):
    try:
        text = text[:512]  # 截断过长文本
        scores = []
        for idx, clf in enumerate(pipelines):
            output = clf(text)[0]
            label_str = output['label'].lower()
            score = output['score']
            prob_hate = score if 'hate' in label_str or '1' in label_str else 1 - score
            weight = weights[idx] if weights else 1
            scores.append(prob_hate * weight)

        avg_score = sum(scores) / (sum(weights) if weights else len(scores))
        return (1 if avg_score > threshold else 0), avg_score

    except Exception as e:
        print(f"[ERROR] Voting failed: {text[:30]} - {e}")
        return ('error', 0.0)


In [12]:
import pandas as pd
import csv
import warnings
from transformers import logging

# 设置transformers日志级别
logging.set_verbosity_error()

# 精确过滤特定警告
warnings.filterwarnings(
    "ignore",
    module="torch.nn.modules.module",  # 限定模块
    category=FutureWarning,
    message=".*encoder_attention_mask.*"
)

# from your_module import detect_hate_speech, pipelines

input_csv = r"news_comments1.csv"
output_csv = r"labeled_news_comments.csv"

# 写入表头（第一次写）
with open(output_csv, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    writer.writerow(["author", "score", "created", "link", "body", "label", "detect_score"])

# 按块读取，每次只加载 chunk_size 行到内存
chunk_size = 5000  # 可根据内存调大或调小
for chunk in pd.read_csv(input_csv, quoting=csv.QUOTE_ALL, chunksize=chunk_size):
    # 去掉空值和 [deleted]
    chunk = chunk[chunk["body"].notna()]
    chunk = chunk[chunk["body"] != "[deleted]"]

    # 存检测结果
    labels = []
    scores = []

    for i, text in enumerate(chunk["body"]):
        label, score = detect_hate_speech(text, pipelines)
        labels.append(label)
        scores.append(score)
        print(i, label, score)

    chunk["label"] = labels
    chunk["detect_score"] = scores

    # 追加写入
    with open(output_csv, "a", newline="", encoding="utf-8") as f:
        chunk.to_csv(f, header=False, index=False, quoting=csv.QUOTE_ALL)

    # 主动释放内存
    del chunk, labels, scores
    import gc
    gc.collect()

print(f"✅ 处理完成，结果保存在: {output_csv}")


0 0 0.3720574577649434
1 0 0.3475235501925151
2 0 0.349066694577535
3 0 0.3872978885968526
4 0 0.365392804145813
5 0 0.364283283551534
6 0 0.36238471666971844
7 0 0.3510579864184062
8 0 0.45118149121602374
9 1 0.7201560139656067
10 0 0.3527287046114604
11 0 0.36520657936731976
12 0 0.3885638515154521
13 0 0.4828110734621684
14 0 0.3822472294171651
15 0 0.3644024729728699
16 0 0.365392804145813
17 0 0.365392804145813
18 0 0.365392804145813
19 1 0.6864325801531473
20 1 0.7345754106839498
21 0 0.365392804145813
22 0 0.42275470495224
23 0 0.365392804145813
24 0 0.365392804145813
25 0 0.37149224678675336
26 1 0.658577044804891
27 0 0.36489683389663696
28 0 0.3562808434168498
29 0 0.365392804145813
30 0 0.440100093682607
31 1 0.8757382233937582
32 1 0.5047443509101868
33 0 0.365392804145813
34 0 0.3452656070391337
35 0 0.3506962060928345
36 0 0.35865165789922077
37 0 0.35058464606602985
38 0 0.35510369141896564
39 1 0.5382745862007141
40 0 0.365392804145813
41 0 0.3876192768414815
42 0 0.351