In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

# Load the tab-separated MediaEval 2015 training and test files.
train_data = pd.read_csv("mediaeval-2015-trainingset.txt", sep="\t")
test_data = pd.read_csv("mediaeval-2015-testset.txt", sep="\t")

# Pull out the tweet text and binarise the label column:
# "real" -> 1, every other value -> 0.
train_texts = train_data["tweetText"].tolist()
train_labels = [1 if tag == "real" else 0 for tag in train_data["label"]]

test_texts = test_data["tweetText"].tolist()
test_labels = [1 if tag == "real" else 0 for tag in test_data["label"]]

# Hold out 20% of the training rows as a validation split (fixed seed).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    random_state=42,
    test_size=0.2,
)


In [2]:
from transformers import BertTokenizer
from torch.utils.data import Dataset

# Local directory holding the pretrained BERT files.
# Must be a raw string: in a regular literal the "\b" of "\bert-base" is the
# backspace escape, which silently corrupts this Windows path (and "\p" / "\S"
# are invalid escape sequences).
local_path = r"F:\python\SVM_for_FakeNewsDetection-master\bert-base"

# Load the pretrained BERT tokenizer from the files stored under local_path.
tokenizer = BertTokenizer.from_pretrained(local_path)

class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable sequence of input strings.
        labels: Indexable sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")`` and
            returning a mapping with ``"input_ids"`` and ``"attention_mask"``
            tensors. Those tensors are expected to carry a leading batch axis
            of size 1.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return it with its label.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (the tokenizer
            output without its leading batch axis) and ``"labels"`` (a 0-d
            ``torch.long`` tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Drop only the leading batch axis. A bare squeeze() removes every
        # size-1 axis, so with max_length == 1 it would also collapse the
        # sequence axis and hand a 0-d tensor to the collate step.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

# Wrap each split in a FakeNewsDataset; all three share the tokenizer and
# the same max_length of 128.
train_dataset, val_dataset, test_dataset = (
    FakeNewsDataset(split_texts, split_labels, tokenizer, max_length=128)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import BertForSequenceClassification
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained BERT weights with a two-label classification head.
model = BertForSequenceClassification.from_pretrained(local_path, num_labels=2)

# Move the model to the selected device; as the last expression of the cell,
# the call's return value is what the notebook displays.
model.to(device)


  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at F:\python\SVM_for_FakeNewsDetection-master\models and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [4]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

# Mini-batch loaders for the two splits; only the training batches are shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16)

# Optimizer over every model parameter.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Fine-tuning loop.
epochs = 10

for epoch in range(epochs):
    model.train()
    total_loss, correct = 0, 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        # Move the tensors the model consumes onto the training device.
        inputs = {
            key: batch[key].to(device)
            for key in ("input_ids", "attention_mask", "labels")
        }

        # Labels go in with the inputs; the loss is read from the model output.
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Track the number of correct predictions and the summed batch loss.
        predictions = torch.argmax(outputs.logits, dim=1)
        correct += (predictions == inputs["labels"]).sum().item()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}, Accuracy: {correct / len(train_dataset):.4f}")


100%|████████████████████████████████████████████████████████████████████████████████| 714/714 [03:02<00:00,  3.91it/s]


Epoch 1/10, Loss: 264.6469, Accuracy: 0.8398


100%|████████████████████████████████████████████████████████████████████████████████| 714/714 [03:07<00:00,  3.80it/s]


Epoch 2/10, Loss: 156.5793, Accuracy: 0.9131


100%|████████████████████████████████████████████████████████████████████████████████| 714/714 [03:00<00:00,  3.95it/s]


Epoch 3/10, Loss: 107.2887, Accuracy: 0.9424


100%|████████████████████████████████████████████████████████████████████████████████| 714/714 [02:56<00:00,  4.05it/s]


Epoch 4/10, Loss: 77.5975, Accuracy: 0.9609


100%|████████████████████████████████████████████████████████████████████████████████| 714/714 [02:57<00:00,  4.03it/s]


Epoch 5/10, Loss: 58.6604, Accuracy: 0.9712


100%|████████████████████████████████████████████████████████████████████████████████| 714/714 [02:56<00:00,  4.05it/s]


Epoch 6/10, Loss: 50.5142, Accuracy: 0.9749


100%|████████████████████████████████████████████████████████████████████████████████| 714/714 [02:55<00:00,  4.07it/s]


Epoch 7/10, Loss: 47.4686, Accuracy: 0.9756


100%|████████████████████████████████████████████████████████████████████████████████| 714/714 [02:55<00:00,  4.07it/s]


Epoch 8/10, Loss: 29.3489, Accuracy: 0.9847


100%|████████████████████████████████████████████████████████████████████████████████| 714/714 [02:55<00:00,  4.06it/s]


Epoch 9/10, Loss: 23.3791, Accuracy: 0.9882


100%|████████████████████████████████████████████████████████████████████████████████| 714/714 [02:55<00:00,  4.07it/s]

Epoch 10/10, Loss: 22.0660, Accuracy: 0.9888





In [5]:
from sklearn.metrics import classification_report

# Evaluation helper
def evaluate(model, dataloader, device=None):
    """Run ``model`` over ``dataloader`` and collect predictions and labels.

    Puts the model in eval mode (it is left in eval mode afterwards) and
    disables gradient tracking for the forward passes.

    Args:
        model: Callable module accepting ``input_ids`` and ``attention_mask``
            keyword tensors and returning an object with a ``logits``
            attribute whose axis 1 indexes the classes.
        dataloader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
        device: Device the batch tensors are moved to. When ``None`` the
            device of the model's first parameter is used (CPU if the model
            has no parameters), so the function does not depend on a
            notebook-level ``device`` variable.

    Returns:
        ``(all_preds, all_labels)``: two flat lists of NumPy integer scalars,
        the argmax class per example and the corresponding label.
    """
    if device is None:
        # Follow the model so the inputs land wherever its weights live.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            # Labels stay out of the forward pass: only the logits are needed.
            inputs = {
                "input_ids": batch["input_ids"].to(device),
                "attention_mask": batch["attention_mask"].to(device),
            }
            outputs = model(**inputs)
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            labels = batch["labels"].cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels)
    return all_preds, all_labels

# Score the validation split and print the per-class precision/recall/F1 table.
val_preds, val_labels = evaluate(dataloader=val_loader, model=model)
print(
    classification_report(
        y_true=val_labels,
        y_pred=val_preds,
        target_names=["Fake", "Real"],
    )
)


              precision    recall  f1-score   support

        Fake       0.91      0.97      0.94      1854
        Real       0.93      0.83      0.88      1002

    accuracy                           0.92      2856
   macro avg       0.92      0.90      0.91      2856
weighted avg       0.92      0.92      0.92      2856

