In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np
from collections import Counter
import spacy
nlp = spacy.load("en_core_web_sm")
from nltk.tokenize import word_tokenize


In [26]:
# 设置参数
num_filters = 100
filter_sizes = [3, 4, 5]
num_classes = 2
embedding_size = 128
sequence_length = 500
batch_size = 64
learning_rate = 0.0002
num_epochs = 10
vocab_size = 5000

In [27]:
# 加载数据
def load_data(file_path):
    df = pd.read_csv(file_path,usecols=['content','label'])
    return df

# 分词和生成词汇表
def tokenize_and_build_vocab(df):
    tokens = []
    for text in df['content']:
        tokens.extend([token.text for token in nlp(text)])

    word_counter = Counter(tokens)
    vocab = sorted(word_counter, key=word_counter.get, reverse=True)[:vocab_size-2]
    vocab.append('<UNK>')
    vocab.append('<PAD>')

    word_index = {word: i for i, word in enumerate(vocab)}
    return word_index

In [28]:
# 创建数据集
class NewsDataset(Dataset):
    def __init__(self, texts, word_index, labels=None):
        self.texts = texts
        self.labels = labels
        self.word_index = word_index

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        tokens = word_tokenize(text)
        sequence = [self.word_index.get(token, self.word_index['<UNK>']) for token in tokens]
        sequence = sequence[:sequence_length] if len(sequence) > sequence_length else sequence + [self.word_index['<PAD>']] * (sequence_length - len(sequence))
        if self.labels is not None:
            return np.array(sequence), self.labels[idx]
        else:
            return np.array(sequence)

In [29]:
# 创建TextCNN模型
class TextCNN(nn.Module):
    def __init__(self):
        super(TextCNN, self).__init__()
        self.num_filters_total = num_filters * len(filter_sizes)
        self.W = nn.Embedding(vocab_size, embedding_size)
        self.Weight = nn.Linear(self.num_filters_total, num_classes, bias=False)
        self.Bias = nn.Parameter(torch.ones([num_classes]))
        self.filter_list = nn.ModuleList([nn.Conv2d(1, num_filters, (size, embedding_size)) for size in filter_sizes])

    def forward(self, X):
        embedded_chars = self.W(X)
        embedded_chars = embedded_chars.unsqueeze(1)

        pooled_outputs = []
        for i, conv in enumerate(self.filter_list):
            h = F.relu(conv(embedded_chars))
            mp = nn.MaxPool2d((sequence_length - filter_sizes[i] + 1, 1))
            pooled = mp(h).permute(0, 3, 2, 1)
            pooled_outputs.append(pooled)

        h_pool = torch.cat(pooled_outputs, len(filter_sizes))
        h_pool_flat = torch.reshape(h_pool, [-1, self.num_filters_total])
        model = self.Weight(h_pool_flat) + self.Bias
        return model

In [30]:
# 加载数据
train_df = load_data("data_process/clean_data.csv")

# 标签编码
le = LabelEncoder()
train_df['label'] = le.fit_transform(train_df['label'])

In [31]:
# 分词和生成词汇表
word_index = tokenize_and_build_vocab(train_df)

In [32]:
# 数据集分割
train_texts, valid_texts, train_labels, valid_labels = train_test_split(train_df['content'], train_df['label'], test_size=0.2, random_state=42)

# 重置索引
train_texts = train_texts.reset_index(drop=True)
valid_texts = valid_texts.reset_index(drop=True)
train_labels = train_labels.reset_index(drop=True)
valid_labels = valid_labels.reset_index(drop=True)

# 创建数据集
train_dataset = NewsDataset(train_texts, word_index, train_labels)
valid_dataset = NewsDataset(valid_texts, word_index, valid_labels)

# 创建数据加载器
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

In [33]:
# 创建模型
model = TextCNN()
if torch.cuda.is_available():
    model.cuda()

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [34]:
# 训练模型
for epoch in range(num_epochs):
    for i, (texts, labels) in enumerate(train_loader):
        if torch.cuda.is_available():
            texts = texts.long().cuda()
            labels = labels.long().cuda()

        outputs = model(texts)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # 每个epoch后在验证集上进行评估
    correct = 0
    total = 0
    for texts, labels in valid_loader:
        if torch.cuda.is_available():
            texts = texts.long().cuda()
            labels = labels.long().cuda()

        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f'Epoch {epoch+1}/{num_epochs}, Accuracy on validation set: {correct/total}')

Epoch 1/10, Accuracy on validation set: 0.8096646942800789
Epoch 2/10, Accuracy on validation set: 0.8323471400394478
Epoch 3/10, Accuracy on validation set: 0.8530571992110454
Epoch 4/10, Accuracy on validation set: 0.8826429980276134
Epoch 5/10, Accuracy on validation set: 0.8875739644970414
Epoch 6/10, Accuracy on validation set: 0.8925049309664694
Epoch 7/10, Accuracy on validation set: 0.9003944773175543
Epoch 8/10, Accuracy on validation set: 0.9023668639053254
Epoch 9/10, Accuracy on validation set: 0.9023668639053254
Epoch 10/10, Accuracy on validation set: 0.9043392504930966


In [35]:
# 加载测试集
# test_df = load_data("/home/mw/input/news_data58668156/test_news.csv")
# test_dataset = NewsDataset(test_df['text'], word_index)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [36]:
# 在测试数据上进行预测
# predictions = []
# model.eval()
# with torch.no_grad():
#     for texts in test_loader:
#         if torch.cuda.is_available():
#             texts = texts.long().cuda()
#         outputs = model(texts)
#         _, predicted = torch.max(outputs.data, 1)
#         predictions += predicted.cpu().numpy().tolist()
#
# # 将预测结果转换为所需的格式
# predictions = ['FAKE' if prediction == 0 else 'REAL' for prediction in predictions]
#
# # 创建结果数据框并保存为csv文件
# result_df = pd.DataFrame({
#     'id': [str(i) for i in range(len(predictions))],
#     'answer': predictions
# })
# print(result_df.head(2))
# result_df.to_csv('answer6.csv', index=False, encoding='utf-8-sig')