In [2]:
import pandas as pd
import torch.nn as nn
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import spacy
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertModel
import torch
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
nlp = spacy.load("en_core_web_sm")

In [5]:
stop_words = set(stopwords.words('english'))

In [6]:
def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in string.punctuation and word not in stop_words]

    doc = nlp(" ".join(tokens))
    lemmatized = [token.lemma_ for token in doc]
    
    return " ".join(lemmatized)

In [9]:
fake_news = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/Fake.csv")
true_news = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/True.csv")

In [11]:
fake_news['label'] = 0
true_news['label'] = 1

data = pd.concat([fake_news, true_news], ignore_index=True)
data

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


In [12]:
data['title_processed'] = data['title'].apply(preprocess_text)
data['text_processed'] = data['text'].apply(preprocess_text)

In [13]:
data.to_csv("processed_news.csv", index=False)
print("Предобработка завершена! Данные сохранены в processed_news.csv")

Предобработка завершена! Данные сохранены в processed_news.csv


## TF-IDF

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(data['text_processed'])
y = data['label']

print("TF-IDF матрица:", X_tfidf.shape)

TF-IDF матрица: (44898, 5000)


## Word2Vec

In [19]:
tokenized_texts = data['text_processed'].apply(word_tokenize)
w2v_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)

In [20]:
def get_text_vector(text, model):
    words = word_tokenize(text)
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(100)

In [21]:
X_w2v = np.array([get_text_vector(text, w2v_model) for text in data['text_processed']])
y = data['label']

print("Word2Vec матрица:", X_w2v.shape)

Word2Vec матрица: (44898, 100)


## BERT

In [24]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Используется устройство: {device}")

Используется устройство: cuda


In [25]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [26]:
def get_bert_embedding(text, max_length=128):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length).to(device)
    
    with torch.no_grad():
        outputs = model(**inputs)

    return outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()

In [27]:
batch_size = 32
num_samples = min(1000, len(data))
X_bert = []

for i in range(0, num_samples, batch_size):
    batch_texts = data['text_processed'][i:i+batch_size].tolist()
    batch_embeddings = [get_bert_embedding(text) for text in batch_texts]
    X_bert.extend(batch_embeddings)

X_bert = np.array(X_bert)
y_bert = data['label'][:num_samples].values

print("BERT матрица:", X_bert.shape)

BERT матрица: (1024, 768)


# Обучение моделей

## CatBoost с TF-IDF

In [29]:
X_tfidf = X_tfidf.toarray()

In [32]:
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [33]:
catboost_tfidf = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, task_type="GPU", verbose=100)
catboost_tfidf.fit(X_train_tfidf, y_train)

0:	learn: 0.4499837	total: 473ms	remaining: 7m 52s
100:	learn: 0.0101904	total: 19.8s	remaining: 2m 56s
200:	learn: 0.0083385	total: 38.4s	remaining: 2m 32s
300:	learn: 0.0073544	total: 56.6s	remaining: 2m 11s
400:	learn: 0.0069738	total: 1m 14s	remaining: 1m 51s
500:	learn: 0.0067665	total: 1m 32s	remaining: 1m 31s
600:	learn: 0.0065982	total: 1m 50s	remaining: 1m 13s
700:	learn: 0.0064461	total: 2m 8s	remaining: 54.8s
800:	learn: 0.0063225	total: 2m 26s	remaining: 36.4s
900:	learn: 0.0061434	total: 2m 44s	remaining: 18.1s
999:	learn: 0.0059341	total: 3m 2s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7ed876a319c0>

In [36]:
y_pred_tfidf = catboost_tfidf.predict(X_test_tfidf)

In [37]:
tfidf_metrics = {
    'Accuracy': accuracy_score(y_test, y_pred_tfidf),
    'F1-score': f1_score(y_test, y_pred_tfidf),
    'Precision': precision_score(y_test, y_pred_tfidf),
    'Recall': recall_score(y_test, y_pred_tfidf)
}

print("TF-IDF + CatBoost метрики:", tfidf_metrics)

TF-IDF + CatBoost метрики: {'Accuracy': 0.9975501113585746, 'F1-score': 0.9974123735591627, 'Precision': 0.9964747356051704, 'Recall': 0.9983517777254532}


## CatBoost с Word2Vec

In [38]:
X_train_w2v, X_test_w2v, y_train, y_test = train_test_split(X_w2v, y, test_size=0.2, random_state=42)

In [39]:
catboost_w2v = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, task_type="GPU", verbose=100)
catboost_w2v.fit(X_train_w2v, y_train)

0:	learn: 0.5850291	total: 75.4ms	remaining: 1m 15s
100:	learn: 0.0992141	total: 3.88s	remaining: 34.5s
200:	learn: 0.0641955	total: 7.57s	remaining: 30.1s
300:	learn: 0.0474638	total: 11.2s	remaining: 26.1s
400:	learn: 0.0372853	total: 14.9s	remaining: 22.3s
500:	learn: 0.0312877	total: 18.7s	remaining: 18.6s
600:	learn: 0.0280393	total: 22s	remaining: 14.6s
700:	learn: 0.0257529	total: 25.2s	remaining: 10.7s
800:	learn: 0.0231448	total: 28.4s	remaining: 7.05s
900:	learn: 0.0212528	total: 31.6s	remaining: 3.47s
999:	learn: 0.0194459	total: 34.8s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7ed876a301f0>

In [40]:
y_pred_w2v = catboost_w2v.predict(X_test_w2v)

In [41]:
w2v_metrics = {
    'Accuracy': accuracy_score(y_test, y_pred_w2v),
    'F1-score': f1_score(y_test, y_pred_w2v),
    'Precision': precision_score(y_test, y_pred_w2v),
    'Recall': recall_score(y_test, y_pred_w2v)
}

print("Word2Vec + CatBoost метрики:", w2v_metrics)

Word2Vec + CatBoost метрики: {'Accuracy': 0.984966592427617, 'F1-score': 0.9840970667923195, 'Precision': 0.9846770391324847, 'Recall': 0.9835177772545326}


## BERT

In [51]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

In [46]:
data_subset = data.sample(5000, random_state=42)  # 5000 примеров
X_texts = data_subset['text_processed'].tolist()
y = torch.tensor(data_subset['label'].values, dtype=torch.long)

In [53]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [47]:
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=self.max_length)
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'label': label
        }

In [48]:
dataset = NewsDataset(X_texts, y, tokenizer)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [49]:
class BertClassifier(nn.Module):
    def __init__(self, bert_model, num_classes=2):
        super(BertClassifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        cls_output = self.dropout(cls_output)
        logits = self.fc(cls_output)
        return logits

In [54]:
model = BertClassifier(bert_model).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

In [55]:
model.train()
for epoch in range(3):  # 3 эпохи
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f"Эпоха {epoch+1}, Loss: {loss.item()}")

Эпоха 1, Loss: 0.0010826856596395373
Эпоха 2, Loss: 0.0006481912569142878
Эпоха 3, Loss: 0.003025618614628911


In [56]:
model.eval()
y_true_bert, y_pred_bert = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask)
        preds = torch.argmax(outputs, dim=1)
        y_true_bert.extend(labels.cpu().numpy())
        y_pred_bert.extend(preds.cpu().numpy())

In [57]:
bert_metrics = {
    'Accuracy': accuracy_score(y_true_bert, y_pred_bert),
    'F1-score': f1_score(y_true_bert, y_pred_bert),
    'Precision': precision_score(y_true_bert, y_pred_bert),
    'Recall': recall_score(y_true_bert, y_pred_bert)
}

print("BERT + Нейросеть метрики:", bert_metrics)

BERT + Нейросеть метрики: {'Accuracy': 0.999, 'F1-score': 0.9989637305699481, 'Precision': 1.0, 'Recall': 0.9979296066252588}


# Результаты

In [58]:
results_df = pd.DataFrame([tfidf_metrics, w2v_metrics, bert_metrics], 
                          index=['TF-IDF + CatBoost', 'Word2Vec + CatBoost', 'BERT + Neural Net'])

print("Сравнение результатов:")
results_df

Сравнение результатов:


Unnamed: 0,Accuracy,F1-score,Precision,Recall
TF-IDF + CatBoost,0.99755,0.997412,0.996475,0.998352
Word2Vec + CatBoost,0.984967,0.984097,0.984677,0.983518
BERT + Neural Net,0.999,0.998964,1.0,0.99793
