In [1]:
import pandas as pd
import torch
from transformers import DistilBertTokenizerFast, DistilBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import xgboost as xgb
import numpy as np
import joblib
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Load your dataset (already processed)
df = pd.read_csv("final_fake_news_dataset.csv")  # replace with actual file if needed
# Only "content" and "label" are consumed below, so drop rows missing either of
# those rather than rows with a gap in any column. Rebinding (instead of
# inplace=True) keeps this cell safe to re-run.
df = df.dropna(subset=["content", "label"])
texts = df["content"].tolist()
labels = df["label"].tolist()

In [5]:

# Load the pretrained DistilBERT tokenizer and encoder from the same checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased")
# Move the encoder's weights onto the selected device.
bert_model = bert_model.to(device)
# Switch to evaluation mode; as the last expression this also displays the module.
bert_model.eval()

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [6]:
# Generate BERT embeddings (CLS token)
def get_bert_embeddings(texts, batch_size=16, max_length=512):
    """Embed each text as the first-token ([CLS]) vector of the last hidden layer.

    Relies on the notebook-level ``tokenizer``, ``bert_model`` and ``device``.
    Gradients are disabled for the whole pass.

    Args:
        texts: Sequence of strings to embed; it is sliced into consecutive
            batches, so it must support ``len()`` and slicing (e.g. a list).
        batch_size: Number of texts per forward pass. Must be positive.
        max_length: Maximum number of tokens per text passed to the
            tokenizer; longer inputs are truncated.

    Returns:
        A 2-D numpy array with one row per input text, in input order.
        Empty input yields an array with zero rows.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    if len(texts) == 0:
        # np.vstack rejects an empty list, so return a well-formed empty matrix.
        # float32 matches the default-precision model loaded in this notebook.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    all_embeddings = []

    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size), desc="Generating BERT Embeddings"):
            batch = texts[start:start + batch_size]
            # Pad to the longest text in the batch and truncate anything longer
            # than max_length, then move the tensors next to the model.
            encoded = tokenizer(
                batch,
                padding=True,
                truncation=True,
                return_tensors='pt',
                max_length=max_length,
            ).to(device)
            outputs = bert_model(**encoded)
            # Position 0 of every sequence is the [CLS] token.
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_embeddings.append(cls_embeddings)

    return np.vstack(all_embeddings)


In [7]:
# Step 1: Embed every article (one row per text), 16 texts per forward pass
embeddings = get_bert_embeddings(texts, batch_size=16)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Generating BERT Embeddings: 100%|██████████████████████████████████████████████████| 1450/1450 [01:05<00:00, 22.06it/s]


In [8]:
# Step 2: Hold out 20% of the samples for testing, stratified on the labels
# and with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

In [9]:
# Step 3: Train XGBoost on the BERT embeddings.
# `use_label_encoder` is no longer passed: the installed XGBoost ignores it and
# emits a "Parameters ... are not used" warning on every fit.
clf = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    eval_metric='logloss',
)
clf.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [10]:
# Step 4: Predict on the held-out split and print per-class precision/recall/F1
y_pred = clf.predict(X_test)
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)



Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.95      0.89      3489
           1       0.75      0.47      0.58      1151

    accuracy                           0.83      4640
   macro avg       0.80      0.71      0.74      4640
weighted avg       0.82      0.83      0.82      4640



In [11]:
# Step 5: Persist the trained classifier to disk with joblib
joblib.dump(value=clf, filename="xgboost_fake_news.pkl")
print("✅ Model saved as xgboost_fake_news.pkl")

✅ Model saved as xgboost_fake_news.pkl
