In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, classification_report

In [36]:
import shap
import torch
import torch.nn.functional as F

In [40]:
import numpy as np

In [10]:
from sklearn.model_selection import train_test_split

In [6]:
import pandas as pd

In [7]:
df = pd.read_csv("clean_data_merged.csv")

In [9]:
df.head()

Unnamed: 0,title,text,subject,date,label,content,clean_content
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1,"As U.S. budget fight looms, Republicans flip t...",u.s. budget fight loom republicans flip fiscal...
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1,U.S. military to accept transgender recruits o...,u.s. military accept transgender recruit monda...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1,Senior U.S. Republican senator: 'Let Mr. Muell...,senior u.s. republican senator let mr. mueller...
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1,FBI Russia probe helped by Australian diplomat...,fbi russia probe help australian diplomat tip ...
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1,Trump wants Postal Service to charge 'much mor...,trump want postal service charge amazon shipme...


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
MAX_LEN = 256



In [5]:
class NewsDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = list(texts)
        self.labels = list(labels)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

In [11]:
texts = df["clean_content"].astype(str)
labels = df["label"]

In [12]:
df = df.drop_duplicates(subset="clean_content").reset_index(drop=True)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels,
    test_size=0.2,
    random_state=42,
    stratify=labels
)

In [14]:
train_dataset = NewsDataset(X_train, y_train)
test_dataset  = NewsDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=16)

In [15]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2
).to(device)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 199/199 [00:01<00:00, 130.48it/s, Materializing param=bert.pooler.dense.weight]                               
[1mBertForSequenceClassification LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNE

In [17]:
optimizer = AdamW(model.parameters(), lr=2e-5)

In [18]:
def train_epoch(model, loader):
    model.train()
    total_loss = 0

    for batch in loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(loader)

In [19]:
EPOCHS = 3

for epoch in range(EPOCHS):
    train_loss = train_epoch(model, train_loader)
    print(f"\nEpoch {epoch+1}")
    print(f"Train Loss: {train_loss:.4f}")


Epoch 1
Train Loss: 0.0243

Epoch 2
Train Loss: 0.0044

Epoch 3
Train Loss: 0.0034


In [20]:
def evaluate(model, loader):
    model.eval()
    preds_all = []
    labels_all = []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            preds = torch.argmax(outputs.logits, dim=1)

            preds_all.extend(preds.cpu().numpy())
            labels_all.extend(labels.cpu().numpy())

    acc = accuracy_score(labels_all, preds_all)
    return acc, labels_all, preds_all

In [21]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)

def evaluate_full(model, loader):
    model.eval()

    preds_all = []
    labels_all = []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            preds = torch.argmax(outputs.logits, dim=1)

            preds_all.extend(preds.cpu().numpy())
            labels_all.extend(labels.cpu().numpy())

    # ---- metrics ----
    acc = accuracy_score(labels_all, preds_all)
    precision = precision_score(labels_all, preds_all)
    recall = recall_score(labels_all, preds_all)
    f1 = f1_score(labels_all, preds_all)
    cm = confusion_matrix(labels_all, preds_all)

    print("\n===== Evaluation Metrics =====")
    print("Accuracy :", acc)
    print("Precision:", precision)
    print("Recall   :", recall)
    print("F1 Score :", f1)

    print("\nConfusion Matrix:")
    print(cm)

    print("\nClassification Report:")
    print(classification_report(labels_all, preds_all))

    return acc, precision, recall, f1, cm

In [22]:
evaluate_full(model, test_loader)


===== Evaluation Metrics =====
Accuracy : 0.9989977728285078
Precision: 0.999766191255553
Recall   : 0.9981325863678805
F1 Score : 0.9989487209438149

Confusion Matrix:
[[4695    1]
 [   8 4276]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4696
           1       1.00      1.00      1.00      4284

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



(0.9989977728285078,
 0.999766191255553,
 0.9981325863678805,
 0.9989487209438149,
 array([[4695,    1],
        [   8, 4276]]))

In [23]:
def predict(text, model, tokenizer, max_len=256):
    model.eval()

    encoding = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=max_len,
        return_tensors="pt"
    )

    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        logits = outputs.logits
        pred = torch.argmax(logits, dim=1).item()

    return pred

In [34]:
text = "Trump will be the new president of usa"

prediction = predict(text, model, tokenizer)

#print("Prediction:", prediction)

if(prediction == 1):
     print("REAL")
else: 
    print("FAKE")

FAKE


In [30]:
model.save_pretrained("bert_fake_news_model")
tokenizer.save_pretrained("bert_fake_news_model")

Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.31it/s]


('bert_fake_news_model\\tokenizer_config.json',
 'bert_fake_news_model\\tokenizer.json')

In [41]:
def predict_proba(texts):
    # ---- FIX INPUT FORMAT ----
    if isinstance(texts, str):
        texts = [texts]

    # convert numpy array → list
    if isinstance(texts, np.ndarray):
        texts = texts.tolist()

    # ensure everything is string
    texts = [str(t) for t in texts]

    # ---- TOKENIZE ----
    enc = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=256,
        return_tensors="pt"
    )

    enc = {k: v.to(device) for k, v in enc.items()}

    # ---- MODEL ----
    with torch.no_grad():
        outputs = model(**enc)
        probs = F.softmax(outputs.logits, dim=1)

    return probs.cpu().numpy()

In [42]:
explainer = shap.Explainer(predict_proba, tokenizer)

In [43]:
text = "Scientists claim new miracle drug cures all diseases overnight."

shap_values = explainer([text])

PartitionExplainer explainer: 2it [00:12, 12.30s/it]               


In [45]:
print("Class 0 (FAKE) explanation")
shap.plots.text(shap_values[0, :, 0])

print("Class 1 (REAL) explanation")
shap.plots.text(shap_values[0, :, 1])

Class 0 (FAKE) explanation


Class 1 (REAL) explanation
