# Fake News Detection

## 1. Data Preprocessing

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizerFast

In [31]:
# Read the two ISOT CSV files and tag every row with its class
# (0 = fake, 1 = true) in a new "label" column.
fake_df = pd.read_csv("ISOT/Fake.csv").assign(label=0)
true_df = pd.read_csv("ISOT/True.csv").assign(label=1)

# Report (rows, columns) for each source file.
for name, frame in (("Fake", fake_df), ("True", true_df)):
    print(f"{name} shape:", frame.shape)

Fake shape: (23481, 5)
True shape: (21417, 5)


In [32]:
# Stack both classes into one frame, shuffle the rows reproducibly
# (fixed random_state) and renumber the index from 0.
df = (
    pd.concat([fake_df, true_df], axis=0, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(df.head())

                                               title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                                text       subject  \
0  21st Century Wire says Ben Stein, reputable pr...       US_News   
1  WASHINGTON (Reuters) - U.S. President Donald T...  politicsNews   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...  politicsNews   
3  On Monday, Donald Trump once again embarrassed...          News   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...  politicsNews   

                  date  label  
0    February 13, 2017      0  
1       April 5, 2017       1  
2  September 27, 2017       1  
3         May 22, 2017      0  
4       June 24, 2016       1  


In [33]:
# Discard rows lacking a title or body, then build the model input:
# "<title> <text>" with every whitespace run (including newlines) collapsed
# to a single space and the ends trimmed.
df = df.dropna(subset=["title", "text"]).assign(
    content=lambda frame: (frame["title"] + " " + frame["text"])
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

print(df["content"].iloc[0][:500])  # Preview first 500 chars

Ben Stein Calls Out 9th Circuit Court: Committed a ‘Coup d’état’ Against the Constitution 21st Century Wire says Ben Stein, reputable professor from, Pepperdine University (also of some Hollywood fame appearing in TV shows and films such as Ferris Bueller s Day Off) made some provocative statements on Judge Jeanine Pirro s show recently. While discussing the halt that was imposed on President Trump s Executive Order on travel. Stein referred to the judgement by the 9th Circuit Court in Washingto


In [34]:
# Step 1: hold out 20% of the rows, keeping the label ratio (stratified).
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["content"], df["label"],
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

# Step 2: halve the hold-out into validation and test (10% of the data each),
# again stratified on the label.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=42,
)

for split_name, split in (
    ("Train", train_texts),
    ("Validation", val_texts),
    ("Test", test_texts),
):
    print(f"{split_name} size:", len(split))

Train size: 35918
Validation size: 4490
Test size: 4490


In [35]:
# Load RoBERTa tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")


def encode_texts(texts, max_length=512):
    """Tokenize a collection of documents into padded PyTorch tensors.

    Replaces three copy-pasted tokenizer calls so that every split is
    guaranteed to be encoded with the same settings.

    Args:
        texts: Iterable of strings (e.g. a pandas Series of article text).
        max_length: Maximum number of tokens kept per document; longer
            documents are truncated.

    Returns:
        The tokenizer output with ``return_tensors="pt"``. ``padding=True``
        pads every document to the longest one in ``texts`` (after
        truncation), so all rows of a split share one width.
    """
    return tokenizer(
        list(texts),
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    )


# Tokenize datasets
train_encodings = encode_texts(train_texts)
val_encodings = encode_texts(val_texts)
test_encodings = encode_texts(test_texts)

print("Tokenized sample:", train_encodings["input_ids"][0][:50])


Tokenized sample: tensor([    0, 39954,  3587,    13,  3738,    11,  1625,   412,    71,  8969,
          256,  6725, 13701,  8100,    36,  1251,    43,   111,    20,  1707,
           13,  1680,     9,    41,  8969,    14,   848,  2213,    11,  1625,
          412,  1249,    15,   307,    25,  3906,  1138,  4609,     5,   809,
            9,     5,    94,   621,   684,     7,    28,  1716, 11352,     5])


## 2. PyTorch Dataset class

In [36]:
import torch

class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with integer class labels.

    Args:
        encodings: Mapping from field name (e.g. ``"input_ids"``) to an
            indexable object whose first dimension is the example index.
        labels: Integer class labels, one per example, in the same order as
            ``encodings``. Any sequence is accepted (list, tuple, NumPy array,
            pandas Series); a Series is read positionally, its index is ignored.

    Raises:
        ValueError: If an encoding field does not have exactly one row per label.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        # Convert once up front instead of building a new tensor on every
        # __getitem__ call; this also removes the dependency on pandas `.iloc`.
        self._label_tensor = torch.tensor(list(labels), dtype=torch.long)

        # Fail fast on misaligned inputs rather than hitting an IndexError
        # (or silently ignoring trailing rows) in the middle of training.
        expected = len(self._label_tensor)
        for key, val in self.encodings.items():
            if len(val) != expected:
                raise ValueError(
                    f"encodings[{key!r}] has {len(val)} rows but {expected} labels were given"
                )

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of encoding fields plus a ``'labels'`` scalar tensor."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self._label_tensor[idx]
        return item


In [37]:
# Wrap each split's encodings and labels in a NewsDataset.
train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset = NewsDataset(val_encodings, val_labels)
test_dataset = NewsDataset(test_encodings, test_labels)

for split_name, split_ds in (
    ("Train", train_dataset),
    ("Validation", val_dataset),
    ("Test", test_dataset),
):
    print(f"{split_name} dataset length:", len(split_ds))


Train dataset length: 35918
Validation dataset length: 4490
Test dataset length: 4490


In [38]:
from torch.utils.data import DataLoader

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader, test_loader = (
    DataLoader(split_ds, batch_size=16) for split_ds in (val_dataset, test_dataset)
)

# Example: fetch one batch
batch = next(iter(train_loader))
print(*(batch[field].shape for field in ("input_ids", "attention_mask", "labels")))


torch.Size([16, 512]) torch.Size([16, 512]) torch.Size([16])


## 3. Model Training

In [39]:
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments
import torch

In [40]:
# Start from the "roberta-base" checkpoint with a two-class output
# (label ids 0 and 1, matching the "label" column built above).
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
pip install -U "transformers>=4.40" "evaluate>=0.4.0" accelerate datasets

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [44]:
from transformers import TrainingArguments

# Hyper-parameters for fine-tuning.
training_args = TrainingArguments(
    output_dir="./results",            # checkpoints are written here
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,                  # log training loss every 50 steps
    save_steps=500,                    # write a checkpoint every 500 steps
    # `eval_steps` is ignored unless the evaluation strategy is "steps".
    # Without this line no validation metrics are computed during training.
    # NOTE: on transformers releases older than 4.41 this argument is spelled
    # `evaluation_strategy`.
    eval_strategy="steps",
    eval_steps=500,                    # run validation every 500 steps
)


In [45]:
import evaluate

# Load the four metric objects used by compute_metrics
# (same load order as before: accuracy, f1, precision, recall).
accuracy, f1, precision, recall = [
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
]

def compute_metrics(eval_pred):
    """Turn raw evaluation output into a dict of scalar scores.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dict with keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. Precision, recall and F1 use ``average="weighted"``.
    """
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), dim=-1)

    scores = {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
    }
    # The remaining metrics share the same call shape and averaging mode.
    for key, metric in (("precision", precision), ("recall", recall), ("f1", f1)):
        scores[key] = metric.compute(
            predictions=preds, references=labels, average="weighted"
        )[key]
    return scores


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [46]:
# Bundle the model, hyper-parameters, tokenizer, metric callback and the
# train/validation splits into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


  trainer = Trainer(


In [47]:
# Fine-tune the model with the Trainer configured above (long-running).
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt after
# the step-500 log line, so the metrics and saved model further down in the
# recorded notebook come from a partially trained model. Re-run to completion
# before relying on them.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,0.2025,,,,,
100,0.0267,,,,,
150,0.0233,,,,,
200,0.0099,,,,,
250,0.0002,,,,,
300,0.0002,,,,,
350,0.0001,,,,,
400,0.0088,,,,,
450,0.0056,,,,,
500,0.0001,,,,,


KeyboardInterrupt: 

In [54]:
# Score the current model weights on the held-out test split.
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)

{'eval_loss': 0.003965416457504034, 'eval_accuracy': 0.999554565701559, 'eval_precision': 0.9995549812186285, 'eval_recall': 0.999554565701559, 'eval_f1': 0.9995545747349035}


## 4. Model Saving and Testing

In [55]:
# Write the model and the tokenizer files into the same directory so both can
# be restored from one path.
model_dir = "./fake_news_roberta"
trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)

('./fake_news_roberta\\tokenizer_config.json',
 './fake_news_roberta\\special_tokens_map.json',
 './fake_news_roberta\\vocab.json',
 './fake_news_roberta\\merges.txt',
 './fake_news_roberta\\added_tokens.json',
 './fake_news_roberta\\tokenizer.json')

In [56]:
from transformers import RobertaForSequenceClassification, RobertaTokenizerFast

# Restore the classifier and its tokenizer from the directory saved above;
# this rebinds the global `model` and `tokenizer` names.
saved_dir = "./fake_news_roberta"
tokenizer = RobertaTokenizerFast.from_pretrained(saved_dir)
model = RobertaForSequenceClassification.from_pretrained(saved_dir)


In [58]:
def predict(text):
    """Classify a single piece of text with the global ``model``/``tokenizer``.

    Args:
        text: The headline or article text to classify (one string).

    Returns:
        ``"FAKE"`` when the highest-scoring class index is 0, otherwise
        ``"TRUE"``.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Keep inputs on the same device as the weights; otherwise the forward
    # pass raises as soon as the model lives on a GPU.
    inputs = inputs.to(model.device)

    # Disable dropout so repeated calls give the same answer.
    model.eval()
    # Inference only: skip autograd bookkeeping to save time and memory.
    with torch.no_grad():
        logits = model(**inputs).logits

    preds = torch.argmax(logits, dim=-1).item()
    return "FAKE" if preds == 0 else "TRUE"

# Example
sample_headlines = [
    "Breaking news: Aliens landed in New York City!",
    "The government announced a new economic policy today.",
    "President signs new healthcare reform bill into law.",
    "Shocking! Eating chocolate cures all diseases instantly.",
]
for headline in sample_headlines:
    print(predict(headline))

FAKE
TRUE
TRUE
FAKE
