# 1) Installing necessary libraries

In [1]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu

Looking in indexes: https://download.pytorch.org/whl/nightly/cpu
[31mERROR: Could not find a version that satisfies the requirement torchvision (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for torchvision[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
pip install transformers pandas scikit-learn flask streamlit sentencepiece shap

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
import re
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [4]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS backend for Apple Silicon GPU (M1/M2)")
else:
    device = torch.device("cpu")
    print("Using CPU only")

Using MPS backend for Apple Silicon GPU (M1/M2)


# 2) Data Pre-processing

In [15]:
# Load the datasets
claim_real = pd.read_csv('ClaimRealCOVID-19.csv')
news_fake = pd.read_csv('NewsFakeCOVID-19.csv')
news_real = pd.read_csv('NewsRealCOVID-19.csv')

# Add labels
claim_real['label'] = 1
news_real['label'] = 1
news_fake['label'] = 0

# Select relevant columns
claim_real_selected = claim_real[['title', 'label']]
news_real_selected = news_real[['title', 'content', 'label']]
news_fake_selected = news_fake[['title', 'content', 'label']]

# Fill missing content
news_fake_selected['content'] = news_fake_selected['content'].fillna(news_fake_selected['title'])
news_real_selected['content'] = news_real_selected['content'].fillna(news_real_selected['title'])

# Combine all datasets
combined_df = pd.concat([claim_real_selected, news_real_selected, news_fake_selected], ignore_index=True)
combined_df['content'] = combined_df['content'].fillna('')
combined_df['content'] = combined_df['content'].astype(str)

# Clean Text Function
def clean_text(text):
    if not isinstance(text, str):
        text = str(text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text

# Apply cleaning function
combined_df['cleaned_content'] = combined_df['content'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_fake_selected['content'] = news_fake_selected['content'].fillna(news_fake_selected['title'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_real_selected['content'] = news_real_selected['content'].fillna(news_real_selected['title'])


In [16]:
# Separate real and fake news
real_news = combined_df[combined_df['label'] == 1]
fake_news = combined_df[combined_df['label'] == 0]

# Upsample fake news to match real news count
fake_news_upsampled = resample(fake_news, replace=True, n_samples=len(real_news), random_state=42)

# Combine real and upsampled fake news
balanced_df = pd.concat([real_news, fake_news_upsampled], ignore_index=True)
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save balanced dataset
balanced_df.to_csv('balanced_dataset.csv', index=False)

# Reload dataset
combined_df = pd.read_csv('balanced_dataset.csv')
combined_df['cleaned_content'] = combined_df['cleaned_content'].astype(str)

In [17]:
# Load BERT tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_and_pad(texts, tokenizer, max_length=128):
    encoding = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )
    return encoding['input_ids'], encoding['attention_mask']

def prepare_dataloader(input_ids, attention_mask, labels, batch_size=4):
    dataset = TensorDataset(input_ids, attention_mask, torch.tensor(labels.values))
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)




# 3) Training data

In [18]:
# Limit training to 500 samples for testing
X = combined_df['cleaned_content'][:500]
y = combined_df['label'][:500]

# Split dataset for training and validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data to list
X_train_list = X_train.tolist()
X_val_list = X_val.tolist()

# Tokenize
X_train_ids, X_train_mask = tokenize_and_pad(X_train_list, bert_tokenizer)
X_val_ids, X_val_mask = tokenize_and_pad(X_val_list, bert_tokenizer)

# Prepare DataLoader
train_dataloader = prepare_dataloader(X_train_ids, X_train_mask, y_train)
val_dataloader = prepare_dataloader(X_val_ids, X_val_mask, y_val)

In [7]:
def train_model(model, train_dataloader, epochs=1, learning_rate=2e-5):
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            inputs, masks, labels = [b.to(device) for b in batch]

            optimizer.zero_grad()
            outputs = model(input_ids=inputs, attention_mask=masks, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f'Epoch {epoch + 1}, Training Loss: {avg_train_loss}')
    
    return model


### Training BERT Model

In [8]:
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
trained_bert = train_model(bert_model, train_dataloader, epochs=1)
torch.save(trained_bert.state_dict(), 'trained_bert_model.pth')
del bert_model
torch.mps.empty_cache()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Training Loss: 0.2979233482480049


### Training ALBERT Model

In [19]:
from transformers import AlbertForSequenceClassification, RobertaForSequenceClassification


# Load ALBERT model
albert_model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)
albert_model.to(device)

# Train the model
trained_albert = train_model(albert_model, train_dataloader, epochs=1)
torch.save(trained_albert.state_dict(), 'trained_albert_model.pth')
del albert_model
torch.mps.empty_cache()

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Training Loss: 0.47211181934922936


### Training RoBERTa Model

In [20]:
# Load RoBERTa model
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
roberta_model.to(device)

# Train the model
trained_roberta = train_model(roberta_model, train_dataloader, epochs=1)
torch.save(trained_roberta.state_dict(), 'trained_roberta_model.pth')
del roberta_model
torch.mps.empty_cache()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Training Loss: 0.4103435812331736


# 4) Evaluation

In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from transformers import BertForSequenceClassification, AlbertForSequenceClassification, RobertaForSequenceClassification


def evaluate_model(model, val_dataloader):
    model.to(device)
    model.eval()
    
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in val_dataloader:
            inputs, masks, labels = [b.to(device) for b in batch]
            
            outputs = model(input_ids=inputs, attention_mask=masks)
            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    # Classification report
    report = classification_report(true_labels, predictions, target_names=["Fake", "Real"])
    print(report)
    
    # Return scores for comparison
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    
    return accuracy, precision, recall, f1


In [11]:
# Load the trained BERT model
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
bert_model.load_state_dict(torch.load('trained_bert_model.pth'))

# Evaluate BERT model
bert_scores = evaluate_model(bert_model, val_dataloader)
del bert_model
torch.mps.empty_cache()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


              precision    recall  f1-score   support

        Fake       0.96      1.00      0.98        48
        Real       1.00      0.96      0.98        52

    accuracy                           0.98       100
   macro avg       0.98      0.98      0.98       100
weighted avg       0.98      0.98      0.98       100



In [12]:
# Load the trained ALBERT model
albert_model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)
albert_model.load_state_dict(torch.load('trained_albert_model.pth'))

# Evaluate ALBERT model
albert_scores = evaluate_model(albert_model, val_dataloader)
del albert_model
torch.mps.empty_cache()


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


              precision    recall  f1-score   support

        Fake       0.71      1.00      0.83        48
        Real       1.00      0.62      0.76        52

    accuracy                           0.80       100
   macro avg       0.85      0.81      0.79       100
weighted avg       0.86      0.80      0.79       100



In [13]:
# Load the trained RoBERTa model
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
roberta_model.load_state_dict(torch.load('trained_roberta_model.pth'))

# Evaluate RoBERTa model
roberta_scores = evaluate_model(roberta_model, val_dataloader)
del roberta_model
torch.mps.empty_cache()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


              precision    recall  f1-score   support

        Fake       0.92      1.00      0.96        48
        Real       1.00      0.92      0.96        52

    accuracy                           0.96       100
   macro avg       0.96      0.96      0.96       100
weighted avg       0.96      0.96      0.96       100



In [14]:
# Combine scores for comparison
import pandas as pd

model_comparison = pd.DataFrame({
    "Model": ["BERT", "ALBERT", "RoBERTa"],
    "Accuracy": [bert_scores[0], albert_scores[0], roberta_scores[0]],
    "Precision": [bert_scores[1], albert_scores[1], roberta_scores[1]],
    "Recall": [bert_scores[2], albert_scores[2], roberta_scores[2]],
    "F1 Score": [bert_scores[3], albert_scores[3], roberta_scores[3]]
})

print(model_comparison)


     Model  Accuracy  Precision    Recall  F1 Score
0     BERT      0.98        1.0  0.961538  0.980392
1   ALBERT      0.80        1.0  0.615385  0.761905
2  RoBERTa      0.96        1.0  0.923077  0.960000
