In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [1]:
#pip install 'accelerate>=0.26.0'
!pip install accelerate



In [1]:
# Cell 1: Install dependencies
!pip install transformers torch pandas scikit-learn



In [2]:
import pandas as pd
import numpy as np
import torch
import time
import os
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Disable WandB to avoid API key prompts
os.environ["WANDB_DISABLED"] = "true"

In [3]:
# Load 100% of the data frac = 1

# CHANGE PATH
#train_df = pd.read_csv('/content/drive/MyDrive/Colab/all_samples/all_train.tsv', sep='\t').sample(frac=1, random_state=42)
#test_df = pd.read_csv('/content/drive/MyDrive/Colab/all_samples/all_test_public.tsv', sep='\t').sample(frac=1, random_state=42)
train_df = pd.read_csv('all_train.tsv', sep='\t').sample(frac=1, random_state=42)
test_df = pd.read_csv('all_test_public.tsv', sep='\t').sample(frac=1, random_state=42)

# Preprocess text and labels
train_df['combined_text'] = train_df['clean_title'].fillna(train_df['title'])
test_df['combined_text'] = test_df['clean_title'].fillna(test_df['title'])
train_df['label'] = train_df['2_way_label']
test_df['label'] = test_df['2_way_label']

# Remove NaN values
train_df = train_df.dropna(subset=['combined_text'])
test_df = test_df.dropna(subset=['combined_text'])

print(f"Training samples: {len(train_df)}, Test samples: {len(test_df)}")
print("\nSample data:")
print(train_df[['combined_text', 'label']].head())

Training samples: 804378, Test samples: 84654

Sample data:
                                            combined_text  label
681297              woman on a swing on top of a mountain      1
436392  vegan denied passport because she was being an...      1
797451  sacrificing jessica to the almighty kingfisher...      0
61792                                  this map of europe      1
19909                              rancor snacking on dog      0


In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(texts):
    return tokenizer(
        texts.tolist(),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

In [5]:
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [7]:
# Hyperparameters
BATCH_SIZE = 32
NUM_EPOCHS = 5
MAX_LENGTH = 128
EVAL_STEPS = 500
KFOLDS = 2

# Initialize KFold
kfold = KFold(n_splits=KFOLDS, shuffle=True, random_state=42)

# Timer
start_time = time.time()

# Metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [8]:
# Initialize lists to store predictions across all folds
all_true = []
all_preds = []

for fold, (train_idx, val_idx) in enumerate(kfold.split(train_df)):
    print(f"\n=== Fold {fold + 1}/{KFOLDS} ===")

    # Split data
    train_data = train_df.iloc[train_idx]
    val_data = train_df.iloc[val_idx]

    # Tokenize
    train_encodings = tokenize_data(train_data['combined_text'])
    val_encodings = tokenize_data(val_data['combined_text'])

    # Create datasets
    train_dataset = FakeNewsDataset(train_encodings, torch.tensor(train_data['label'].tolist()))
    val_dataset = FakeNewsDataset(val_encodings, torch.tensor(val_data['label'].tolist()))

    # Initialize model
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.to(device)

    # Configure training
    training_args = TrainingArguments(
        output_dir=f'./results_fold_{fold}',
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        logging_dir=f'./logs_fold_{fold}',
        logging_steps=100,
        eval_strategy='steps',
        eval_steps=EVAL_STEPS,
        save_steps=EVAL_STEPS,
        load_best_model_at_end=True,
        metric_for_best_model='f1'
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    # Train
    trainer.train()

    # Evaluate
    val_outputs = trainer.predict(val_dataset)
    val_preds = np.argmax(val_outputs.predictions, axis=1)

    # Store predictions
    all_true.extend(val_data['label'].tolist())
    all_preds.extend(val_preds)

    # Print fold results
    print(f"\nFold {fold + 1} Metrics:")
    metrics = compute_metrics(val_outputs)
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1: {metrics['f1']:.4f}")

print("\nCross-validation completed!")


=== Fold 1/2 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.3571,0.348284,0.852778,0.853043,0.850163,0.855943
1000,0.329,0.33131,0.860993,0.864253,0.843172,0.886415
1500,0.3293,0.322112,0.864153,0.859376,0.889183,0.831503
2000,0.323,0.309149,0.869241,0.867228,0.879345,0.85544
2500,0.3158,0.30502,0.87205,0.869896,0.883341,0.856855
3000,0.3131,0.298038,0.87571,0.877307,0.864838,0.890141
3500,0.2317,0.328843,0.875121,0.876844,0.863574,0.890529
4000,0.2359,0.325576,0.876277,0.877024,0.870394,0.883755
4500,0.2352,0.31645,0.872533,0.876333,0.849688,0.904704
5000,0.2307,0.30617,0.87764,0.87643,0.883748,0.869232





Fold 1 Metrics:
Accuracy: 0.8767
Precision: 0.8599
Recall: 0.8996
F1: 0.8793

=== Fold 2/2 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.3662,0.343878,0.852629,0.849853,0.859791,0.840142
1000,0.3335,0.332372,0.858835,0.852954,0.883177,0.82473
1500,0.3175,0.320003,0.865553,0.861369,0.882332,0.841379
2000,0.3121,0.314166,0.870837,0.872369,0.856171,0.889191
2500,0.3155,0.317499,0.869208,0.873294,0.841196,0.907938
3000,0.3076,0.303828,0.872896,0.874443,0.857944,0.89159
3500,0.2319,0.319741,0.872145,0.870975,0.872666,0.869291
4000,0.2362,0.304118,0.87658,0.875986,0.873924,0.878056
4500,0.2392,0.311948,0.876667,0.877346,0.866426,0.888544
5000,0.2303,0.318221,0.87853,0.877421,0.87911,0.875738





Fold 2 Metrics:
Accuracy: 0.8786
Precision: 0.8704
Recall: 0.8876
F1: 0.8789

Cross-validation completed!


In [9]:
# Aggregate results
precision, recall, f1, _ = precision_recall_fscore_support(all_true, all_preds, average='binary')
acc = accuracy_score(all_true, all_preds)

print("\n=== Final Cross-Validation Metrics ===")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"\nTotal Training Time: {time.time() - start_time:.2f} seconds")


=== Final Cross-Validation Metrics ===
Accuracy: 0.8776
Precision: 0.8650
Recall: 0.8936
F1-Score: 0.8791

Total Training Time: 65822.50 seconds


In [10]:
# Test evaluation (removed time limit check)
print("\n=== Evaluating on Test Set ===")
test_encodings = tokenize_data(test_df['combined_text'])
test_dataset = FakeNewsDataset(test_encodings, torch.tensor(test_df['label'].tolist()))
test_outputs = trainer.predict(test_dataset)

print("\nTest Metrics:")
print(f"Accuracy: {test_outputs.metrics['test_accuracy']:.4f}")
print(f"F1: {test_outputs.metrics['test_f1']:.4f}")


=== Evaluating on Test Set ===





Test Metrics:
Accuracy: 0.8777
F1: 0.8787


In [11]:
# Fake News Detection Test (Standalone)
def predict_fake_news(text, model, tokenizer, device):
    """Predict whether a given text is fake news (1) or real (0)"""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred = torch.argmax(probs).item()
    return "Fake" if pred == 1 else "Real", probs[0][1].item()

# Test cases
test_samples = [
    "Scientists confirm that drinking bleach cures COVID-19",  # Clearly fake
    "The president signed the new budget bill today",         # Likely real
    "Aliens landed in New York and took over the government", # Fake
    "The stock market reached a new high this quarter",       # Real
    "Study shows chocolate is healthier than vegetables",     # Likely fake
    "City council approves new park construction plan"       # Likely real

    # Test
    "Donald Trump has three legs and four hands",
    "China will invade Taiwan next week",
    "Slovenia shares borders with Austria, Hungary, Italy and Croatia",
    "Slovenia shares borders with Ecuador",
    "Joe Biden won the 2024 US presidential election",
]


for text in test_samples:
    label, fake_prob = predict_fake_news(text, model, tokenizer, device)
    print(f"Text: '{text[:60]}...'")  # Show first 60 chars to avoid long outputs
    print(f"→ Prediction: {label} (Fake probability: {fake_prob:.2%})\n")

Text: 'Scientists confirm that drinking bleach cures COVID-19...'
→ Prediction: Fake (Fake probability: 96.56%)

Text: 'The president signed the new budget bill today...'
→ Prediction: Real (Fake probability: 36.15%)

Text: 'Aliens landed in New York and took over the government...'
→ Prediction: Real (Fake probability: 0.27%)

Text: 'The stock market reached a new high this quarter...'
→ Prediction: Fake (Fake probability: 91.20%)

Text: 'Study shows chocolate is healthier than vegetables...'
→ Prediction: Fake (Fake probability: 99.01%)

Text: 'City council approves new park construction planDonald Trump...'
→ Prediction: Fake (Fake probability: 60.83%)

Text: 'China will invade Taiwan next week...'
→ Prediction: Fake (Fake probability: 88.56%)

Text: 'Slovenia shares borders with Austria, Hungary, Italy and Cro...'
→ Prediction: Fake (Fake probability: 88.46%)

Text: 'Slovenia shares borders with Ecuador...'
→ Prediction: Fake (Fake probability: 93.83%)

Text: 'Joe Biden won the 202