In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Cell 1: Install dependencies
!pip install transformers torch pandas scikit-learn

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [4]:
import pandas as pd
import numpy as np
import torch
import time
import os
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Disable WandB to avoid API key prompts
os.environ["WANDB_DISABLED"] = "true"

In [5]:
# Load and sample 10% of data
train_df = pd.read_csv('/content/drive/MyDrive/Colab/all_samples/all_train.tsv', sep='\t').sample(frac=0.1, random_state=42)
test_df = pd.read_csv('/content/drive/MyDrive/Colab/all_samples/all_test_public.tsv', sep='\t').sample(frac=0.1, random_state=42)

# Preprocess text and labels
train_df['combined_text'] = train_df['clean_title'].fillna(train_df['title'])
test_df['combined_text'] = test_df['clean_title'].fillna(test_df['title'])
train_df['label'] = train_df['2_way_label']
test_df['label'] = test_df['2_way_label']

# Remove NaN values
train_df = train_df.dropna(subset=['combined_text'])
test_df = test_df.dropna(subset=['combined_text'])

print(f"Training samples: {len(train_df)}, Test samples: {len(test_df)}")
print("\nSample data:")
print(train_df[['combined_text', 'label']].head())

Training samples: 80538, Test samples: 8486

Sample data:
                                            combined_text  label
681297              woman on a swing on top of a mountain      1
436392  vegan denied passport because she was being an...      1
797451  sacrificing jessica to the almighty kingfisher...      0
61792                                  this map of europe      1
19909                              rancor snacking on dog      0


In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(texts):
    return tokenizer(
        texts.tolist(),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [9]:
# Hyperparameters
BATCH_SIZE = 32
NUM_EPOCHS = 2
MAX_LENGTH = 128
EVAL_STEPS = 500
KFOLDS = 2

# Initialize KFold
kfold = KFold(n_splits=KFOLDS, shuffle=True, random_state=42)

# Timer
start_time = time.time()

# Metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [10]:
# Initialize lists to store predictions across all folds
all_true = []
all_preds = []

for fold, (train_idx, val_idx) in enumerate(kfold.split(train_df)):
    print(f"\n=== Fold {fold + 1}/{KFOLDS} ===")

    # Split data
    train_data = train_df.iloc[train_idx]
    val_data = train_df.iloc[val_idx]

    # Tokenize
    train_encodings = tokenize_data(train_data['combined_text'])
    val_encodings = tokenize_data(val_data['combined_text'])

    # Create datasets
    train_dataset = FakeNewsDataset(train_encodings, torch.tensor(train_data['label'].tolist()))
    val_dataset = FakeNewsDataset(val_encodings, torch.tensor(val_data['label'].tolist()))

    # Initialize model
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.to(device)

    # Configure training
    training_args = TrainingArguments(
        output_dir=f'./results_fold_{fold}',
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        logging_dir=f'./logs_fold_{fold}',
        logging_steps=100,
        eval_strategy='steps',
        eval_steps=EVAL_STEPS,
        save_steps=EVAL_STEPS,
        load_best_model_at_end=True,
        metric_for_best_model='f1'
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    # Train
    trainer.train()

    # Evaluate
    val_outputs = trainer.predict(val_dataset)
    val_preds = np.argmax(val_outputs.predictions, axis=1)

    # Store predictions
    all_true.extend(val_data['label'].tolist())
    all_preds.extend(val_preds)

    # Print fold results - FIXED METRIC ACCESS
    print(f"\nFold {fold + 1} Metrics:")
    metrics = compute_metrics(val_outputs)  # Compute all metrics once
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1: {metrics['f1']:.4f}")

print("\nCross-validation completed!")


=== Fold 1/2 ===


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.4018,0.393386,0.827411,0.830859,0.814486,0.847904
1000,0.3568,0.37237,0.842261,0.841216,0.846719,0.835784
1500,0.2617,0.382065,0.844868,0.843186,0.852322,0.834244
2000,0.2246,0.398313,0.844744,0.846433,0.837221,0.855851
2500,0.2176,0.39874,0.84606,0.844959,0.850939,0.839062



Fold 1 Metrics:
Accuracy: 0.8447
Precision: 0.8372
Recall: 0.8559
F1: 0.8464

=== Fold 2/2 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.4289,0.391436,0.828603,0.834095,0.806639,0.863485
1000,0.3791,0.380758,0.829621,0.841456,0.785394,0.906136
1500,0.2483,0.381715,0.850034,0.849383,0.851315,0.847459
2000,0.2761,0.369693,0.848866,0.851278,0.836238,0.866869
2500,0.2584,0.372326,0.851449,0.853812,0.838767,0.869407



Fold 2 Metrics:
Accuracy: 0.8514
Precision: 0.8388
Recall: 0.8694
F1: 0.8538

Cross-validation completed!


In [11]:
# Aggregate results
precision, recall, f1, _ = precision_recall_fscore_support(all_true, all_preds, average='binary')
acc = accuracy_score(all_true, all_preds)

print("\n=== Final Cross-Validation Metrics ===")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"\nTotal Training Time: {time.time() - start_time:.2f} seconds")


=== Final Cross-Validation Metrics ===
Accuracy: 0.8481
Precision: 0.8380
Recall: 0.8626
F1-Score: 0.8501

Total Training Time: 4191.13 seconds


In [12]:
if (time.time() - start_time) < 4 * 3600:  # Only run if under 4 hours
    print("\n=== Evaluating on Test Set ===")
    test_encodings = tokenize_data(test_df['combined_text'])
    test_dataset = FakeNewsDataset(test_encodings, torch.tensor(test_df['label'].tolist()))
    test_outputs = trainer.predict(test_dataset)

    print("\nTest Metrics:")
    print(f"Accuracy: {test_outputs.metrics['test_accuracy']:.4f}")
    print(f"F1: {test_outputs.metrics['test_f1']:.4f}")
else:
    print("\nSkipping test evaluation (time limit exceeded)")


=== Evaluating on Test Set ===



Test Metrics:
Accuracy: 0.8476
F1: 0.8513


In [13]:
# ==============================================
# FAKE NEWS DETECTION TEST (STANDALONE)
# ==============================================
def predict_fake_news(text, model, tokenizer, device):
    """Predict whether a given text is fake news (1) or real (0)"""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred = torch.argmax(probs).item()
    return "Fake" if pred == 1 else "Real", probs[0][1].item()  # Return label and fake probability

# Test cases (customize these as needed)
test_samples = [
    "Scientists confirm that drinking bleach cures COVID-19",  # Clearly fake
    "The president signed the new budget bill today",         # Likely real
    "Aliens landed in New York and took over the government", # Fake
    "The stock market reached a new high this quarter",       # Real
    "Study shows chocolate is healthier than vegetables",     # Likely fake
    "City council approves new park construction plan"       # Likely real
]

print("\n=== Fake News Detection Test ===")
print("(Using last trained model from previous cell)\n")

for text in test_samples:
    label, fake_prob = predict_fake_news(text, model, tokenizer, device)
    print(f"Text: '{text[:60]}...'")  # Show first 60 chars to avoid long outputs
    print(f"→ Prediction: {label} (Fake probability: {fake_prob:.2%})\n")


=== Fake News Detection Test ===
(Using last trained model from previous cell)

Text: 'Scientists confirm that drinking bleach cures COVID-19...'
→ Prediction: Fake (Fake probability: 85.84%)

Text: 'The president signed the new budget bill today...'
→ Prediction: Real (Fake probability: 10.56%)

Text: 'Aliens landed in New York and took over the government...'
→ Prediction: Real (Fake probability: 3.66%)

Text: 'The stock market reached a new high this quarter...'
→ Prediction: Fake (Fake probability: 53.84%)

Text: 'Study shows chocolate is healthier than vegetables...'
→ Prediction: Fake (Fake probability: 97.86%)

Text: 'City council approves new park construction plan...'
→ Prediction: Fake (Fake probability: 91.47%)

