In [None]:
import time
import os
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

# ✅ Disable Weights & Biases (W&B) logging
# Set at the top of the script, before any Trainer object is created further down.
os.environ["WANDB_DISABLED"] = "true"

# 1️⃣ Load PHEME Dataset
# The path is relative to the current working directory. Only the "Body" (text)
# and "Label" columns are read by the rest of this script.
df = pd.read_csv("pheme-dataset-for-rumour-detection.csv")  # Update with correct dataset path

# ✅ Drop NaN values in both text and labels
# Done before the casts below so they only see rows where both fields are present.
df = df.dropna(subset=["Body", "Label"])

texts = df["Body"].astype(str).tolist()  # Convert text to string format
# NOTE(review): the 0/1 label range is assumed from the dataset, not checked here;
# the model below is built with num_labels=2.
labels = df["Label"].astype(int).tolist()  # Ensure labels are integers (0 or 1)

# 2️⃣ Train-Test Split
# 80/20 split (test_size=0.2) with a fixed seed; stratify=labels is passed so the
# split is stratified on the label list.
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42, stratify=labels)

# 3️⃣ Load BERT Tokenizer
# Same checkpoint name ("bert-base-uncased") as the classification model loaded later.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenization Function
def encode_texts(texts, tokenizer, max_len=100):
    """Tokenize ``texts`` with padding and truncation, requesting PyTorch tensors.

    Args:
        texts: The texts to encode; handed to ``tokenizer`` as its only
            positional argument.
        tokenizer: Callable tokenizer. It is invoked with the keyword
            arguments ``padding``, ``truncation``, ``max_length`` and
            ``return_tensors``.
        max_len: Value forwarded as ``max_length`` (defaults to 100).

    Returns:
        Whatever ``tokenizer`` returns for that call.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": max_len,
        "return_tensors": "pt",  # ask for PyTorch tensors
    }
    encoded = tokenizer(texts, **tokenizer_options)
    return encoded

# Tokenize each split in one call, using encode_texts' default max_len (100).
# NOTE(review): because each split is encoded separately with padding=True, the
# padded width of the train and test tensors may differ — confirm this is acceptable.
train_encodings = encode_texts(X_train, tokenizer)
test_encodings = encode_texts(X_test, tokenizer)

# 4️⃣ Convert Data to Torch Dataset
class RumorDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with per-example labels.

    ``encodings`` must provide ``items()`` yielding ``(name, values)`` pairs in
    which ``values`` can be indexed by example position. ``labels`` is anything
    ``torch.tensor`` accepts.
    """

    def __init__(self, encodings, labels):
        """Keep a reference to ``encodings`` and store ``labels`` as a tensor."""
        self.encodings = encodings
        self.labels = torch.tensor(labels)  # Convert labels to tensors

    def __len__(self):
        """Return the number of examples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of its encoding fields plus ``"labels"``."""
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = values[idx]
        # Written last, so it takes precedence over any "labels" field in encodings.
        sample["labels"] = self.labels[idx]
        return sample

# Wrap the tokenized splits and their labels so they can be handed to the Trainer.
train_dataset = RumorDataset(train_encodings, y_train)
test_dataset = RumorDataset(test_encodings, y_test)

# 5️⃣ Load Pre-trained BERT Model
# num_labels=2 requests a two-class classification head. The captured output of
# this run reports the classifier weights as newly initialized.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# 6️⃣ Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation and checkpoint saving are both configured per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    # NOTE(review): no metric_for_best_model is set, so "best" presumably falls back
    # to the Trainer default (evaluation loss) — confirm. In the captured output the
    # final evaluation matches epoch 2 (lowest validation loss), not epoch 3 (highest F1).
    load_best_model_at_end=True,
    report_to="none",  # ✅ Disables W&B logging
)

# 7️⃣ Define Metrics Function & Build Trainer
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Also prints the confusion matrix to stdout each time it is called.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the predicted
            class is the argmax along axis 1) and ``label_ids`` (true labels).

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. Precision, recall and F1 use ``average='binary'``.
    """
    y_true = pred.label_ids
    # Highest-scoring class per example.
    y_pred = torch.argmax(torch.tensor(pred.predictions), axis=1)

    prf_scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    precision, recall, f1 = prf_scores[0], prf_scores[1], prf_scores[2]
    acc = accuracy_score(y_true, y_pred)
    matrix = confusion_matrix(y_true, y_pred)

    print("\n🔹 Confusion Matrix:")
    print(matrix)  # Prints confusion matrix

    metrics = {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
    return metrics

# NOTE(review): the held-out split is also used as eval_dataset. Combined with
# load_best_model_at_end=True in the training arguments, the checkpoint is chosen
# using the same data that is reported on below — consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# 8️⃣ Train Model & Measure Execution Time
print("🚀 Starting Sequential Training...")
start_time = time.time()

trainer.train()

# Wall-clock time of trainer.train() only; the final trainer.evaluate() call below
# is outside the timed region.
end_time = time.time()
sequential_time = end_time - start_time

# 9️⃣ Evaluate Model
# Runs one more evaluation on eval_dataset, so compute_metrics prints the
# confusion matrix once more.
results = trainer.evaluate()

# 🔹 Print Performance Metrics & Execution Time
# compute_metrics returns unprefixed keys ("accuracy", ...); they are read back
# here under an "eval_" prefix.
print("\n📊 Sequential Processing Results:")
print(f"✅ Accuracy: {results['eval_accuracy']:.4f}")
print(f"✅ Precision: {results['eval_precision']:.4f}")
print(f"✅ Recall: {results['eval_recall']:.4f}")
print(f"✅ F1-Score: {results['eval_f1']:.4f}")
print(f"⏳ Total Execution Time (Sequential): {sequential_time:.2f} sec")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🚀 Starting Sequential Training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2288,0.236949,0.913764,0.824117,0.77613,0.799404
2,0.2013,0.206149,0.923292,0.841588,0.805063,0.822921
3,0.065,0.292938,0.928257,0.857909,0.810127,0.833333



🔹 Confusion Matrix:
[[9266  458]
 [ 619 2146]]

🔹 Confusion Matrix:
[[9305  419]
 [ 539 2226]]

🔹 Confusion Matrix:
[[9353  371]
 [ 525 2240]]



🔹 Confusion Matrix:
[[9305  419]
 [ 539 2226]]

📊 Sequential Processing Results:
✅ Accuracy: 0.9233
✅ Precision: 0.8416
✅ Recall: 0.8051
✅ F1-Score: 0.8229
⏳ Total Execution Time (Sequential): 2821.93 sec
