In [1]:
pip install torch transformers datasets nltk spacy pandas numpy scikit-learn lime requests streamlit

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting streamlit
  Downloading streamlit-1.43.1-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x

In [8]:
import pandas as pd
import numpy as np
import re
import torch
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Google Sheets CSV export URLs: one sheet ("train", "test", "valid") of the same workbook per split
train_url = "https://docs.google.com/spreadsheets/d/13Tpvc8Gyw0yaY3UhRnVdJeMIW0wVvdsp/gviz/tq?tqx=out:csv&sheet=train"
test_url = "https://docs.google.com/spreadsheets/d/13Tpvc8Gyw0yaY3UhRnVdJeMIW0wVvdsp/gviz/tq?tqx=out:csv&sheet=test"
valid_url = "https://docs.google.com/spreadsheets/d/13Tpvc8Gyw0yaY3UhRnVdJeMIW0wVvdsp/gviz/tq?tqx=out:csv&sheet=valid"


def _load_split(url):
    """Read one split from its CSV export URL and normalize its column names.

    Column names are stripped of surrounding whitespace and lower-cased so the
    rest of the notebook can address them uniformly (e.g. "statement", "label").
    """
    frame = pd.read_csv(url, skiprows=0)
    frame.columns = frame.columns.str.strip().str.lower()
    return frame


try:
    # Load datasets from Google Sheets
    df_train = _load_split(train_url)
    df_test = _load_split(test_url)
    df_valid = _load_split(valid_url)

    # Debug: Print column names
    print("Updated Column Names (Train):", df_train.columns)

except Exception as e:
    print(f"Error loading dataset: {e}")
    # Re-raise instead of continuing: without the frames, the very next
    # statement would fail with an unrelated-looking NameError.
    raise

# Drop missing values
# NOTE(review): dropna() discards a row when ANY column is missing, not only
# the text/label columns used below; confirm this is intended.
df_train = df_train.dropna()
df_test = df_test.dropna()
df_valid = df_valid.dropna()

# Ensure "statement" exists
if "statement" not in df_train.columns:
    raise KeyError("Error: 'statement' column not found! Check CSV format.")

# Lazily-filled cache so the NLTK stopword corpus is read at most once
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopword list as a frozenset, built on first use."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE["english"]


# Text cleaning function
def clean_text(text, stop_words=None):
    """Normalize a statement for bag-of-words style modelling.

    Steps, in order: cast to ``str`` and lower-case, delete digit runs, delete
    every character that is neither a word character nor whitespace, then drop
    stopwords and re-join the remaining words with single spaces.

    Args:
        text: Value to clean; anything is accepted because it is passed
            through ``str()`` first.
        stop_words: Optional container of words to drop. When ``None`` (the
            default) the NLTK English stopword list is used.

    Returns:
        The cleaned text as a single space-separated string (possibly empty).
    """
    if stop_words is None:
        # Looked up once per call from the cache; the previous version
        # re-evaluated stopwords.words('english') for every single word and
        # searched it as a list.
        stop_words = _english_stopwords()

    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return " ".join(word for word in text.split() if word not in stop_words)  # Remove stopwords

# Add a cleaned copy of every statement to each split
for frame in (df_train, df_test, df_valid):
    frame["clean_statement"] = frame["statement"].apply(clean_text)

# Encode labels (0 to 5 for six categories): the encoder is fitted on the
# training labels and then reused unchanged for the other two splits
label_encoder = LabelEncoder()
df_train["label"] = label_encoder.fit_transform(df_train["label"])
for frame in (df_test, df_valid):
    frame["label"] = label_encoder.transform(frame["label"])

# Carve a stratified 10% validation subset out of the training split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_train["clean_statement"],
    df_train["label"],
    test_size=0.1,
    stratify=df_train["label"],
    random_state=42,
)
test_texts = df_test["clean_statement"]
test_labels = df_test["label"]

# Save processed data for later use (one single-column CSV per series)
processed = {
    "train_texts.csv": train_texts,
    "val_texts.csv": val_texts,
    "test_texts.csv": test_texts,
    "train_labels.csv": train_labels,
    "val_labels.csv": val_labels,
    "test_labels.csv": test_labels,
}
for filename, series in processed.items():
    series.to_csv(filename, index=False)

print("✅ Dataset preprocessing complete!")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Updated Column Names (Train): Index(['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state',
       'party_affiliation', 'barely_true_count', 'false_count',
       'half_true_count', 'mostly_true_count', 'pants_on_fire_count',
       'context'],
      dtype='object')
✅ Dataset preprocessing complete!


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load processed data
# The text files are read with na_filter=False so every statement comes back as
# the exact string that was written: with default NA parsing, a statement that
# cleaned down to "" (or to a token such as "na"/"null") would be read back as
# NaN, which is not a valid document for the vectorizer.
train_texts = pd.read_csv("train_texts.csv", na_filter=False).values.ravel()
train_labels = pd.read_csv("train_labels.csv").values.ravel()
val_texts = pd.read_csv("val_texts.csv", na_filter=False).values.ravel()
val_labels = pd.read_csv("val_labels.csv").values.ravel()

# ✅ Train TF-IDF + Logistic Regression
# The vectorizer lives inside the pipeline, so the saved model accepts raw
# (cleaned) strings directly.
vectorizer = TfidfVectorizer(max_features=5000)
lr_clf = make_pipeline(vectorizer, LogisticRegression(max_iter=500))
lr_clf.fit(train_texts, train_labels)

# Evaluate on the held-out 10% of the training split
val_preds = lr_clf.predict(val_texts)
print("Traditional ML Model Performance:")
print(classification_report(val_labels, val_preds))

# Save model
joblib.dump(lr_clf, "lr_model.pkl")
print("✅ Logistic Regression Model Saved!")


Traditional ML Model Performance:
              precision    recall  f1-score   support

           0       0.17      0.10      0.13       105
           1       0.22      0.28      0.25       131
           2       0.22      0.26      0.24       142
           3       0.26      0.31      0.28       136
           4       0.00      0.00      0.00        44
           5       0.27      0.26      0.26       116

    accuracy                           0.23       674
   macro avg       0.19      0.20      0.19       674
weighted avg       0.22      0.23      0.22       674

✅ Logistic Regression Model Saved!


In [10]:
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader

# ✅ Load BERT tokenizer
# Fetches the tokenizer for the 'bert-base-uncased' checkpoint and keeps it in
# the module-level `tokenizer`, which is handed to FakeNewsDataset further down.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Define dataset class
class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Sequence of input strings (list, tuple, NumPy array or pandas
            Series).
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable used as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors="pt")``
            whose result exposes ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length every item is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = self._as_list(texts)
        self.labels = self._as_list(labels)
        # Fail at construction time instead of with an IndexError mid-epoch.
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    @staticmethod
    def _as_list(values):
        """Convert array-likes via ``tolist()`` and any other iterable via ``list()``."""
        return values.tolist() if hasattr(values, "tolist") else list(values)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text and its label for position ``idx``."""
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # squeeze(0) removes the leading batch axis added by return_tensors="pt"
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Load Data: wrap the training texts/labels in the dataset, then batch it
train_dataset = FakeNewsDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
)
# Batches of 16 items, reshuffled by the loader on each pass
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
)

# Define BERT+LSTM model
class HybridBERTLSTM(nn.Module):
    """BERT encoder (run without gradients) + bidirectional LSTM + linear classifier.

    Args:
        hidden_dim: Hidden size of each LSTM direction.
        num_classes: Number of output logits.

    The classifier reads the final hidden state of both LSTM directions,
    computed over the real (unmasked) tokens only. Checkpoints trained with the
    earlier "last time step" pooling still load (parameter names and shapes are
    unchanged) but should be retrained, because the classifier input changed.
    """

    def __init__(self, hidden_dim=128, num_classes=6):
        super().__init__()
        self.bert = AutoModel.from_pretrained('bert-base-uncased')
        # Take the encoder width from its config instead of hard-coding 768.
        self.lstm = nn.LSTM(self.bert.config.hidden_size, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape (batch, num_classes).

        Args:
            input_ids: Token ids, (batch, seq_len).
            attention_mask: 1 for real tokens, 0 for padding, (batch, seq_len).
                Assumes padding is on the right, so the number of ones per row
                is that row's true length — TODO confirm for other tokenizers.
        """
        # BERT is used as a fixed feature extractor: no gradients flow into it.
        with torch.no_grad():
            bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Pack by true length so the LSTM never steps over padding. Indexing
        # the unpacked output at [:, -1, :] would read a padded position for
        # the forward direction and a single-step state for the backward one.
        lengths = attention_mask.sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            bert_output.last_hidden_state, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)

        # h_n[-2] / h_n[-1]: final forward / backward hidden states of the LSTM.
        return self.fc(torch.cat((h_n[-2], h_n[-1]), dim=1))

# Train BERT+LSTM
# Use the GPU when one is visible to torch, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = HybridBERTLSTM()
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    model.train()
    for batch in train_loader:
        # Move every tensor of the batch onto the training device in one go
        batch = {key: tensor.to(device) for key, tensor in batch.items()}

        optimizer.zero_grad()
        outputs = model(batch["input_ids"], batch["attention_mask"])
        loss = criterion(outputs, batch["labels"])
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} complete.")

# Persist only the weights (state dict), not the full module object
torch.save(model.state_dict(), "bert_lstm_model.pth")
print("✅ BERT+LSTM Model Training Complete!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1 complete.
Epoch 2 complete.
Epoch 3 complete.
✅ BERT+LSTM Model Training Complete!
