In [1]:
# =====================================================
# BASELINE: LSTM (Bi-directional) - FAKENEWSNET
# Framework: PyTorch
# =====================================================

import os, re, time, pickle, psutil
import pandas as pd
import numpy as np
from collections import Counter
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset, concatenate_datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from google.colab import drive

# 1. SETUP & CONFIG
if not os.path.exists('/content/drive'):
    try:
        drive.mount('/content/drive', force_remount=True)
    except ValueError: pass

OUTPUT_DIR = "/content/drive/MyDrive/FakeNewsNet_LSTM_Baseline"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Hyperparameters (T·ªëi ∆∞u cho vƒÉn b·∫£n d√†i c·ªßa FakeNewsNet)
MAX_VOCAB_SIZE = 25000  # TƒÉng vocab size v√¨ b√†i b√°o ƒëa d·∫°ng t·ª´ v·ª±ng
MAX_SEQ_LEN = 300       # Gi·ªØ 300 t·ª´ ƒë·∫ßu ti√™n (ƒë·ªß ƒë·ªÉ n·∫Øm √Ω ch√≠nh)
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
BATCH_SIZE = 64
EPOCHS = 8
LEARNING_RATE = 0.001
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Device: {DEVICE}")

# 2. LOAD DATA
print("\n‚è≥ ƒêang t·∫£i dataset FakeNewsNet...")
try:
    ds_gossip = load_dataset("rickstello/FakeNewsNet", "gossipcop", split="train")
    ds_politi = load_dataset("rickstello/FakeNewsNet", "politifact", split="train")
    dataset_full = concatenate_datasets([ds_gossip, ds_politi])
    df = pd.DataFrame(dataset_full)
except Exception as e:
    print(f"‚ö†Ô∏è T·∫£i config con th·∫•t b·∫°i ({e}), t·∫£i b·∫£n default...")
    dataset = load_dataset("rickstello/FakeNewsNet", split="train")
    df = pd.DataFrame(dataset)

# 3. PRE-PROCESSING
# A. T√¨m t√™n c·ªôt an to√†n
text_col = next((c for c in ['news_content', 'text', 'content', 'body'] if c in df.columns), None)
title_col = next((c for c in ['title', 'news_title', 'headline'] if c in df.columns), None)
label_col = next((c for c in ['real', 'label', 'class', 'fake'] if c in df.columns), None)

if not label_col: raise ValueError("‚ùå Kh√¥ng t√¨m th·∫•y c·ªôt nh√£n!")

# B. X·ª≠ l√Ω d·ªØ li·ªáu
text_data = df[text_col].fillna('') if text_col else pd.Series([""] * len(df))
title_data = df[title_col].fillna('') if title_col else pd.Series([""] * len(df))
df['label'] = df[label_col].astype(int)

# C. Clean Text (Deep Learning Style)
def clean_text_dl(s):
    if not isinstance(s, str): return ""
    s = s.lower()
    s = re.sub(r'https?://\S+', '', s)
    s = re.sub(r'<.*?>', '', s)
    s = re.sub(r'[^a-z0-9\s]', '', s) # Gi·ªØ l·∫°i ch·ªØ v√† s·ªë
    s = re.sub(r'\s+', ' ', s).strip()
    return s

print("üßπ Pre-processing...")
# Gh√©p Title + Text: Title th∆∞·ªùng ch·ª©a th√¥ng tin quan tr·ªçng nh·∫•t (clickbait)
df['content'] = (title_data + " " + text_data).apply(clean_text_dl)
df = df[df['content'].str.len() > 50] # L·ªçc b·ªè m·∫´u r√°c

# 4. SPLIT DATA
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['content'].values, df['label'].values, test_size=0.2, random_state=42, stratify=df['label']
)

print(f"Train: {len(X_train_text)} | Test: {len(X_test_text)}")

# 5. VOCABULARY & TOKENIZATION
print("\n‚öôÔ∏è X√¢y d·ª±ng b·ªô t·ª´ v·ª±ng...")
word_counts = Counter()
for text in X_train_text:
    word_counts.update(text.split())

common_words = word_counts.most_common(MAX_VOCAB_SIZE - 2)
vocab = {word: i+2 for i, (word, _) in enumerate(common_words)}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

def encode_text(text, vocab, max_len):
    tokens = text.split()
    encoded = [vocab.get(token, vocab['<UNK>']) for token in tokens]
    if len(encoded) > max_len:
        encoded = encoded[:max_len]
    else:
        encoded = encoded + [vocab['<PAD>']] * (max_len - len(encoded))
    return encoded

# 6. DATASET & DATALOADER
class FnnDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_len):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self): return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoded = encode_text(text, self.vocab, self.max_len)
        return torch.tensor(encoded, dtype=torch.long), torch.tensor(label, dtype=torch.float)

train_ds = FnnDataset(X_train_text, y_train, vocab, MAX_SEQ_LEN)
test_ds  = FnnDataset(X_test_text, y_test, vocab, MAX_SEQ_LEN)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE * 2, shuffle=False)

# 7. MODEL ARCHITECTURE (Bi-LSTM)
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                            bidirectional=bidirectional, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        output, (hidden, cell) = self.lstm(embedded)
        if self.lstm.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
        return self.fc(hidden)

model = LSTMClassifier(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, 1, 2, True, 0.3).to(DEVICE)

# T√≠nh class weight cho Loss function (v√¨ FNN m·∫•t c√¢n b·∫±ng)
num_pos = sum(y_train)
num_neg = len(y_train) - num_pos
pos_weight = torch.tensor([num_neg / num_pos]).to(DEVICE) # Tr·ªçng s·ªë cho l·ªõp Positive (Real)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# 8. TRAINING LOOP
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    return correct.sum() / len(correct)

print(f"\nüöÄ B·∫Øt ƒë·∫ßu hu·∫•n luy·ªán LSTM (FakeNewsNet)...")

for epoch in range(EPOCHS):
    start_t = time.time()
    model.train()
    train_loss, train_acc = 0, 0

    for text, label in train_loader:
        text, label = text.to(DEVICE), label.to(DEVICE)
        optimizer.zero_grad()
        predictions = model(text).squeeze(1)
        loss = criterion(predictions, label)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        train_acc += binary_accuracy(predictions, label).item()

    end_t = time.time()
    print(f'Epoch {epoch+1:02} | Time: {int(end_t-start_t)}s | '
          f'Train Loss: {train_loss/len(train_loader):.3f} | Train Acc: {train_acc/len(train_loader)*100:.2f}%')

# 9. EVALUATION (HuggingFace Style)
print("\nüéØ ƒêANG ƒê√ÅNH GI√Å (TEST SET)...")
model.eval()
all_preds, all_labels, all_probs = [], [], []

start_eval = time.time()
with torch.no_grad():
    for text, label in test_loader:
        text, label = text.to(DEVICE), label.to(DEVICE)
        predictions = model(text).squeeze(1)
        prob = torch.sigmoid(predictions)
        all_probs.extend(prob.cpu().numpy())
        all_preds.extend(torch.round(prob).cpu().numpy())
        all_labels.extend(label.cpu().numpy())

runtime = time.time() - start_eval
samples_per_second = len(all_labels) / runtime

accuracy = accuracy_score(all_labels, all_preds)
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
auc = roc_auc_score(all_labels, all_probs)

eval_results = {
    'eval_accuracy': accuracy,
    'eval_precision': precision,
    'eval_recall': recall,
    'eval_f1': f1,
    'eval_auc': auc,
    'eval_loss': 'N/A',
    'eval_runtime': runtime,
    'eval_samples_per_second': samples_per_second
}

print("\n" + "="*50)
print("üìä K·∫æT QU·∫¢ LSTM BASELINE - FAKENEWSNET:")
print("="*50)
print(eval_results)
print("="*50)

# 10. SAVE
torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, "lstm_fnn_model.pth"))
with open(os.path.join(OUTPUT_DIR, "vocab_fnn.pkl"), "wb") as f:
    pickle.dump(vocab, f)
print("‚úÖ ƒê√£ l∆∞u model!")

Mounted at /content/drive
Device: cuda

‚è≥ ƒêang t·∫£i dataset FakeNewsNet...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

‚ö†Ô∏è T·∫£i config con th·∫•t b·∫°i (BuilderConfig 'gossipcop' not found. Available: ['default']), t·∫£i b·∫£n default...


FakeNewsNet.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/23196 [00:00<?, ? examples/s]

üßπ Pre-processing...
Train: 14700 | Test: 3675

‚öôÔ∏è X√¢y d·ª±ng b·ªô t·ª´ v·ª±ng...

üöÄ B·∫Øt ƒë·∫ßu hu·∫•n luy·ªán LSTM (FakeNewsNet)...
Epoch 01 | Time: 9s | Train Loss: 0.301 | Train Acc: 66.89%
Epoch 02 | Time: 8s | Train Loss: 0.250 | Train Acc: 77.40%
Epoch 03 | Time: 8s | Train Loss: 0.220 | Train Acc: 80.38%
Epoch 04 | Time: 8s | Train Loss: 0.197 | Train Acc: 82.19%
Epoch 05 | Time: 8s | Train Loss: 0.174 | Train Acc: 84.42%
Epoch 06 | Time: 8s | Train Loss: 0.156 | Train Acc: 85.68%
Epoch 07 | Time: 8s | Train Loss: 0.133 | Train Acc: 87.85%
Epoch 08 | Time: 8s | Train Loss: 0.120 | Train Acc: 89.36%

üéØ ƒêANG ƒê√ÅNH GI√Å (TEST SET)...

üìä K·∫æT QU·∫¢ LSTM BASELINE - FAKENEWSNET:
{'eval_accuracy': 0.7787755102040816, 'eval_precision': 0.8121835858216199, 'eval_recall': 0.7787755102040816, 'eval_f1': 0.7889353729807991, 'eval_auc': np.float64(0.8483959457909259), 'eval_loss': 'N/A', 'eval_runtime': 0.860276460647583, 'eval_samples_per_second': 4271.8825495162355}
‚ú