In [1]:
# =====================================================
# BASELINE: LSTM (Bi-directional) - LIAR DATASET
# Framework: PyTorch
# =====================================================

import os, re, time, pickle, psutil
import pandas as pd
import numpy as np
from collections import Counter
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from google.colab import drive

# 1. SETUP & CONFIG
if not os.path.exists('/content/drive'):
    try:
        drive.mount('/content/drive', force_remount=True)
    except ValueError: pass

OUTPUT_DIR = "/content/drive/MyDrive/LIAR_LSTM_Baseline"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Hyperparameters (ƒêi·ªÅu ch·ªânh cho LIAR - dataset nh·ªè h∆°n WELFake)
MAX_VOCAB_SIZE = 15000  # LIAR √≠t t·ª´ v·ª±ng h∆°n
MAX_SEQ_LEN = 150       # C√¢u ng·∫Øn, c·ªông th√™m metadata th√¨ t·∫ßm 100-150 l√† ƒë·ªß
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
BATCH_SIZE = 32         # Batch nh·ªè gi√∫p h·ªôi t·ª• t·ªët h∆°n tr√™n dataset nh·ªè
EPOCHS = 10             # TƒÉng epoch v√¨ dataset nh·ªè
LEARNING_RATE = 0.001
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Device: {DEVICE}")

# 2. LOAD DATA
print("\n‚è≥ ƒêang t·∫£i dataset LIAR...")
try:
    dataset = load_dataset("chengxuphd/liar2")
except Exception as e:
    print(f"‚ö†Ô∏è T·∫£i liar2 th·∫•t b·∫°i, d√πng b·∫£n g·ªëc 'liar'...")
    dataset = load_dataset("liar")

df_train = pd.DataFrame(dataset['train'])
df_val   = pd.DataFrame(dataset['validation'])
df_test  = pd.DataFrame(dataset['test'])
df = pd.concat([df_train, df_val, df_test], ignore_index=True)

# 3. PRE-PROCESSING
# A. Label Mapping
def map_liar_labels(lbl):
    if isinstance(lbl, str):
        lbl = lbl.lower()
        if lbl in ['false', 'barely-true', 'pants-fire', 'pants-on-fire']: return 0
        if lbl in ['true', 'mostly-true', 'half-true']: return 1
        return 0
    if isinstance(lbl, (int, np.integer)):
        if lbl in [0, 4, 5]: return 0
        if lbl in [1, 2, 3]: return 1
    return 0

df['binary_label'] = df['label'].apply(map_liar_labels)

# B. Feature Engineering & Cleaning
def prepare_text_dl(row):
    # L·∫•y th√¥ng tin
    stmt = str(row.get('statement', '')).strip()
    speaker = str(row.get('speaker', 'unknown'))
    party = str(row.get('party_affiliation', 'unknown'))
    context = str(row.get('context', 'unknown'))

    # Gh√©p chu·ªói: LSTM s·∫Ω ƒë·ªçc t·ª´ tr√°i sang ph·∫£i
    # "thu·∫ø tƒÉng cao <sep> trump <sep> c·ªông h√≤a"
    text = f"{stmt} <sep> {speaker} {party} {context}"

    # Cleaning
    text = text.lower()
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z0-9\s]', '', text) # Gi·ªØ l·∫°i ch·ªØ s·ªë
    text = re.sub(r'\s+', ' ', text).strip()
    return text

print("üßπ Pre-processing & Feature Fusion...")
df['content'] = df.apply(prepare_text_dl, axis=1)
df = df[df['content'].str.len() > 5] # B·ªè m·∫´u r·ªóng

# 4. SPLIT DATA
# Split gi·ªëng nh∆∞ c√°c b√†i tr∆∞·ªõc ƒë·ªÉ so s√°nh c√¥ng b·∫±ng
X_train_text, X_temp_text, y_train, y_temp = train_test_split(
    df['content'].values, df['binary_label'].values, test_size=0.2, random_state=42, stratify=df['binary_label']
)
X_val_text, X_test_text, y_val, y_test = train_test_split(
    X_temp_text, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"Train: {len(X_train_text)} | Val: {len(X_val_text)} | Test: {len(X_test_text)}")

# 5. VOCABULARY & TOKENIZATION
print("\n‚öôÔ∏è X√¢y d·ª±ng b·ªô t·ª´ v·ª±ng...")
word_counts = Counter()
for text in X_train_text:
    word_counts.update(text.split())

common_words = word_counts.most_common(MAX_VOCAB_SIZE - 2)
vocab = {word: i+2 for i, (word, _) in enumerate(common_words)}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

def encode_text(text, vocab, max_len):
    tokens = text.split()
    encoded = [vocab.get(token, vocab['<UNK>']) for token in tokens]
    if len(encoded) > max_len:
        encoded = encoded[:max_len]
    else:
        encoded = encoded + [vocab['<PAD>']] * (max_len - len(encoded))
    return encoded

# 6. DATASET & DATALOADER
class LiarDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_len):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self): return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoded = encode_text(text, self.vocab, self.max_len)
        return torch.tensor(encoded, dtype=torch.long), torch.tensor(label, dtype=torch.float)

train_ds = LiarDataset(X_train_text, y_train, vocab, MAX_SEQ_LEN)
val_ds   = LiarDataset(X_val_text, y_val, vocab, MAX_SEQ_LEN)
test_ds  = LiarDataset(X_test_text, y_test, vocab, MAX_SEQ_LEN)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE*2, shuffle=False)
test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE*2, shuffle=False)

# 7. MODEL ARCHITECTURE (Bi-LSTM)
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                            bidirectional=bidirectional, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        output, (hidden, cell) = self.lstm(embedded)
        if self.lstm.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
        return self.fc(hidden)

model = LSTMClassifier(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, 1, 2, True, 0.4).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()

# 8. TRAINING LOOP
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    return correct.sum() / len(correct)

print(f"\nüöÄ B·∫Øt ƒë·∫ßu hu·∫•n luy·ªán LSTM (LIAR)...")

for epoch in range(EPOCHS):
    start_t = time.time()
    model.train()
    train_loss, train_acc = 0, 0

    for text, label in train_loader:
        text, label = text.to(DEVICE), label.to(DEVICE)
        optimizer.zero_grad()
        predictions = model(text).squeeze(1)
        loss = criterion(predictions, label)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        train_acc += binary_accuracy(predictions, label).item()

    # Validation loop
    model.eval()
    val_loss, val_acc = 0, 0
    with torch.no_grad():
        for text, label in val_loader:
            text, label = text.to(DEVICE), label.to(DEVICE)
            predictions = model(text).squeeze(1)
            loss = criterion(predictions, label)
            val_loss += loss.item()
            val_acc += binary_accuracy(predictions, label).item()

    end_t = time.time()
    print(f'Epoch {epoch+1:02} | Time: {int(end_t-start_t)}s | '
          f'Train Loss: {train_loss/len(train_loader):.3f} | Val Acc: {val_acc/len(val_loader)*100:.2f}%')

# 9. EVALUATION (HuggingFace Style)
print("\nüéØ ƒêANG ƒê√ÅNH GI√Å (TEST SET)...")
model.eval()
all_preds, all_labels, all_probs = [], [], []

start_eval = time.time()
with torch.no_grad():
    for text, label in test_loader:
        text, label = text.to(DEVICE), label.to(DEVICE)
        predictions = model(text).squeeze(1)
        prob = torch.sigmoid(predictions)
        all_probs.extend(prob.cpu().numpy())
        all_preds.extend(torch.round(prob).cpu().numpy())
        all_labels.extend(label.cpu().numpy())

runtime = time.time() - start_eval
samples_per_second = len(all_labels) / runtime

accuracy = accuracy_score(all_labels, all_preds)
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
auc = roc_auc_score(all_labels, all_probs)

eval_results = {
    'eval_accuracy': accuracy,
    'eval_precision': precision,
    'eval_recall': recall,
    'eval_f1': f1,
    'eval_auc': auc,
    'eval_loss': 'N/A',
    'eval_runtime': runtime,
    'eval_samples_per_second': samples_per_second
}

print("\n" + "="*50)
print("üìä K·∫æT QU·∫¢ LSTM BASELINE - LIAR:")
print("="*50)
print(eval_results)
print("="*50)

# 10. SAVE
torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, "lstm_liar_model.pth"))
with open(os.path.join(OUTPUT_DIR, "vocab_liar.pkl"), "wb") as f:
    pickle.dump(vocab, f)
print("‚úÖ ƒê√£ l∆∞u model!")

Mounted at /content/drive
Device: cuda

‚è≥ ƒêang t·∫£i dataset LIAR...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.csv:   0%|          | 0.00/19.0M [00:00<?, ?B/s]

valid.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/18369 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2297 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2296 [00:00<?, ? examples/s]

üßπ Pre-processing & Feature Fusion...
Train: 18369 | Val: 2296 | Test: 2297

‚öôÔ∏è X√¢y d·ª±ng b·ªô t·ª´ v·ª±ng...

üöÄ B·∫Øt ƒë·∫ßu hu·∫•n luy·ªán LSTM (LIAR)...
Epoch 01 | Time: 7s | Train Loss: 0.670 | Val Acc: 60.59%
Epoch 02 | Time: 6s | Train Loss: 0.658 | Val Acc: 61.04%
Epoch 03 | Time: 6s | Train Loss: 0.641 | Val Acc: 62.31%
Epoch 04 | Time: 7s | Train Loss: 0.621 | Val Acc: 62.90%
Epoch 05 | Time: 6s | Train Loss: 0.598 | Val Acc: 62.69%
Epoch 06 | Time: 6s | Train Loss: 0.575 | Val Acc: 60.51%
Epoch 07 | Time: 6s | Train Loss: 0.552 | Val Acc: 61.02%
Epoch 08 | Time: 6s | Train Loss: 0.527 | Val Acc: 61.33%
Epoch 09 | Time: 6s | Train Loss: 0.505 | Val Acc: 62.11%
Epoch 10 | Time: 6s | Train Loss: 0.477 | Val Acc: 62.12%

üéØ ƒêANG ƒê√ÅNH GI√Å (TEST SET)...

üìä K·∫æT QU·∫¢ LSTM BASELINE - LIAR:
{'eval_accuracy': 0.5964301262516326, 'eval_precision': 0.5885629075376345, 'eval_recall': 0.5964301262516326, 'eval_f1': 0.5912202642249896, 'eval_auc': np.float64(0.60072394