In [1]:
# =====================================================
# BASELINE: SVM (LinearSVC) - LIAR DATASET
# Output Format: HuggingFace Style
# =====================================================

import os, re, psutil, pickle, time
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from google.colab import drive

# 1. MOUNT DRIVE & SETUP
if not os.path.exists('/content/drive'):
    try:
        drive.mount('/content/drive', force_remount=True)
    except ValueError: pass

OUTPUT_DIR = "/content/drive/MyDrive/LIAR_SVM_Baseline"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 2. LOAD DATA
print("‚è≥ ƒêang t·∫£i dataset LIAR...")
try:
    dataset = load_dataset("chengxuphd/liar2")
except Exception as e:
    print(f"‚ö†Ô∏è T·∫£i liar2 th·∫•t b·∫°i ({e}), d√πng b·∫£n g·ªëc 'liar'...")
    dataset = load_dataset("liar")

# Chuy·ªÉn sang Pandas & G·ªôp to√†n b·ªô
df_train = pd.DataFrame(dataset['train'])
df_val   = pd.DataFrame(dataset['validation'])
df_test  = pd.DataFrame(dataset['test'])
df = pd.concat([df_train, df_val, df_test], ignore_index=True)

print(f"T·ªïng s·ªë m·∫´u d·ªØ li·ªáu: {len(df)}")

# 3. PRE-PROCESSING
# A. Label Mapping (6 class -> 2 class)
def map_liar_labels(lbl):
    if isinstance(lbl, str):
        lbl = lbl.lower()
        if lbl in ['false', 'barely-true', 'pants-fire', 'pants-on-fire']: return 0
        if lbl in ['true', 'mostly-true', 'half-true']: return 1
        return 0
    if isinstance(lbl, (int, np.integer)):
        if lbl in [0, 4, 5]: return 0 # Fake
        if lbl in [1, 2, 3]: return 1 # Real
    return 0

df['binary_label'] = df['label'].apply(map_liar_labels)

# B. Feature Engineering (Gh√©p Metadata)
# SVM h∆∞·ªüng l·ª£i r·∫•t l·ªõn t·ª´ vi·ªác c√≥ nhi·ªÅu features r√µ r√†ng
def create_liar_content(row):
    stmt = str(row.get('statement', '')).strip()
    speaker = str(row.get('speaker', 'unknown'))
    party = str(row.get('party_affiliation', 'unknown'))
    subject = str(row.get('subject', 'unknown'))
    context = str(row.get('context', 'unknown'))

    # Gh√©p chu·ªói ƒë·ªÉ t·∫°o ng·ªØ c·∫£nh phong ph√∫ cho SVM ph√¢n t√°ch
    return f"{stmt} speaker: {speaker} party: {party} subject: {subject} context: {context}"

df['content'] = df.apply(create_liar_content, axis=1)

# C. Clean Text
def clean_text_ml(s):
    if not isinstance(s, str): return ""
    s = s.lower()
    s = re.sub(r'https?://\S+', '', s)
    s = re.sub(r'<.*?>', '', s)
    s = re.sub(r'[^a-z0-9\s:]', '', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

print("üßπ Pre-processing...")
df['content'] = df['content'].apply(clean_text_ml)

# 4. SPLIT DATA
# Train/Test Split
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['content'], df['binary_label'], test_size=0.2, random_state=42, stratify=df['binary_label']
)

print(f"Train: {len(X_train_text)} | Test: {len(X_test_text)}")

# 5. TF-IDF VECTORIZATION
print("\n‚öôÔ∏è Vectorizing (TF-IDF)...")
# SVM x·ª≠ l√Ω t·ªët s·ªë chi·ªÅu l·ªõn, ta c√≥ th·ªÉ d√πng 50k features
vectorizer = TfidfVectorizer(max_features=50000, stop_words='english', ngram_range=(1, 2))

X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# 6. TRAIN SVM (LinearSVC)
print("üöÄ Training SVM (LinearSVC)...")

# class_weight='balanced': C·ª±c k·ª≥ quan tr·ªçng v·ªõi LIAR ƒë·ªÉ c√¢n b·∫±ng gi·ªØa c√°c nh√£n
svm_model = LinearSVC(class_weight='balanced', random_state=42, max_iter=2000)

# B·ªçc trong CalibratedClassifierCV ƒë·ªÉ l·∫•y ƒë∆∞·ª£c x√°c su·∫•t (predict_proba) cho AUC
clf = CalibratedClassifierCV(svm_model)
clf.fit(X_train, y_train)

# =====================================================
# 7. EVALUATION
# =====================================================
print("\nüéØ ƒêANG ƒê√ÅNH GI√Å (TEST SET)...")

start_time = time.time()

# D·ª± ƒëo√°n
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]

end_time = time.time()
runtime = end_time - start_time
samples_per_second = len(y_test) / runtime

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
auc = roc_auc_score(y_test, y_prob)

eval_results = {
    'eval_accuracy': accuracy,
    'eval_precision': precision,
    'eval_recall': recall,
    'eval_f1': f1,
    'eval_auc': auc,
    'eval_loss': 'N/A (SVM)',
    'eval_runtime': runtime,
    'eval_samples_per_second': samples_per_second,
    'eval_steps_per_second': 'N/A'
}

print("\n" + "="*50)
print("üìä K·∫æT QU·∫¢ SVM BASELINE - LIAR:")
print("="*50)
print(eval_results)
print("="*50)

# 8. SAVE
with open(os.path.join(OUTPUT_DIR, "svm_liar_model.pkl"), "wb") as f:
    pickle.dump(clf, f)
with open(os.path.join(OUTPUT_DIR, "tfidf_liar_vectorizer.pkl"), "wb") as f:
    pickle.dump(vectorizer, f)
print(f"\n‚úÖ ƒê√£ l∆∞u model t·∫°i: {OUTPUT_DIR}")

Mounted at /content/drive
‚è≥ ƒêang t·∫£i dataset LIAR...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.csv:   0%|          | 0.00/19.0M [00:00<?, ?B/s]

valid.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/18369 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2297 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2296 [00:00<?, ? examples/s]

T·ªïng s·ªë m·∫´u d·ªØ li·ªáu: 22962
üßπ Pre-processing...
Train: 18369 | Test: 4593

‚öôÔ∏è Vectorizing (TF-IDF)...
üöÄ Training SVM (LinearSVC)...

üéØ ƒêANG ƒê√ÅNH GI√Å (TEST SET)...

üìä K·∫æT QU·∫¢ SVM BASELINE - LIAR:
{'eval_accuracy': 0.6244284781188766, 'eval_precision': 0.6061259112615859, 'eval_recall': 0.6244284781188766, 'eval_f1': 0.5722767011551267, 'eval_auc': np.float64(0.6243640116785167), 'eval_loss': 'N/A (SVM)', 'eval_runtime': 0.01596546173095703, 'eval_samples_per_second': 287683.50564482406, 'eval_steps_per_second': 'N/A'}

‚úÖ ƒê√£ l∆∞u model t·∫°i: /content/drive/MyDrive/LIAR_SVM_Baseline
