In [1]:
# =====================================================
# BASELINE: SVM (LinearSVC) - FAKENEWSNET
# Output Format: HuggingFace Style
# =====================================================

import os, re, psutil, pickle, time
import pandas as pd
import numpy as np
from datasets import load_dataset, concatenate_datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from google.colab import drive

# 1. MOUNT DRIVE
# Mount Google Drive only when the mount point does not exist yet.
if not os.path.exists('/content/drive'):
    try:
        drive.mount('/content/drive', force_remount=True)
    except ValueError as mount_err:
        # Best-effort: keep running, but make the failure visible instead of
        # swallowing it. Without a mounted Drive, os.makedirs below creates
        # the output directory on the local filesystem.
        print(f"WARNING: Google Drive mount failed ({mount_err}); "
              "the output directory will be created on the local filesystem.")

# Directory that receives the pickled model and vectorizer.
OUTPUT_DIR = "/content/drive/MyDrive/FakeNewsNet_SVM_Baseline"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 2. LOAD DATA (merge GossipCop + PolitiFact)
print("‚è≥ ƒêang t·∫£i dataset FakeNewsNet...")

HF_REPO = "rickstello/FakeNewsNet"


def _load_train_split(*config):
    """Load the train split of HF_REPO, optionally for one named config."""
    return load_dataset(HF_REPO, *config, split="train")


try:
    # Try the two sub-configs first to get the most complete data.
    ds_gossip = _load_train_split("gossipcop")
    ds_politi = _load_train_split("politifact")
    dataset_full = concatenate_datasets([ds_gossip, ds_politi])
    df = pd.DataFrame(dataset_full)
except Exception as e:
    # Any failure above (including a missing config) falls back to the
    # repository's default configuration.
    print(f"‚ö†Ô∏è T·∫£i config con th·∫•t b·∫°i ({e}), t·∫£i b·∫£n default...")
    dataset = _load_train_split()
    df = pd.DataFrame(dataset)

# Show the schema and the row count for a quick sanity check.
print(f"Dataset columns: {df.columns.tolist()}")
print(f"T·ªïng s·ªë m·∫´u d·ªØ li·ªáu: {len(df)}")

# 3. PRE-PROCESSING (guards against KeyError on unknown column layouts)

# A. Resolve column names defensively.
# Candidates are tried in priority order; the first one present in df wins,
# and None is used when no candidate exists.
text_col = next((c for c in ['news_content', 'text', 'content', 'body'] if c in df.columns), None)
title_col = next((c for c in ['title', 'news_title', 'headline'] if c in df.columns), None)
label_col = next((c for c in ['real', 'label', 'class', 'fake'] if c in df.columns), None)

if not label_col:
    raise ValueError(f"‚ùå Kh√¥ng t√¨m th·∫•y c·ªôt nh√£n! C√°c c·ªôt hi·ªán c√≥: {df.columns.tolist()}")

if not (text_col or title_col):
    # With neither column every row would become an empty string, be dropped
    # by the length filter, and fail later with a far less helpful error.
    raise ValueError(
        f"No text or title column found. Available columns: {df.columns.tolist()}"
    )

print(f"‚úÖ ƒê√£ nh·∫≠n di·ªán c·ªôt: Text='{text_col}', Title='{title_col}', Label='{label_col}'")

# B. Build the text inputs safely.
# A missing column is replaced by empty strings. The placeholder shares
# df.index so the later element-wise concatenation aligns row by row even if
# df does not carry a default RangeIndex.
empty_text = pd.Series("", index=df.index)
# astype(str) keeps string cells unchanged and makes non-string cells
# concatenable.
text_data = df[text_col].fillna('').astype(str) if text_col else empty_text
title_data = df[title_col].fillna('').astype(str) if title_col else empty_text

# Normalise the label column to int.
# NOTE(review): if the resolved column is 'fake', 1 presumably means "fake",
# while comments elsewhere in this script assume 1 = real; confirm the
# polarity before interpreting per-class results.
df['label'] = df[label_col].astype(int)

# C. Clean text
# Patterns are compiled once because the function is applied to every row.
_URL_RE = re.compile(r'https?://\S+')
# [^>]* (instead of .*?) also matches tags that span a line break.
_TAG_RE = re.compile(r'<[^>]*>')
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text_ml(s):
    """Normalise raw text for bag-of-words style models.

    Steps: lowercase, drop http(s) URLs, replace HTML tags with a space,
    delete every character that is not a-z, 0-9 or whitespace, then collapse
    whitespace runs into single spaces and trim the ends.

    Args:
        s: Raw text. Any non-string value (None, NaN, numbers) is treated as
            missing.

    Returns:
        The cleaned string, or "" when *s* is not a string.
    """
    if not isinstance(s, str):
        return ""
    s = s.lower()
    s = _URL_RE.sub('', s)
    # Tags become a space so that words separated only by markup
    # ("a<br>b") are not fused into one token.
    s = _TAG_RE.sub(' ', s)
    s = _NON_ALNUM_RE.sub('', s)
    return _WHITESPACE_RE.sub(' ', s).strip()

print("üßπ Pre-processing...")
# Gh√©p Title + Text
df['content'] = (title_data + " " + text_data).apply(clean_text_ml)

# L·ªçc b·ªè m·∫´u qu√° ng·∫Øn
df = df[df['content'].str.len() > 20]
print(f" ‚Üí Sau x·ª≠ l√Ω: {len(df):,}")

# 4. SPLIT DATA
# 80/20 split, stratified on the label, with a fixed seed for repeatability.
features, targets = df['content'], df['label']
X_train_text, X_test_text, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
    stratify=targets,
)

print(f"Train: {len(X_train_text)} | Test: {len(X_test_text)}")

# 5. TF-IDF VECTORIZATION
print("\n‚öôÔ∏è Vectorizing (TF-IDF)...")
# Author's rationale: FakeNewsNet contains long articles, so a 50k-term
# vocabulary is considered reasonable.
tfidf_params = {
    "max_features": 50000,
    "stop_words": 'english',
    "ngram_range": (1, 2),  # unigrams and bigrams
}
vectorizer = TfidfVectorizer(**tfidf_params)

# Fit the vocabulary on the training texts only; the test texts are just
# transformed with it.
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# 6. TRAIN SVM (LinearSVC)
print("üöÄ Training SVM (LinearSVC)...")

# Author's rationale: LinearSVC is fast and performs well on text, and
# class_weight='balanced' matters for FakeNewsNet because the "real" class
# outnumbers the "fake" class.
svm_model = LinearSVC(
    max_iter=2000,
    random_state=42,
    class_weight='balanced',
)

# LinearSVC has no predict_proba, so it is wrapped in CalibratedClassifierCV
# to obtain the probabilities needed for AUC.
clf = CalibratedClassifierCV(svm_model).fit(X_train, y_train)

# =====================================================
# 7. EVALUATION
# =====================================================
print("\nüéØ ƒêANG ƒê√ÅNH GI√Å (TEST SET)...")

# perf_counter is monotonic and high-resolution, which suits timing a call
# that finishes in a few milliseconds better than the wall clock does.
start_time = time.perf_counter()

# Predict hard labels and class probabilities.
y_pred = clf.predict(X_test)
# Column 1 of predict_proba; presumably the probability of label 1, assuming
# the labels are exactly {0, 1} (verify against clf.classes_).
y_prob = clf.predict_proba(X_test)[:, 1]

end_time = time.perf_counter()
runtime = end_time - start_time
# Guard against a zero-length interval so throughput never divides by zero.
samples_per_second = len(y_test) / runtime if runtime > 0 else float('inf')

# Metrics (precision / recall / F1 are support-weighted averages).
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
auc = roc_auc_score(y_test, y_prob)

# Results use HuggingFace-style keys. Metrics are cast to plain floats so the
# printed dict does not contain NumPy scalar wrappers.
eval_results = {
    'eval_accuracy': float(accuracy),
    'eval_precision': float(precision),
    'eval_recall': float(recall),
    'eval_f1': float(f1),
    'eval_auc': float(auc),
    'eval_loss': 'N/A (SVM)',
    'eval_runtime': runtime,
    'eval_samples_per_second': samples_per_second,
    'eval_steps_per_second': 'N/A'
}

print("\n" + "="*50)
print("üìä K·∫æT QU·∫¢ SVM BASELINE - FAKENEWSNET:")
print("="*50)
print(eval_results)
print("="*50)

# 8. SAVE
# Pickle the calibrated classifier and the fitted vectorizer into OUTPUT_DIR.
artifacts = (
    ("svm_fnn_model.pkl", clf),
    ("tfidf_fnn_vectorizer.pkl", vectorizer),
)
for filename, obj in artifacts:
    with open(os.path.join(OUTPUT_DIR, filename), "wb") as f:
        pickle.dump(obj, f)
print(f"\n‚úÖ ƒê√£ l∆∞u model t·∫°i: {OUTPUT_DIR}")

Mounted at /content/drive
⏳ Đang tải dataset FakeNewsNet...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

⚠️ Tải config con thất bại (BuilderConfig 'gossipcop' not found. Available: ['default']), tải bản default...


FakeNewsNet.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/23196 [00:00<?, ? examples/s]

Dataset columns: ['title', 'news_url', 'source_domain', 'tweet_num', 'real']
Tổng số mẫu dữ liệu: 23196
✅ Đã nhận diện cột: Text='None', Title='title', Label='real'
🧹 Pre-processing...
 → Sau xử lý: 22,355
Train: 17884 | Test: 4471

⚙️ Vectorizing (TF-IDF)...
🚀 Training SVM (LinearSVC)...

🎯 ĐANG ĐÁNH GIÁ (TEST SET)...

📊 KẾT QUẢ SVM BASELINE - FAKENEWSNET:
{'eval_accuracy': 0.842764482218743, 'eval_precision': 0.8377817144742215, 'eval_recall': 0.842764482218743, 'eval_f1': 0.8394047671549866, 'eval_auc': np.float64(0.8727983823123103), 'eval_loss': 'N/A (SVM)', 'eval_runtime': 0.02896738052368164, 'eval_samples_per_second': 154346.02367117154, 'eval_steps_per_second': 'N/A'}

✅ Đã lưu model tại: /content/drive/MyDrive/FakeNewsNet_SVM_Baseline
