In [1]:
# =====================================================
# BASELINE: LOGISTIC REGRESSION - LIAR DATASET
# Output Format: HuggingFace Style
# =====================================================

import os, re, psutil, pickle, time
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from google.colab import drive

# 1. MOUNT DRIVE & SETUP
if not os.path.exists('/content/drive'):
    try:
        drive.mount('/content/drive', force_remount=True)
    except ValueError: pass

OUTPUT_DIR = "/content/drive/MyDrive/LIAR_LogReg_Baseline"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 2. LOAD DATA
print("‚è≥ ƒêang t·∫£i dataset LIAR...")
try:
    dataset = load_dataset("chengxuphd/liar2")
except Exception as e:
    print(f"‚ö†Ô∏è T·∫£i liar2 th·∫•t b·∫°i ({e}), d√πng b·∫£n g·ªëc 'liar'...")
    dataset = load_dataset("liar")

# Chuy·ªÉn sang Pandas & G·ªôp to√†n b·ªô ƒë·ªÉ chia split gi·ªëng nh∆∞ RoBERTa
df_train = pd.DataFrame(dataset['train'])
df_val   = pd.DataFrame(dataset['validation'])
df_test  = pd.DataFrame(dataset['test'])
df = pd.concat([df_train, df_val, df_test], ignore_index=True)

print(f"T·ªïng s·ªë m·∫´u d·ªØ li·ªáu: {len(df)}")

# 3. PRE-PROCESSING
# A. Label Mapping (6 class -> 2 class)
def map_liar_labels(lbl):
    # X·ª≠ l√Ω n·∫øu nh√£n l√† chu·ªói
    if isinstance(lbl, str):
        lbl = lbl.lower()
        if lbl in ['false', 'barely-true', 'pants-fire', 'pants-on-fire']: return 0
        if lbl in ['true', 'mostly-true', 'half-true']: return 1
        return 0
    # X·ª≠ l√Ω n·∫øu nh√£n l√† s·ªë (0,1,2,3,4,5)
    if isinstance(lbl, (int, np.integer)):
        if lbl in [0, 4, 5]: return 0 # Fake group
        if lbl in [1, 2, 3]: return 1 # Real group
    return 0

df['binary_label'] = df['label'].apply(map_liar_labels)

# B. Feature Engineering (Quan tr·ªçng cho TF-IDF)
def create_liar_content(row):
    stmt = str(row.get('statement', '')).strip()
    speaker = str(row.get('speaker', 'unknown'))
    party = str(row.get('party_affiliation', 'unknown'))
    subject = str(row.get('subject', 'unknown'))
    context = str(row.get('context', 'unknown'))

    # Gh√©p t·∫•t c·∫£ th√†nh m·ªôt chu·ªói vƒÉn b·∫£n ƒë·ªÉ TF-IDF h·ªçc
    # V√≠ d·ª•: "Tax cuts are good. speaker: trump. party: republican. context: debate"
    return f"{stmt} speaker: {speaker} party: {party} subject: {subject} context: {context}"

df['content'] = df.apply(create_liar_content, axis=1)

# C. Clean Text
def clean_text_ml(s):
    if not isinstance(s, str): return ""
    s = s.lower()
    s = re.sub(r'https?://\S+', '', s)
    s = re.sub(r'<.*?>', '', s)
    # Gi·ªØ l·∫°i ch·ªØ, s·ªë v√† d·∫•u hai ch·∫•m (:) ƒë·ªÉ ph√¢n bi·ªát c√°c tr∆∞·ªùng metadata
    s = re.sub(r'[^a-z0-9\s:]', '', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

print("üßπ Pre-processing...")
df['content'] = df['content'].apply(clean_text_ml)

# 4. SPLIT DATA
# Chia t·∫≠p train/test (80/10/10) - D√πng l·∫°i random_state=42 ƒë·ªÉ kh·ªõp v·ªõi RoBERTa
X_train_text, X_temp_text, y_train, y_temp = train_test_split(
    df['content'], df['binary_label'], test_size=0.2, random_state=42, stratify=df['binary_label']
)
X_val_text, X_test_text, y_val, y_test = train_test_split(
    X_temp_text, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"Train: {len(X_train_text)} | Test: {len(X_test_text)}")

# 5. TF-IDF VECTORIZATION
print("\n‚öôÔ∏è Vectorizing (TF-IDF)...")
# LIAR t·ª´ v·ª±ng phong ph√∫ (do t√™n ng∆∞·ªùi, ƒë·ªãa danh), nh∆∞ng 30k features l√† ƒë·ªß
vectorizer = TfidfVectorizer(max_features=30000, stop_words='english', ngram_range=(1, 2))

X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# 6. TRAIN LOGISTIC REGRESSION
print("üöÄ Training Logistic Regression...")
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    class_weight='balanced', # LIAR h∆°i m·∫•t c√¢n b·∫±ng, n√™n th√™m c√°i n√†y
    n_jobs=-1,
    random_state=42
)
model.fit(X_train, y_train)

# =====================================================
# 7. EVALUATION
# =====================================================
print("\nüéØ ƒêANG ƒê√ÅNH GI√Å (TEST SET)...")

start_time = time.time()

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

end_time = time.time()
runtime = end_time - start_time
samples_per_second = len(y_test) / runtime

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
auc = roc_auc_score(y_test, y_prob)

eval_results = {
    'eval_accuracy': accuracy,
    'eval_precision': precision,
    'eval_recall': recall,
    'eval_f1': f1,
    'eval_auc': auc,
    'eval_loss': 'N/A (LogReg)',
    'eval_runtime': runtime,
    'eval_samples_per_second': samples_per_second,
    'eval_steps_per_second': 'N/A'
}

print("\n" + "="*50)
print("üìä K·∫æT QU·∫¢ LOGISTIC REGRESSION - LIAR:")
print("="*50)
print(eval_results)
print("="*50)

# 8. SAVE
with open(os.path.join(OUTPUT_DIR, "logreg_liar_model.pkl"), "wb") as f:
    pickle.dump(model, f)
with open(os.path.join(OUTPUT_DIR, "tfidf_liar_vectorizer.pkl"), "wb") as f:
    pickle.dump(vectorizer, f)
print(f"\n‚úÖ ƒê√£ l∆∞u model t·∫°i: {OUTPUT_DIR}")

Mounted at /content/drive
‚è≥ ƒêang t·∫£i dataset LIAR...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.csv:   0%|          | 0.00/19.0M [00:00<?, ?B/s]

valid.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/18369 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2297 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2296 [00:00<?, ? examples/s]

T·ªïng s·ªë m·∫´u d·ªØ li·ªáu: 22962
üßπ Pre-processing...
Train: 18369 | Test: 2297

‚öôÔ∏è Vectorizing (TF-IDF)...
üöÄ Training Logistic Regression...

üéØ ƒêANG ƒê√ÅNH GI√Å (TEST SET)...

üìä K·∫æT QU·∫¢ LOGISTIC REGRESSION - LIAR:
{'eval_accuracy': 0.6051371353939922, 'eval_precision': 0.6150566405388351, 'eval_recall': 0.6051371353939922, 'eval_f1': 0.6085007918330042, 'eval_auc': np.float64(0.6418397155013653), 'eval_loss': 'N/A (LogReg)', 'eval_runtime': 0.002213001251220703, 'eval_samples_per_second': 1037956.9368670546, 'eval_steps_per_second': 'N/A'}





‚úÖ ƒê√£ l∆∞u model t·∫°i: /content/drive/MyDrive/LIAR_LogReg_Baseline
