In [1]:
# ==========================================
# 1. IMPORT LIBRARIES
# ==========================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sentence_transformers import SentenceTransformer
from xgboost import XGBClassifier
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')


In [2]:

# ==========================================
# 2. LOAD DATA
# ==========================================
# Load the dataset found in your repository
df = pd.read_csv('cleaned_liar_dataset.csv')

print("Original Data Head:")
print(df.head(2))


Original Data Head:
           id  label                                          statement  \
0  11972.json   true  Building a wall on the U.S.-Mexico border will...   
1  11685.json  false  Wisconsin is on pace to double the number of l...   

       subject            speaker             job_title state_info  \
0  immigration         rick-perry              Governor      Texas   
1         jobs  katrina-shankland  State representative  Wisconsin   

  party_affiliation  barely_true_counts  false_counts  half_true_counts  \
0        republican                  30            30                42   
1          democrat                   2             1                 0   

   mostly_true_counts  pants_on_fire_counts            context  target  
0                  23                    18    Radio interview       1  
1                   0                     0  a news conference       0  


In [3]:

# ==========================================
# 3. PREPROCESS & LABEL MAPPING
# ==========================================
# CRITICAL STEP: Map the 6 labels to Binary (0 and 1)
# This matches the logic likely used in 'baseline-LR-1.ipynb' to get high scores.

def map_labels(label):
    label = str(label).lower()
    # Grouping 'True', 'Mostly True', and 'Half True' as REAL
    if label in ['true', 'mostly-true', 'half-true']:
        return 1
    # Grouping 'False', 'Barely True', 'Pants Fire' as FAKE
    else:
        return 0

# NOTE: Verify 'label' matches the actual column name in your CSV. 
# If your column is named 'truth_rating', change 'label' below to 'truth_rating'.
if 'label' in df.columns:
    target_col = 'label'
elif 'truth_rating' in df.columns:
    target_col = 'truth_rating'
else:
    # Fallback: assume the second column is the label if names are missing
    target_col = df.columns[4] 
    print(f"Warning: Could not find standard label column. Using '{target_col}'")

# Apply mapping
df['binary_label'] = df[target_col].apply(map_labels)

# Verify the balance
print("\nLabel Distribution (1=Real, 0=Fake):")
print(df['binary_label'].value_counts())



Label Distribution (1=Real, 0=Fake):
binary_label
1    714
0    553
Name: count, dtype: int64


In [5]:

# ==========================================
# 4. BERT FEATURE EXTRACTION
# ==========================================
# Your project docs mention "NLP techniques (BERT embeddings)" [2]
print("\nLoading BERT model (this may take a moment)...")
bert_model = SentenceTransformer('all-MiniLM-L6-v2') 

# NOTE: Verify 'text' matches your actual text column name. 
# It might be named 'statement', 'tweet', or 'claim'.
text_col = 'statement' if 'statement' in df.columns else df.columns # Fallback to 1st col

print(f"Encoding text from column: '{text_col}'...")
X_embeddings = bert_model.encode(df[text_col].astype(str).tolist(), show_progress_bar=True)
y = df['binary_label']



Loading BERT model (this may take a moment)...
Encoding text from column: 'statement'...


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

In [6]:

# ==========================================
# 5. TRAIN/TEST SPLIT
# ==========================================
X_train, X_test, y_train, y_test = train_test_split(
    X_embeddings, 
    y, 
    test_size=0.2, 
    random_state=42
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")



Training set size: 1013
Testing set size: 254


In [7]:

# ==========================================
# 6. MODEL 1: LOGISTIC REGRESSION (LR)
# ==========================================
# This replicates 'baseline-LR-1.ipynb'
print("\n--- Training Logistic Regression ---")
lr_model = LogisticRegression(solver='liblinear', random_state=42)
lr_model.fit(X_train, y_train)

y_pred_lr = lr_model.predict(X_test)
lr_acc = accuracy_score(y_test, y_pred_lr)
lr_f1 = f1_score(y_test, y_pred_lr, average='weighted')

print(f"Logistic Regression Accuracy: {lr_acc:.4f}")
print(f"Logistic Regression F1 Score: {lr_f1:.4f}")



--- Training Logistic Regression ---
Logistic Regression Accuracy: 0.5827
Logistic Regression F1 Score: 0.5721


In [8]:

# ==========================================
# 7. MODEL 2: XGBOOST
# ==========================================
# This replicates 'lr-xg-1.ipynb'
print("\n--- Training XGBoost ---")
xgb_model = XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)
xgb_acc = accuracy_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb, average='weighted')

print(f"XGBoost Accuracy: {xgb_acc:.4f}")
print(f"XGBoost F1 Score: {xgb_f1:.4f}")



--- Training XGBoost ---
XGBoost Accuracy: 0.5354
XGBoost F1 Score: 0.5310


In [9]:

# ==========================================
# 8. FINAL COMPARISON
# ==========================================
print("\n=== FINAL RESULTS ===")
print(f"LR Accuracy:  {lr_acc:.4f}")
print(f"XGB Accuracy: {xgb_acc:.4f}")

if xgb_acc > lr_acc:
    print("\nRecommendation: Use XGBoost for your final submission.")
else:
    print("\nRecommendation: Use Logistic Regression for your final submission.")


=== FINAL RESULTS ===
LR Accuracy:  0.5827
XGB Accuracy: 0.5354

Recommendation: Use Logistic Regression for your final submission.
