# IndoHoaxDetector Model Evaluation

This notebook evaluates the IndoHoaxDetector model (TF-IDF + Logistic Regression) on labeled data.

It loads the trained model and vectorizer, evaluates on preprocessed labeled data, and provides:
- Accuracy, F1 scores, confusion matrix
- Error analysis with high-confidence false positives/negatives

Default paths and column names are defined in the Configuration cell below; adjust them to match your files, then run all cells to evaluate.

In [None]:
import sys
import os
from typing import Tuple

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
)

## Configuration

Set paths and column names. Defaults are for your IndoHoaxDetector project.

In [None]:
# Mapping from the integer class ids to the names shown in printed output
INT_TO_STRING_LABEL = {0: "FAKTA", 1: "HOAX"}

# Column names expected in the labeled CSV
DEFAULT_TEXT_COL = "text_clean"
DEFAULT_LABEL_COL = "label_encoded"

# Serialized model artifacts (change these if your files live elsewhere)
DEFAULT_MODEL_PATH = "logreg_model.pkl"
DEFAULT_VECTORIZER_PATH = "tfidf_vectorizer.pkl"

# Settings actually used by the notebook; each falls back to the default above
MODEL_PATH = DEFAULT_MODEL_PATH
VECTORIZER_PATH = DEFAULT_VECTORIZER_PATH
TEXT_COL = DEFAULT_TEXT_COL
LABEL_COL = DEFAULT_LABEL_COL

# Location of the labeled CSV to evaluate on (point this at your own file)
DATA_PATH = "g:/My Drive/University Files/5th Semester/Data Science/Project/preprocessed_data_FINAL_FINAL.csv"

# Upper bound on the number of misclassified examples printed per error type
MAX_SHOW = 5

## Loading Utilities

In [None]:
def load_model_and_vectorizer(
    model_path: str,
    vectorizer_path: str,
):
    """
    Load the serialized classifier and vectorizer from disk.

    Both paths are checked for existence before anything is loaded. If either
    is missing, an error is printed to stderr and ``(None, None)`` is returned.
    Otherwise the two objects are loaded with ``joblib.load`` and returned as
    ``(model, vectorizer)``.
    """
    # Pair each path with its error message so both checks share one code path;
    # the model is checked first, so only the first missing file is reported.
    required_files = (
        (model_path, f"[ERROR] Model file not found: {model_path}"),
        (vectorizer_path, f"[ERROR] Vectorizer file not found: {vectorizer_path}"),
    )
    for path, error_message in required_files:
        if not os.path.exists(path):
            print(error_message, file=sys.stderr)
            return None, None

    print(f"[INFO] Loading model from: {model_path}")
    loaded_model = joblib.load(model_path)

    print(f"[INFO] Loading vectorizer from: {vectorizer_path}")
    loaded_vectorizer = joblib.load(vectorizer_path)

    return loaded_model, loaded_vectorizer


def load_labeled_data(
    csv_path: str,
    text_col: str,
    label_col: str,
) -> pd.DataFrame:
    """
    Load labeled evaluation data and normalise it for evaluation.

    Parameters:
    - csv_path: path of the labeled CSV file.
    - text_col: name of the column holding the text (project default is
      'text_clean', which is expected to be preprocessed exactly as during
      training).
    - label_col: name of the column holding the label (project default is
      'label_encoded', where 0 = FAKTA and 1 = HOAX).

    Returns:
    A DataFrame with exactly two columns, 'text' and 'true_label', and a fresh
    0..n-1 index. 'true_label' always has an integer dtype and contains only
    0 or 1. On any failure (missing file, missing column, no usable rows) an
    error is printed to stderr and an empty DataFrame is returned.
    """
    if not os.path.exists(csv_path):
        print(f"[ERROR] Labeled CSV not found: {csv_path}", file=sys.stderr)
        return pd.DataFrame()

    print(f"[INFO] Loading labeled data from: {csv_path}")
    df = pd.read_csv(csv_path)

    if text_col not in df.columns:
        print(f"[ERROR] Text column '{text_col}' not found. Available: {list(df.columns)}", file=sys.stderr)
        return pd.DataFrame()

    if label_col not in df.columns:
        print(f"[ERROR] Label column '{label_col}' not found. Available: {list(df.columns)}", file=sys.stderr)
        return pd.DataFrame()

    df = df[[text_col, label_col]].dropna()
    if df.empty:
        print("[ERROR] No valid rows after dropping NA in text/label.", file=sys.stderr)
        return pd.DataFrame()

    df = df.rename(columns={text_col: "text", label_col: "true_label"})

    # Coerce labels to numbers before filtering. A column that contains blanks
    # is read as float, and one that mixes "0"/"1" with junk text is read as
    # strings; without coercion the latter would reject every row.
    before = len(df)
    numeric_labels = pd.to_numeric(df["true_label"], errors="coerce")
    is_valid = numeric_labels.isin([0, 1])
    df = df.loc[is_valid].copy()
    after = len(df)
    if after == 0:
        print("[ERROR] No rows with valid labels (0/1) after filtering.", file=sys.stderr)
        return pd.DataFrame()
    if after < before:
        print(f"[INFO] Filtered out {before - after} rows with invalid label values.")

    # Store labels as integers so they have the same type as the keys of the
    # label-name mapping and as integer class predictions.
    df["true_label"] = numeric_labels.loc[is_valid].astype(int)

    return df.reset_index(drop=True)

## Evaluation Logic

In [None]:
def evaluate_model_on_labeled(
    df: pd.DataFrame,
    model,
    vectorizer,
    max_examples_to_show: int = 5,
):
    """
    Evaluate model on already-preprocessed texts and print a report.

    Parameters:
    - df: frame with a 'text' column and a 'true_label' column (0 or 1).
    - model: fitted classifier providing `predict` and `predict_proba`.
    - vectorizer: fitted vectorizer providing `transform`.
    - max_examples_to_show: maximum number of misclassified rows printed for
      each error type.

    Returns:
    A copy of `df` with the added columns 'pred_label', 'pred_str', 'true_str'
    and 'confidence' (the highest class probability of each row). The frame
    passed in is left unmodified. If `df` has no rows, an error is printed to
    stderr and an unmodified copy is returned.

    IMPORTANT:
    - We DO NOT re-clean or restem here because `text_clean` in your dataset
      is assumed to already match what the vectorizer was trained on.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()
    if df.empty:
        print("[ERROR] No rows to evaluate.", file=sys.stderr)
        return df

    print("[INFO] Using pre-cleaned text from dataset (no extra preprocessing).")

    # Vectorize using loaded TF-IDF (MUST use transform, not fit_transform)
    print("[INFO] Vectorizing texts with existing TF-IDF...")
    X = vectorizer.transform(df["text"])

    # Predictions
    print("[INFO] Running predictions...")
    probs = model.predict_proba(X)
    preds = model.predict(X)
    confidences = probs.max(axis=1)

    df["pred_label"] = preds
    df["pred_str"] = df["pred_label"].map(INT_TO_STRING_LABEL)
    df["true_str"] = df["true_label"].map(INT_TO_STRING_LABEL)
    df["confidence"] = confidences

    # Pin the class order explicitly. Without it, data in which only one class
    # occurs makes classification_report fail (two target names, one class)
    # and shrinks the confusion matrix below the 2x2 layout announced below.
    class_ids = [0, 1]

    # --- Metrics ---
    print("\n===== CORE METRICS ====")
    acc = accuracy_score(df["true_label"], df["pred_label"])
    print(f"Accuracy: {acc:.4f}")

    print("\nClassification report (macro/micro F1, per-class metrics):")
    print(
        classification_report(
            df["true_label"],
            df["pred_label"],
            labels=class_ids,
            target_names=["FAKTA(0)", "HOAX(1)"],
            digits=4,
            # Report 0 without a warning for a class that has no samples
            # or no predictions.
            zero_division=0,
        )
    )

    print("Confusion matrix [[TN, FP], [FN, TP]]:")
    print(confusion_matrix(df["true_label"], df["pred_label"], labels=class_ids))

    # --- Error buckets ---
    fp = df[(df["true_label"] == 0) & (df["pred_label"] == 1)]
    fn = df[(df["true_label"] == 1) & (df["pred_label"] == 0)]

    print(f"\nTotal examples: {len(df)}")
    print(f"False Positives (FAKTA→HOAX): {len(fp)}")
    print(f"False Negatives (HOAX→FAKTA): {len(fn)}")

    # Show high-confidence mistakes for qualitative analysis
    def show_examples(sub_df, title: str):
        """Print the most confident rows of `sub_df`, truncating long texts."""
        if sub_df.empty:
            print(f"\nNo {title} examples.")
            return
        print(f"\n===== {title} (up to {max_examples_to_show}) ====")
        sub_df_sorted = sub_df.sort_values("confidence", ascending=False).head(max_examples_to_show)
        for _, row in sub_df_sorted.iterrows():
            # Flatten newlines so every example stays on a single output line
            snippet = str(row["text"]).replace("\n", " ")
            if len(snippet) > 200:
                snippet = snippet[:200] + "..."
            print(
                f"- true={row['true_str']}, pred={row['pred_str']}, "
                f"conf={row['confidence']:.3f} :: {snippet}"
            )

    show_examples(fp, "High-confidence False Positives")
    show_examples(fn, "High-confidence False Negatives")

    print("\n[INFO] Evaluation complete.")
    return df

## Jupyter-Friendly Evaluation Function

In [None]:
def run_evaluation(
    data_path: str = DATA_PATH,
    text_col: str = TEXT_COL,
    label_col: str = LABEL_COL,
    model_path: str = MODEL_PATH,
    vectorizer_path: str = VECTORIZER_PATH,
    max_show: int = MAX_SHOW,
):
    """
    Notebook entry point: load the artifacts and the data, then evaluate.

    Every argument defaults to the matching value from the configuration
    cell, so the function can be called without arguments:

        eval_df = run_evaluation()

    Returns the DataFrame produced by `evaluate_model_on_labeled`, or an empty
    DataFrame when the model/vectorizer or the labeled data could not be
    loaded.
    """
    model, vectorizer = load_model_and_vectorizer(model_path, vectorizer_path)
    artifacts_loaded = model is not None and vectorizer is not None
    if not artifacts_loaded:
        return pd.DataFrame()

    labeled = load_labeled_data(
        csv_path=data_path,
        text_col=text_col,
        label_col=label_col,
    )
    if labeled.empty:
        return pd.DataFrame()

    results = evaluate_model_on_labeled(
        df=labeled,
        model=model,
        vectorizer=vectorizer,
        max_examples_to_show=max_show,
    )
    return results

## Run Evaluation

Execute this cell to evaluate the model. The results are printed below, and `eval_df` holds the evaluation DataFrame for further analysis (it is empty if the evaluation could not run).

In [None]:
# Evaluate with the configured defaults; an empty frame signals a failure
eval_df = run_evaluation()

if eval_df.empty:
    print("Evaluation failed. Check paths and files.")
else:
    # Preview the first few rows of the results
    print("\nEvaluation DataFrame preview:")
    print(eval_df.head())