# **Fake News Detection - Method 1 (Testing)**

Ignore warnings

In [1]:
import warnings

warnings.filterwarnings("ignore")

Feature extraction constants

In [2]:
# Grid of feature-extraction settings; every combination is looked up on disk
# and evaluated further down in the notebook.

# Vectorization type
VECT_TYPES = ["bow", "tfidf"]

# Minimum appearance value (the `min_df` part of the feature file names)
MIN_DFS = [30, 40]

# Reduction type
REDUCTIONS = ["svd", "chi2"]

# Maximum feature size for the "svd" reduction
SVD_COMPONENTS = [300, 500]

# Maximum feature size for the "chi2" reduction
CHI2_K_VALS = [500, 700]

Functions for transforming the test data and evaluating the saved models

In [3]:
from pathlib import Path
from typing import Any, Dict, cast

import joblib
from scipy import sparse
from sklearn.metrics import classification_report


def feature_path(ds: str, vtype: str, min_df: int, red: str, param: int) -> Path:
    """Return the location of the saved feature object for one configuration.

    Args:
        ds: Dataset name; used verbatim as the sub-directory of ``data/features``.
        vtype: Vectorization type tag (first part of the file name).
        min_df: ``min_df`` value encoded in the file name.
        red: Reduction type tag (e.g. ``"svd"`` or ``"chi2"``).
        param: Reduction parameter appended directly after ``red``.

    Returns:
        ``data/features/<ds>/<vtype>_min<min_df>_<red><param>.joblib`` as a Path.
        The path is only constructed; its existence is not checked.
    """
    filename = f"{vtype}_min{min_df}_{red}{param}.joblib"
    return Path("data/features", ds, filename)

def transform_text(feat_obj: dict, red: str, texts):
    """Apply vectorization and the selected reduction to the test texts.

    Args:
        feat_obj: Mapping that holds the vectorizer under ``"vect"`` and the
            reducer under the key named by ``red``. Both only need to expose
            a ``transform`` method (presumably the fitted objects saved
            during training -- confirm against the training notebook).
        red: Reduction type, either ``"svd"`` or ``"chi2"``.
        texts: Collection of documents handed to the vectorizer's ``transform``.

    Returns:
        Whatever the chosen reducer's ``transform`` returns for the
        vectorized texts.

    Raises:
        ValueError: If ``red`` is not a supported reduction type. Previously
            the function fell through and returned ``None`` silently, which
            only surfaced later as a confusing error at prediction time.
    """
    supported = ("svd", "chi2")
    if red not in supported:
        raise ValueError(
            f"Unknown reduction type {red!r}; expected one of {supported}"
        )

    X_vec = feat_obj["vect"].transform(texts)
    # The reducer is stored under the same key as the reduction name.
    return feat_obj[red].transform(X_vec)

def evaluate_combination(ds, vtype, min_df, red, param, model_key, y_true, X_test):
    """Score one saved model on already-transformed test features.

    The model file is looked up under ``saved_models/method1`` using a name
    built from the dataset (lower-cased), the feature settings and
    ``model_key``. A one-line summary with the macro F1 score is printed.

    Args:
        ds: Dataset name.
        vtype: Vectorization type tag.
        min_df: ``min_df`` value of the feature configuration.
        red: Reduction type tag.
        param: Reduction parameter.
        model_key: Model identifier used in the file name (``"gbm"`` gets
            dense float32 input, see below).
        y_true: Ground-truth labels for the test samples.
        X_test: Test feature matrix matching the feature configuration.

    Returns:
        A dict with the configuration and the macro F1, accuracy, macro
        precision and macro recall, or ``None`` when the model file is
        missing (a "Missing model" line is printed in that case).
    """
    ftr_tag = f"{ds.lower()}_{vtype}_min{min_df}_{red}{param}"
    mfile = Path("saved_models/method1") / f"{ftr_tag}_{model_key}.joblib"

    if mfile.exists():
        clf = joblib.load(mfile)
    else:
        print(f"Missing model → {mfile.name}")
        return None

    # The GBM model is fed a dense float32 array whenever the features are sparse
    needs_dense = model_key == "gbm" and sparse.issparse(X_test)
    features = X_test.toarray().astype("float32") if needs_dense else X_test

    predictions = clf.predict(features)

    # output_dict=True yields a nested dict instead of the formatted text report
    raw_report = classification_report(
        y_true, predictions, digits=4, output_dict=True, zero_division=0
    )
    report = cast(Dict[str, Any], raw_report)

    print(f"\n[{ds} | {vtype} min_df={min_df} | {red}{param} | {model_key.upper()}] — F1-macro: {report['macro avg']['f1-score']:.4f}")

    macro_avg = report["macro avg"]
    outcome = {
        "dataset": ds,
        "vectorizer": vtype,
        "min_df": min_df,
        "reduction": f"{red}{param}",
        "model": model_key,
    }
    outcome["f1_macro"] = macro_avg["f1-score"]
    outcome["accuracy"] = report["accuracy"]
    outcome["precision_macro"] = macro_avg["precision"]
    outcome["recall_macro"] = macro_avg["recall"]
    return outcome

Calculate each model's performance

In [4]:
import joblib


results = []  # one metrics dict per evaluated combination whose model file exists

# Reduction type -> parameter values to evaluate. Iterating REDUCTIONS
# ("svd" first, then "chi2") keeps the evaluation order of the two formerly
# separate, copy-pasted loops.
REDUCTION_PARAMS = {"svd": SVD_COMPONENTS, "chi2": CHI2_K_VALS}
MODEL_KEYS = ["gbm", "svm"]

for ds in ["ISOT", "LIAR"]:
    # Load the test data of this dataset.
    # NOTE(review): joblib.load unpickles the file, so only load artefacts
    # produced by this project.
    test_df = joblib.load(Path("data/processed") / ds / f"{ds.lower()}_test.pkl")
    y_test = test_df["label"].values
    texts = test_df["cleaned"].values

    for vtype in VECT_TYPES:
        for min_df in MIN_DFS:
            for red in REDUCTIONS:
                for param in REDUCTION_PARAMS[red]:
                    # Transform the texts once per feature configuration and
                    # reuse the matrix for every model type.
                    feat_obj = joblib.load(feature_path(ds, vtype, min_df, red, param))
                    X_test_red = transform_text(feat_obj, red, texts)

                    for algo in MODEL_KEYS:
                        res = evaluate_combination(
                            ds, vtype, min_df, red, param, algo, y_test, X_test_red
                        )
                        # None means the model file was missing; skip it
                        if res is not None:
                            results.append(res)


[ISOT | bow min_df=30 | svd300 | GBM] — F1-macro: 0.9452

[ISOT | bow min_df=30 | svd300 | SVM] — F1-macro: 0.9856

[ISOT | bow min_df=30 | svd500 | GBM] — F1-macro: 0.9461

[ISOT | bow min_df=30 | svd500 | SVM] — F1-macro: 0.9927

[ISOT | bow min_df=30 | chi2500 | GBM] — F1-macro: 0.9960

[ISOT | bow min_df=30 | chi2500 | SVM] — F1-macro: 0.9955

[ISOT | bow min_df=30 | chi2700 | GBM] — F1-macro: 0.9962

[ISOT | bow min_df=30 | chi2700 | SVM] — F1-macro: 0.9960

[ISOT | bow min_df=40 | svd300 | GBM] — F1-macro: 0.9493

[ISOT | bow min_df=40 | svd300 | SVM] — F1-macro: 0.9861

[ISOT | bow min_df=40 | svd500 | GBM] — F1-macro: 0.9484

[ISOT | bow min_df=40 | svd500 | SVM] — F1-macro: 0.9938

[ISOT | bow min_df=40 | chi2500 | GBM] — F1-macro: 0.9960

[ISOT | bow min_df=40 | chi2500 | SVM] — F1-macro: 0.9955

[ISOT | bow min_df=40 | chi2700 | GBM] — F1-macro: 0.9962

[ISOT | bow min_df=40 | chi2700 | SVM] — F1-macro: 0.9960

[ISOT | tfidf min_df=30 | svd300 | GBM] — F1-macro: 0.9566

[IS

Display the top results

In [5]:
import pandas as pd


# Rank all collected results: datasets in ascending order and, within each
# dataset, the highest macro F1 first.
results_df = pd.DataFrame(results)
summary = results_df.sort_values(
    by=["dataset", "f1_macro"],
    ascending=[True, False],
).reset_index(drop=True)

# Show the ten best-scoring rows of every dataset
top_per_dataset = summary.groupby("dataset").head(10)
display(top_per_dataset)

Unnamed: 0,dataset,vectorizer,min_df,reduction,model,f1_macro,accuracy,precision_macro,recall_macro
0,ISOT,tfidf,40,chi2700,gbm,0.996652,0.996659,0.996623,0.996683
1,ISOT,tfidf,30,chi2500,gbm,0.996541,0.996548,0.996506,0.996576
2,ISOT,tfidf,40,chi2500,gbm,0.996429,0.996437,0.996409,0.996449
3,ISOT,bow,30,chi2700,gbm,0.996206,0.996214,0.996167,0.996247
4,ISOT,bow,40,chi2700,gbm,0.996206,0.996214,0.996167,0.996247
5,ISOT,tfidf,30,chi2700,gbm,0.996206,0.996214,0.996186,0.996226
6,ISOT,bow,30,chi2500,gbm,0.995983,0.995991,0.995935,0.996034
7,ISOT,bow,40,chi2500,gbm,0.995983,0.995991,0.995935,0.996034
8,ISOT,bow,30,chi2700,svm,0.995982,0.995991,0.996025,0.995942
9,ISOT,bow,40,chi2700,svm,0.995982,0.995991,0.996025,0.995942
