# **Fake News Detection - Method 1 (Testing)**

Ignore warnings

In [8]:
import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" may also hide warnings emitted while
# loading saved models or transformers — confirm this is intended.
warnings.filterwarnings("ignore")

Feature extraction constants

In [9]:
# Grid of feature-extraction settings evaluated below. Each value is embedded
# in the feature and model filenames, so it must match the names on disk.
VECT_TYPES = ["bow", "tfidf"] # vectorizer types
MIN_DFS = [30, 40] # min_df values (presumably the vectorizer's minimum document frequency — confirm)
REDUCTIONS = ["svd", "chi2"] # reduction types (NOTE(review): not referenced in the visible cells)
SVD_COMPONENTS = [300, 500] # parameter values tried for the "svd" reduction
CHI2_K_VALS = [500, 700] # parameter values tried for the "chi2" reduction

Functions for transforming the test data and evaluating the models (GBM and SVM)

In [10]:
from pathlib import Path
from typing import Any, Dict, cast

import joblib
from scipy import sparse
from sklearn.metrics import classification_report


# Builds the file path of a saved feature-extraction object
def feature_path(ds: str, vtype: str, min_df: int, red: str, param: int) -> Path:
    """Return the location of the saved feature object for one configuration.

    Args:
        ds: Dataset directory name under ``data/features``.
        vtype: Vectorizer type used in the filename.
        min_df: ``min_df`` value used in the filename.
        red: Reduction type used in the filename.
        param: Reduction parameter appended directly after ``red``.

    Returns:
        ``data/features/<ds>/<vtype>_min<min_df>_<red><param>.joblib``.
    """
    filename = f"{vtype}_min{min_df}_{red}{param}.joblib"
    features_root = Path("data/features")
    return features_root / ds / filename

# Applies vectorization and reduction to the test data
def transform_text(feat_obj: dict, red: str, texts):
    """Vectorize ``texts`` and apply the selected fitted reducer.

    Args:
        feat_obj: Mapping of fitted transformers. Must contain ``"vect"`` and
            the entry named by ``red``; each needs a ``transform`` method.
        red: Reduction type, either ``"svd"`` or ``"chi2"``.
        texts: Documents handed to the vectorizer's ``transform``.

    Returns:
        The output of the selected reducer's ``transform``.

    Raises:
        ValueError: If ``red`` is not a supported reduction type.
        KeyError: If ``feat_obj`` lacks ``"vect"`` or the ``red`` entry.
    """
    # Reject unknown reduction types explicitly instead of silently returning
    # None, which would only surface later as an obscure error in predict().
    if red not in ("svd", "chi2"):
        raise ValueError(
            f"Unsupported reduction type: {red!r} (expected 'svd' or 'chi2')"
        )

    X_vec = feat_obj["vect"].transform(texts)
    # The reducer is stored under the same key as the reduction type.
    return feat_obj[red].transform(X_vec)

# Loads a saved classifier, predicts on the test features and reports metrics
def evaluate_combination(ds, vtype, min_df, red, param, model_key, y_true, X_test):
    """Score one saved classifier on already-transformed test features.

    The model is read from
    ``saved_models/method1/<ds lower>_<vtype>_min<min_df>_<red><param>_<model_key>.joblib``.

    Args:
        ds: Dataset name; lower-cased for the filename, reported as given.
        vtype: Vectorizer type.
        min_df: ``min_df`` value of the feature configuration.
        red: Reduction type.
        param: Reduction parameter.
        model_key: Model identifier used as the filename suffix (e.g. ``"gbm"``).
        y_true: Ground-truth labels.
        X_test: Test feature matrix, dense or scipy-sparse.

    Returns:
        A dict of macro-averaged metrics plus accuracy, or ``None`` when the
        model file does not exist (a message is printed in that case).
    """
    combo_name = f"{ds.lower()}_{vtype}_min{min_df}_{red}{param}"
    model_file = Path("saved_models/method1") / f"{combo_name}_{model_key}.joblib"

    if model_file.exists():
        clf = joblib.load(model_file)
    else:
        print(f"Missing model: {model_file.name}")
        return None

    # Only the GBM path densifies sparse input; other models get X_test as is
    needs_dense = model_key == "gbm" and sparse.issparse(X_test)
    features = X_test.toarray().astype("float32") if needs_dense else X_test

    predictions = clf.predict(features)

    raw_report = classification_report(
        y_true,
        predictions,
        digits=4,
        output_dict=True,
        zero_division=0,
    )
    report = cast(Dict[str, Any], raw_report)
    macro = report["macro avg"]

    print(f"\n[{ds} | {vtype} min_df={min_df} | {red}={param} | {model_key.upper()}] — F1-macro: {macro['f1-score']:.4f}")

    # Key order is preserved because it determines the summary column order
    metrics = {
        "dataset": ds,
        "vectorizer": vtype,
        "min_df": min_df,
        "reduction": f"{red}_{param}",
        "model": model_key,
        "f1_macro": macro["f1-score"],
        "accuracy": report["accuracy"],
        "precision_macro": macro["precision"],
        "recall_macro": macro["recall"],
    }
    return metrics

Calculate the performance of each GBM and SVM model

In [11]:
import joblib


results = []  # one metrics dict per evaluated combination

# Reduction type paired with its parameter grid. SVD variants are evaluated
# before chi-squared variants for every (vectorizer, min_df) setting.
reduction_grid = [("svd", SVD_COMPONENTS), ("chi2", CHI2_K_VALS)]

for ds in ["ISOT", "LIAR"]:
    # Load the test split of the current dataset
    test_df = joblib.load(Path("data/processed") / ds / f"{ds.lower()}_test.pkl")
    texts = test_df["cleaned"].values
    y_test = test_df["label"].values

    for vtype in VECT_TYPES:
        for min_df in MIN_DFS:
            for red, params in reduction_grid:
                for param in params:
                    # Transform once per feature configuration and reuse the
                    # matrix for both classifiers
                    feat_obj = joblib.load(feature_path(ds, vtype, min_df, red, param))
                    X_test_red = transform_text(feat_obj, red, texts)

                    for algo in ["gbm", "svm"]:
                        res = evaluate_combination(
                            ds, vtype, min_df, red, param, algo, y_test, X_test_red
                        )
                        # None means the model file was missing; skip it
                        if res is not None:
                            results.append(res)


[ISOT | bow min_df=30 | svd=300 | GBM] — F1-macro: 0.9452

[ISOT | bow min_df=30 | svd=300 | SVM] — F1-macro: 0.9856

[ISOT | bow min_df=30 | svd=500 | GBM] — F1-macro: 0.9461

[ISOT | bow min_df=30 | svd=500 | SVM] — F1-macro: 0.9927

[ISOT | bow min_df=30 | chi2=500 | GBM] — F1-macro: 0.9960

[ISOT | bow min_df=30 | chi2=500 | SVM] — F1-macro: 0.9955

[ISOT | bow min_df=30 | chi2=700 | GBM] — F1-macro: 0.9962

[ISOT | bow min_df=30 | chi2=700 | SVM] — F1-macro: 0.9960

[ISOT | bow min_df=40 | svd=300 | GBM] — F1-macro: 0.9493

[ISOT | bow min_df=40 | svd=300 | SVM] — F1-macro: 0.9861

[ISOT | bow min_df=40 | svd=500 | GBM] — F1-macro: 0.9484

[ISOT | bow min_df=40 | svd=500 | SVM] — F1-macro: 0.9938

[ISOT | bow min_df=40 | chi2=500 | GBM] — F1-macro: 0.9960

[ISOT | bow min_df=40 | chi2=500 | SVM] — F1-macro: 0.9955

[ISOT | bow min_df=40 | chi2=700 | GBM] — F1-macro: 0.9962

[ISOT | bow min_df=40 | chi2=700 | SVM] — F1-macro: 0.9960

[ISOT | tfidf min_df=30 | svd=300 | GBM] — F1-m

Function for predicting with the MLP model

In [12]:
from tensorflow.keras import models  # type: ignore


def predict_with_mlp(model_path, X, threshold=0.5, batch_size=512):
    """Load a saved Keras model and return binary class predictions for ``X``.

    Args:
        model_path: Path of the saved model passed to ``models.load_model``.
        X: Feature matrix, dense (with an ``astype`` method) or scipy-sparse.
        threshold: Score at or above which a sample is labelled 1.
            Defaults to 0.5.
        batch_size: Batch size forwarded to ``predict``. Defaults to 512.

    Returns:
        A flat integer array of 0/1 predictions.
    """
    # Loaded without compiling: only predict() is called on the model here
    mdl = models.load_model(model_path, compile=False)

    # Densify sparse input, then cast to float32 in either case
    dense = X.toarray() if sparse.issparse(X) else X
    dense = dense.astype("float32")

    # Flatten the model output into a single vector of scores
    # NOTE(review): assumes the model emits one score per sample — confirm
    probs = mdl.predict(dense, batch_size=batch_size, verbose=0).ravel()
    return (probs >= threshold).astype(int)

Calculate the MLP model's performance

In [13]:
# Drop MLP rows left over from an earlier run of this cell so that re-running
# it does not append duplicate entries to `results`.
results[:] = [row for row in results if row["model"] != "mlp"]

# Reduction type paired with its parameter grid. SVD combinations are
# evaluated before chi-squared ones for every (vectorizer, min_df) setting.
reduction_grid = [("svd", SVD_COMPONENTS), ("chi2", CHI2_K_VALS)]

for ds in ["ISOT", "LIAR"]:
    # Load the test split of the current dataset
    test_df = joblib.load(Path("data/processed") / ds / f"{ds.lower()}_test.pkl")
    texts = test_df["cleaned"].values
    y_test = test_df["label"].values

    for vtype in VECT_TYPES:
        for min_df in MIN_DFS:
            for red, params in reduction_grid:
                for param in params:
                    ftr_tag = f"{ds.lower()}_{vtype}_min{min_df}_{red}{param}"
                    model_path = Path("saved_models/method1") / f"{ftr_tag}_mlp.keras"
                    # Check for the model before loading and transforming features
                    if not model_path.exists():
                        print(f"Missing model: {model_path.name}")
                        continue

                    feat_obj = joblib.load(feature_path(ds, vtype, min_df, red, param))
                    X_test = transform_text(feat_obj, red, texts)
                    y_pred = predict_with_mlp(model_path, X_test)

                    report = cast(
                        Dict[str, Any],
                        classification_report(
                            y_test, y_pred, digits=4,
                            output_dict=True, zero_division=0
                        )
                    )
                    macro = report["macro avg"]

                    print(f"\n[{ds} | {vtype} min_df={min_df} | {red}={param} | MLP] — F1-macro: {macro['f1-score']:.4f}")

                    # Same keys and order as the GBM/SVM result rows
                    results.append({
                        "dataset": ds,
                        "vectorizer": vtype,
                        "min_df": min_df,
                        "reduction": f"{red}_{param}",
                        "model": "mlp",
                        "f1_macro": macro["f1-score"],
                        "accuracy": report["accuracy"],
                        "precision_macro": macro["precision"],
                        "recall_macro": macro["recall"],
                    })


[ISOT | bow min_df=30 | svd=300 | MLP] — F1-macro: 0.9896

[ISOT | bow min_df=30 | svd=500 | MLP] — F1-macro: 0.9943

[ISOT | bow min_df=30 | chi2=500 | MLP] — F1-macro: 0.9948

[ISOT | bow min_df=30 | chi2=700 | MLP] — F1-macro: 0.9968

[ISOT | bow min_df=40 | svd=300 | MLP] — F1-macro: 0.9914

[ISOT | bow min_df=40 | svd=500 | MLP] — F1-macro: 0.9945

[ISOT | bow min_df=40 | chi2=500 | MLP] — F1-macro: 0.9953

[ISOT | bow min_df=40 | chi2=700 | MLP] — F1-macro: 0.9945

[ISOT | tfidf min_df=30 | svd=300 | MLP] — F1-macro: 0.9882

[ISOT | tfidf min_df=30 | svd=500 | MLP] — F1-macro: 0.9907

[ISOT | tfidf min_df=30 | chi2=500 | MLP] — F1-macro: 0.9963

[ISOT | tfidf min_df=30 | chi2=700 | MLP] — F1-macro: 0.9946

[ISOT | tfidf min_df=40 | svd=300 | MLP] — F1-macro: 0.9879

[ISOT | tfidf min_df=40 | svd=500 | MLP] — F1-macro: 0.9904

[ISOT | tfidf min_df=40 | chi2=500 | MLP] — F1-macro: 0.9955

[ISOT | tfidf min_df=40 | chi2=700 | MLP] — F1-macro: 0.9938

[LIAR | bow min_df=30 | svd=300

Display top results

In [14]:
import pandas as pd


# Rank results: datasets in alphabetical order, best macro-F1 first within each
summary = (
    pd.DataFrame(results)
    .sort_values(by=["dataset", "f1_macro"], ascending=[True, False])
    .reset_index(drop=True)
)

# Show the ten highest-scoring rows of every dataset
top_per_dataset = summary.groupby("dataset").head(10)
display(top_per_dataset)

Unnamed: 0,dataset,vectorizer,min_df,reduction,model,f1_macro,accuracy,precision_macro,recall_macro
0,ISOT,bow,30,chi2_700,mlp,0.996764,0.996771,0.99673,0.9968
1,ISOT,tfidf,40,chi2_700,gbm,0.996652,0.996659,0.996623,0.996683
2,ISOT,tfidf,30,chi2_500,gbm,0.996541,0.996548,0.996506,0.996576
3,ISOT,tfidf,40,chi2_500,gbm,0.996429,0.996437,0.996409,0.996449
4,ISOT,tfidf,30,chi2_500,mlp,0.996317,0.996325,0.996387,0.996251
5,ISOT,bow,30,chi2_700,gbm,0.996206,0.996214,0.996167,0.996247
6,ISOT,bow,40,chi2_700,gbm,0.996206,0.996214,0.996167,0.996247
7,ISOT,tfidf,30,chi2_700,gbm,0.996206,0.996214,0.996186,0.996226
8,ISOT,bow,30,chi2_500,gbm,0.995983,0.995991,0.995935,0.996034
9,ISOT,bow,40,chi2_500,gbm,0.995983,0.995991,0.995935,0.996034
