# **Fake News Detection - Method 1 (Training)**

Imported libraries and loaded functions

In [None]:
import re
import string
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import spacy
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import make_scorer, f1_score
from pathlib import Path
import joblib

# Compiled once up front: a URL matcher and a translation table that deletes ASCII punctuation
URL_PATTERN = re.compile(r"http\S+|www\S+")
PUNCT_MAP = str.maketrans("", "", string.punctuation)

# Fetch the NLTK resources used by this notebook
for nltk_resource in ("stopwords", "punkt"):
    nltk.download(nltk_resource)

# English stop words, held in a set for constant-time membership checks
STOP_WORDS = set(stopwords.words("english"))

# spaCy English pipeline, loaded without the parser and NER components
nlp = spacy.load("en_core_web_sm", exclude=["parser","ner"])

# Register tqdm with pandas so that Series.progress_apply is available
tqdm.pandas()

Wrote a function to clean a given text
- Lowercase
- Strip URLs
- Remove punctuation
- Tokenize
- Lemmatize
- Remove stop words

In [None]:
def clean_text(text: str) -> str:
    """Normalize a raw document into a space-separated string of lemmas.

    The text is lowercased, URLs are replaced by a space, ASCII punctuation
    is deleted, and the result is run through the spaCy pipeline. The lemma
    of every alphabetic token is kept unless the token's surface form is in
    ``STOP_WORDS``.

    Args:
        text: Raw document text. Non-string values (``None``, or the float
            ``NaN`` pandas uses for missing cells) are treated as empty.

    Returns:
        The cleaned text. An empty string is returned when no token survives
        the filtering or when ``text`` is not a string.
    """
    # Missing cells arrive as None / float NaN, which have no .lower()
    if not isinstance(text, str):
        return ""

    # Lowercase, remove urls and punctuation
    txt = URL_PATTERN.sub(" ", text.lower()).translate(PUNCT_MAP)

    # Tokenize and lemmatize
    doc = nlp(txt)

    # Filter out stop words and non alpha tokens
    # (the stop-word check uses the token text, the output uses the lemma)
    return " ".join(
        tok.lemma_
        for tok in doc
        if tok.is_alpha and tok.text not in STOP_WORDS
    )

# Demonstrate the cleaner on a sentence containing a URL, punctuation and mixed case
example = "Here's an Example: https://example.com Running, RUNNERS and ran!"
print("Input: ", example)
example_cleaned = clean_text(example)
print("Output: ", example_cleaned)

Loaded, cleaned and split the ISOT dataset

In [None]:
isot_train_df = None

# Cache locations of the processed train/test splits
isot_processed_dir = Path("data/processed/ISOT")
isot_train_filename = isot_processed_dir / "isot_train.pkl"
isot_test_filename = isot_processed_dir / "isot_test.pkl"

# If processed data is present, load; else process
if isot_train_filename.exists():
    # Load train data
    # (joblib unpickles the file, so only load caches written by this notebook)
    isot_train_df = joblib.load(isot_train_filename)

    # Display count of rows
    print(f"ISOT processed count: {isot_train_df.shape[0]:,} rows")

    # Display first few rows
    display(isot_train_df.head())
else:
    # Load dataset
    isot_fake = pd.read_csv("data/raw/ISOT/Fake.csv")
    isot_true = pd.read_csv("data/raw/ISOT/True.csv")

    # Labels
    isot_fake["label"] = 0 # fake
    isot_true["label"] = 1 # true

    # Concatenate true and fake data, keeping only the text and label columns
    isot_df = pd.concat(
        [isot_fake[["text", "label"]], isot_true[["text", "label"]]],
        ignore_index=True,
    ).rename(columns={"text": "content"})

    # Display count of rows
    print(f"ISOT raw count: {isot_df.shape[0]:,} rows")

    # Clean dataset
    isot_df["cleaned"] = isot_df["content"].progress_apply(clean_text)

    # Display first few rows
    display(isot_df.head())

    # Split cleaned dataset into stratified train and test sets
    isot_train_df, isot_test_df = train_test_split(
        isot_df, test_size=0.2, stratify=isot_df["label"], random_state=42
    )

    # Save datasets; create the target folder first so the dump cannot fail
    # on a fresh checkout after the expensive cleaning step has already run
    isot_processed_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(isot_train_df, isot_train_filename)
    joblib.dump(isot_test_df, isot_test_filename)

Loaded, cleaned and split the LIAR dataset

In [None]:
liar_train_df = None

# Cache locations of the processed train/test splits
liar_processed_dir = Path("data/processed/LIAR")
liar_train_filename = liar_processed_dir / "liar_train.pkl"
liar_test_filename = liar_processed_dir / "liar_test.pkl"

# If processed data is present, load; else process
if liar_train_filename.exists():
    # Load train data
    # (joblib unpickles the file, so only load caches written by this notebook)
    liar_train_df = joblib.load(liar_train_filename)

    # Display count of rows
    print(f"LIAR processed count: {liar_train_df.shape[0]:,} rows")

    # Display first few rows
    display(liar_train_df.head())
else:
    # Columns of dataset
    liar_cols = [
        "id_json","label","statement","subject","speaker","speaker_job",
        "state_info","party_affil","barely_true","false","half_true",
        "mostly_true","pants_fire","context"
    ]

    # Load dataset (the TSV files carry no header row)
    # NOTE(review): statements may contain double-quote characters; confirm the
    # row count printed below matches the raw files, otherwise consider passing
    # quoting=csv.QUOTE_NONE to read_csv.
    liar_train = pd.read_csv("data/raw/LIAR/train.tsv", sep="\t", header=None, names=liar_cols)
    liar_valid = pd.read_csv("data/raw/LIAR/valid.tsv", sep="\t", header=None, names=liar_cols)
    liar_test  = pd.read_csv("data/raw/LIAR/test.tsv",  sep="\t", header=None, names=liar_cols)

    # Concatenate train, valid and test data; a fresh split is made below
    liar_df = pd.concat([liar_train, liar_valid, liar_test], ignore_index=True)

    # Group the six labels into a binary classification
    liar_df["label"] = liar_df["label"].map({
        "pants-fire":0, "false":0, "barely-true":0, # fake
        "half-true":1,  "mostly-true":1,  "true":1  # true
    })
    liar_df = liar_df[["statement","label"]].rename(columns={"statement":"content"})

    # Display count of rows
    print(f"LIAR raw count: {liar_df.shape[0]:,} rows")

    # Clean dataset
    liar_df["cleaned"] = liar_df["content"].progress_apply(clean_text)

    # Display first few rows
    display(liar_df.head())

    # Split cleaned dataset into stratified train and test sets
    liar_train_df, liar_test_df = train_test_split(
        liar_df, test_size=0.2, stratify=liar_df["label"], random_state=42
    )

    # Save datasets; create the target folder first so the dump cannot fail
    # on a fresh checkout after the cleaning step has already run
    liar_processed_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(liar_train_df, liar_train_filename)
    joblib.dump(liar_test_df, liar_test_filename)

Extracted features using BoW and TF-IDF for ISOT dataset

In [None]:
# Settings shared by both ISOT vectorizers: word 1- to 3-grams,
# dropping n-grams that occur in fewer than 10 documents
isot_vectorizer_options = {"ngram_range": (1, 3), "min_df": 10}

# Bag-of-words counts followed by max-abs scaling
bow_pipeline_isot = Pipeline(steps=[
    ("vect", CountVectorizer(**isot_vectorizer_options)),
    ("scale", MaxAbsScaler()),
])

# TF-IDF weights (no extra scaling step)
tfidf_pipeline_isot = Pipeline(steps=[
    ("vect", TfidfVectorizer(**isot_vectorizer_options)),
])

# Fit each pipeline on the cleaned ISOT training text and build the feature matrices
isot_cleaned_docs = isot_train_df["cleaned"]
X_isot_bow = bow_pipeline_isot.fit_transform(isot_cleaned_docs)
X_isot_tfidf = tfidf_pipeline_isot.fit_transform(isot_cleaned_docs)

# Report (number of samples, number of features) for both feature spaces
print("ISOT BoW shape: ", X_isot_bow.shape)
print("ISOT TF-IDF shape: ", X_isot_tfidf.shape)

# Peek at the first 10 vocabulary entries of each fitted vectorizer
isot_bow_vectorizer = bow_pipeline_isot.named_steps["vect"]
isot_tfidf_vectorizer = tfidf_pipeline_isot.named_steps["vect"]
feature_names_bow = isot_bow_vectorizer.get_feature_names_out()[:10]
feature_names_tfidf = isot_tfidf_vectorizer.get_feature_names_out()[:10]
print("ISOT BoW features sample: ", feature_names_bow)
print("ISOT TF-IDF features sample: ", feature_names_tfidf)

Extracted features using BoW and TF-IDF for LIAR dataset

In [None]:
# Settings shared by both LIAR vectorizers: word 1- to 3-grams,
# dropping n-grams that occur in fewer than 5 documents
liar_vectorizer_options = {"ngram_range": (1, 3), "min_df": 5}

# Bag-of-words counts followed by max-abs scaling
bow_pipeline_liar = Pipeline(steps=[
    ("vect", CountVectorizer(**liar_vectorizer_options)),
    ("scale", MaxAbsScaler()),
])

# TF-IDF weights (no extra scaling step)
tfidf_pipeline_liar = Pipeline(steps=[
    ("vect", TfidfVectorizer(**liar_vectorizer_options)),
])

# Fit each pipeline on the cleaned LIAR training text and build the feature matrices
liar_cleaned_docs = liar_train_df["cleaned"]
X_liar_bow = bow_pipeline_liar.fit_transform(liar_cleaned_docs)
X_liar_tfidf = tfidf_pipeline_liar.fit_transform(liar_cleaned_docs)

# Report (number of samples, number of features) for both feature spaces
print("LIAR BoW shape: ", X_liar_bow.shape)
print("LIAR TF-IDF shape: ", X_liar_tfidf.shape)

# Peek at the first 10 vocabulary entries of each fitted vectorizer
liar_bow_vectorizer = bow_pipeline_liar.named_steps["vect"]
liar_tfidf_vectorizer = tfidf_pipeline_liar.named_steps["vect"]
feature_names_bow = liar_bow_vectorizer.get_feature_names_out()[:10]
feature_names_tfidf = liar_tfidf_vectorizer.get_feature_names_out()[:10]
print("LIAR BoW features sample: ", feature_names_bow)
print("LIAR TF-IDF features sample: ", feature_names_tfidf)

Created models

In [None]:
# Search space for the Multilayer Perceptron
mlp_parameters = dict(
    hidden_layer_sizes=[(128,), (128, 64)],  # neurons in each hidden layer
    alpha=np.logspace(-4, -2, 3),  # L2 penalty: 1e-4, 1e-3, 1e-2
)

# Search space for the Gradient Boosting Machine
gbm_parameters = dict(
    n_estimators=[200, 400],  # number of boosting stages (trees)
    learning_rate=[0.05, 0.1],  # shrinkage applied to each tree
    max_depth=[3, 5],  # maximum depth of each tree
)

# Search space for the linear Support Vector Machine
svm_parameters = dict(
    C=np.logspace(-1, 1, 3),  # inverse regularization strength: 0.1, 1, 10
)

# Registry mapping a short model name to (unfitted estimator, search space)
models = dict(
    mlp=(MLPClassifier(max_iter=100, random_state=42), mlp_parameters),
    gbm=(GradientBoostingClassifier(random_state=42), gbm_parameters),
    svm=(LinearSVC(random_state=42), svm_parameters),
)

Wrote a function to train and save a model

In [None]:
def train_and_save(X, y, dataset: str, feature_space: str):
    """Tune every estimator in ``models`` on one feature matrix and save the best.

    For each entry of the module-level ``models`` registry a randomized
    hyperparameter search is run with stratified 5-fold cross validation,
    scored by macro-averaged F1. The best estimator found by each search is
    written to ``saved_models/method1/{dataset}_{feature_space}_{MODEL}.pkl``.

    Args:
        X: Feature matrix passed to ``RandomizedSearchCV.fit``.
        y: Target labels aligned with the rows of ``X``.
        dataset: Dataset name, used in log output and in the file name.
        feature_space: Feature-space name, used in log output and in the
            file name.

    Returns:
        None. The fitted estimators are only persisted to disk.
    """
    # Stratified 5-fold cross validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Macro F1-score as evaluation metric
    scorer = make_scorer(f1_score, average="macro")

    # Create the output folder up front: otherwise a missing directory would
    # only surface at joblib.dump, after the first model's search has finished
    output_dir = Path("saved_models/method1")
    output_dir.mkdir(parents=True, exist_ok=True)

    for model_name, (base, grid) in models.items():
        print(f"Training on '{dataset}' with features of '{feature_space}' using '{model_name.upper()}'")

        # Randomized hyperparameter search with cross validation
        search = RandomizedSearchCV(
            estimator=base,
            param_distributions=grid,
            n_iter=4, # number of hyperparameter combinations to try
            cv=cv,
            scoring=scorer,
            random_state=42,
            n_jobs=8,
            verbose=2
        )

        # Fit model and select best estimator
        search.fit(X, y)
        best = search.best_estimator_

        # Save the model with best hyperparameters
        filename = output_dir / f"{dataset}_{feature_space}_{model_name.upper()}.pkl"
        joblib.dump(best, filename)

        print()

Trained and saved models for every dataset/feature pair

In [None]:

# Every (feature matrix, source frame, dataset name, feature-space name) combination to train
training_runs = [
    (X_liar_bow, liar_train_df, "LIAR", "BoW"),
    (X_liar_tfidf, liar_train_df, "LIAR", "TF-IDF"),
    (X_isot_bow, isot_train_df, "ISOT", "BoW"),
    (X_isot_tfidf, isot_train_df, "ISOT", "TF-IDF"),
]

for run_index, (run_features, run_frame, run_dataset, run_space) in enumerate(training_runs):
    # Separator line between consecutive runs (not before the first one)
    if run_index > 0:
        print("-----")
    train_and_save(run_features, run_frame["label"].values, run_dataset, run_space)