# **Fake News Detection - Method 1 (Training)**

Load assets

In [1]:
import re
import string
import nltk
import spacy
from tqdm import tqdm
from nltk.corpus import stopwords


# Fetch the NLTK resources requested by this notebook
for nltk_resource in ("stopwords", "punkt"):
    nltk.download(nltk_resource)

# spaCy English pipeline, loaded without the parser and NER components
nlp = spacy.load("en_core_web_sm", exclude=["parser","ner"])

# English stop words kept in a set for fast membership tests
STOP_WORDS = set(stopwords.words("english"))

# Translation table that deletes every character in string.punctuation
PUNCT_MAP = str.maketrans(dict.fromkeys(string.punctuation))

# Compiled once: "http" or "www" followed by a run of non-whitespace characters
URL_PATTERN = re.compile(r"http\S+|www\S+")

# Register tqdm with pandas so that progress_apply is available
tqdm.pandas()

[nltk_data] Downloading package stopwords to /home/uk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/uk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


A function to clean a given text
- Lowercase
- Strip URLs
- Remove punctuation
- Tokenize
- Lemmatize
- Remove stop words

In [2]:
def clean_text(text: str) -> str:
    """Normalize raw text into a space-separated string of lemmas.

    Steps: lowercase, replace URLs with a space, delete punctuation, run the
    spaCy pipeline, then keep the lemma of every alphabetic token whose text
    is not in STOP_WORDS.

    Args:
        text: Raw document text. Non-string values (e.g. None, or the NaN
            pandas produces for a missing CSV field) are treated as empty.

    Returns:
        The cleaned text; an empty string when no token survives filtering
        or when the input is not a string.
    """
    # Guard against missing values: .lower() on None/NaN would raise AttributeError
    if not isinstance(text, str):
        return ""

    # Lowercase, remove urls and punctuation
    txt = text.lower()
    txt = URL_PATTERN.sub(" ", txt)
    txt = txt.translate(PUNCT_MAP)

    # Tokenize and lemmatize
    doc = nlp(txt)

    # Filter out stop words and non alpha tokens.
    # NOTE(review): the stop-word test uses the token text, not the lemma, so a
    # lemma that is itself a stop word can still be emitted - confirm intended.
    return " ".join(tok.lemma_ for tok in doc if tok.is_alpha and tok.text not in STOP_WORDS)

# Demonstrate the cleaner on a sentence mixing a contraction, a URL, capitals and punctuation
example = "Here's an Example: https://example.com Running, RUNNERS and ran!"
example_cleaned = clean_text(example)
print("Input: ", example)
print("Output: ", example_cleaned)

Input:  Here's an Example: https://example.com Running, RUNNERS and ran!
Output:  here example run runner run


Load, apply cleaning and split datasets

In [3]:
from pathlib import Path
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split


# Locations of the cached ISOT splits
isot_processed_dir = Path("data/processed/ISOT")
isot_train_filename = isot_processed_dir / "isot_train.pkl"
isot_test_filename = isot_processed_dir / "isot_test.pkl"

# Process the raw data unless BOTH splits are already cached
# (checking only the train file would leave a missing test split unnoticed)
if not (isot_train_filename.exists() and isot_test_filename.exists()):
    # Load dataset
    isot_fake = pd.read_csv("data/raw/ISOT/Fake.csv")
    isot_true = pd.read_csv("data/raw/ISOT/True.csv")

    # Labels
    isot_fake["label"] = 0 # fake
    isot_true["label"] = 1 # true

    # Concatenate true and fake data, keeping only their text and label columns
    isot_df = (
        pd.concat([isot_fake[["text","label"]], isot_true[["text","label"]]], ignore_index=True)
        .rename(columns={"text":"content"})
    )

    # Display count of rows
    print(f"ISOT raw count: {isot_df.shape[0]:,} rows")

    # Clean dataset
    isot_df["cleaned"] = isot_df["content"].progress_apply(clean_text)

    # Display first few rows
    display(isot_df.head())

    # Stratified 80/20 split with a fixed seed so the split is reproducible
    isot_train_df, isot_test_df = train_test_split(isot_df, test_size=0.2, stratify=isot_df["label"], random_state=42)

    # Save datasets; create the output directory first, otherwise writing fails on a fresh checkout
    isot_processed_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(isot_train_df, isot_train_filename)
    joblib.dump(isot_test_df, isot_test_filename)


# Locations of the cached LIAR splits
liar_processed_dir = Path("data/processed/LIAR")
liar_train_filename = liar_processed_dir / "liar_train.pkl"
liar_test_filename = liar_processed_dir / "liar_test.pkl"

# Process the raw data unless BOTH splits are already cached
# (checking only the train file would leave a missing test split unnoticed)
if not (liar_train_filename.exists() and liar_test_filename.exists()):
    # Column names for the header-less TSV files
    liar_cols = [
        "id_json","label","statement","subject","speaker","speaker_job",
        "state_info","party_affil","barely_true","false","half_true",
        "mostly_true","pants_fire","context"
    ]

    # Load dataset
    liar_train = pd.read_csv("data/raw/LIAR/train.tsv", sep="\t", header=None, names=liar_cols)
    liar_valid = pd.read_csv("data/raw/LIAR/valid.tsv", sep="\t", header=None, names=liar_cols)
    liar_test  = pd.read_csv("data/raw/LIAR/test.tsv",  sep="\t", header=None, names=liar_cols)

    # Concatenate train, valid and test data; a fresh train/test split is made below
    liar_df = pd.concat([liar_train, liar_valid, liar_test], ignore_index=True)

    # Group the six labels into a binary classification
    # (a label value not listed here would map to NaN)
    liar_df["label"] = liar_df["label"].map({
        "pants-fire":0, "false":0, "barely-true":0, # fake
        "half-true":1,  "mostly-true":1,  "true":1  # true
    })
    liar_df = liar_df[["statement","label"]].rename(columns={"statement":"content"})

    # Display count of rows
    print(f"LIAR raw count: {liar_df.shape[0]:,} rows")

    # Clean dataset
    liar_df["cleaned"] = liar_df["content"].progress_apply(clean_text)

    # Display first few rows
    display(liar_df.head())

    # Stratified 80/20 split with a fixed seed so the split is reproducible
    liar_train_df, liar_test_df = train_test_split(liar_df, test_size=0.2, stratify=liar_df["label"], random_state=42)

    # Save datasets; create the output directory first, otherwise writing fails on a fresh checkout
    liar_processed_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(liar_train_df, liar_train_filename)
    joblib.dump(liar_test_df, liar_test_filename)

Load the training data

In [4]:
# Load the ISOT training split written to disk by the preprocessing cell.
# NOTE: joblib.load unpickles the file, so only load files produced by this project.
isot_train_df = joblib.load("data/processed/ISOT/isot_train.pkl")

# Print the number of rows in the ISOT training split
print(f"ISOT processed count: {isot_train_df.shape[0]:,} rows")

# Show the first rows (raw content, label, cleaned text) as a sanity check
display(isot_train_df.head())


# Load the LIAR training split written to disk by the preprocessing cell
liar_train_df = joblib.load("data/processed/LIAR/liar_train.pkl")

# Print the number of rows in the LIAR training split
print(f"LIAR processed count: {liar_train_df.shape[0]:,} rows")

# Show the first rows (raw content, label, cleaned text) as a sanity check
display(liar_train_df.head())

ISOT processed count: 35,918 rows


Unnamed: 0,content,label,cleaned
37645,PARIS (Reuters) - French bank Societe Generale...,1,paris reuters french bank societe generale wed...
30390,"WINSTON-SALEM, N.C. (Reuters) - North Carolina...",1,winstonsalem nc reuters north carolina governo...
18191,Civil political discourse took a beating in We...,0,civil political discourse take beating west vi...
25384,(Reuters) - New York and Washington state on M...,1,reuters new york washington state monday vow s...
32622,"ORLANDO, Fla. (Reuters) - Orlando nightclub ki...",1,orlando fla reuters orlando nightclub killer o...


LIAR processed count: 10,232 rows


Unnamed: 0,content,label,cleaned
6920,"On average, Americans spend less than 10 perce...",1,average americans spend less percent disposabl...
3934,The deficit this year could pay all of the 201...,1,deficit year could pay salary every profession...
9131,Farouk is on fire.,1,farouk fire
2685,Says Jacky Rosen has refused to tell us whethe...,0,say jacky rosen refuse tell we whether support...
12122,On Common Core.,0,common core


Create a dictionary mapping each dataset name to its cleaned training set

In [5]:
# Dataset name -> cleaned training frame; the name is reused in feature and model file names
datasets = dict(ISOT=isot_train_df, LIAR=liar_train_df)

Feature extraction constants

In [6]:
# Grid of feature-extraction settings; generate_features iterates over these lists
VECT_TYPES = ["bow", "tfidf"] # vectorizer kinds accepted by build_vectorizer
MIN_DFS = [30, 40] # min_df values passed to the vectorizer
REDUCTIONS = ["svd", "chi2"] # reduction types (NOTE(review): not referenced in the visible cells)
SVD_COMPONENTS = [300, 500] # requested n_components for TruncatedSVD (capped at n_features - 1)
CHI2_K_VALS    = [500, 700] # requested k for SelectKBest(chi2) (capped at n_features - 1)

Functions for feature extraction

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest, chi2


# Factory for the text vectorizers used in feature extraction
def build_vectorizer(vtype: str, min_df: int):
    """Create an unfitted vectorizer over 1- to 3-grams.

    Args:
        vtype: "bow" for a CountVectorizer, "tfidf" for an L2-normalized
            TfidfVectorizer.
        min_df: Value forwarded to the vectorizer's min_df argument.

    Returns:
        The configured, unfitted vectorizer.

    Raises:
        ValueError: If vtype is neither "bow" nor "tfidf".
    """
    # Reject unknown kinds up front
    if vtype not in ("bow", "tfidf"):
        raise ValueError("vtype must be 'bow' or 'tfidf'")

    # Settings common to both vectorizer kinds
    shared_kwargs = dict(ngram_range=(1, 3), lowercase=True, min_df=min_df)
    if vtype == "tfidf":
        return TfidfVectorizer(norm="l2", **shared_kwargs)
    return CountVectorizer(**shared_kwargs)

# Location of one cached feature variant
def feature_path(dataset: str, vtype: str, min_df: int, red: str, param: int) -> Path:
    """Return data/features/<dataset>/<vtype>_min<min_df>_<red><param>.joblib."""
    file_name = f"{vtype}_min{min_df}_{red}{param}.joblib"
    return Path("data/features", dataset, file_name)

# Applies several different feature extraction methods and saves the generated features
def generate_features(dataset: str, texts, labels):
    """Build and cache every vectorizer/reduction feature variant for one dataset.

    For each combination of VECT_TYPES x MIN_DFS the texts are vectorized and then
    reduced with TruncatedSVD (one variant per SVD_COMPONENTS entry) and with
    chi-squared SelectKBest (one variant per CHI2_K_VALS entry). Each variant is
    stored at feature_path(...) as a dict holding the fitted vectorizer, the
    fitted reducer, the reduced matrix "X" and the labels "y". A variant whose
    file already exists is not recomputed; only its stored shape is printed.

    Args:
        dataset: Dataset name, used as the output sub-directory.
        texts: Cleaned documents to vectorize.
        labels: Class labels aligned with texts (used by chi2, stored as "y").

    Returns:
        None. Results are written to disk and progress is printed.
    """
    for vtype in VECT_TYPES:
        for min_df in MIN_DFS:
            # Fitted lazily: when every variant of this configuration is cached,
            # the expensive n-gram vectorization is skipped entirely
            vect = None
            X_vect = None

            # TruncatedSVD variants
            for n_comp in SVD_COMPONENTS:
                f_path = feature_path(dataset, vtype, min_df, "svd", n_comp)
                print(f"\nGenerating SVD features: dataset={dataset}, vtype={vtype}, min_df={min_df}, n_comp={n_comp}")
                if f_path.exists():
                    # Load outside the f-string: reusing the same quote type inside
                    # an f-string is a SyntaxError before Python 3.12
                    cached = joblib.load(f_path)
                    print(f"Output shape: {cached['X'].shape}")
                    continue

                if X_vect is None:
                    vect = build_vectorizer(vtype, min_df)
                    X_vect = vect.fit_transform(texts)

                # Cap the component count below the vocabulary size
                n = min(n_comp, X_vect.shape[1] - 1)
                svd = TruncatedSVD(n_components=n, algorithm="randomized", random_state=42)
                X_red = svd.fit_transform(X_vect)
                print(f"Output shape: {X_red.shape}")

                # Make sure the dataset's feature directory exists before writing
                f_path.parent.mkdir(parents=True, exist_ok=True)
                joblib.dump({"vect": vect, "svd": svd, "X": X_red, "y": labels}, f_path)

            # Chi-squared variants
            for k_val in CHI2_K_VALS:
                f_path = feature_path(dataset, vtype, min_df, "chi2", k_val)
                print(f"\nGenerating Chi2 features: dataset={dataset}, vtype={vtype}, min_df={min_df}, k_val={k_val}")
                if f_path.exists():
                    cached = joblib.load(f_path)
                    print(f"Output shape: {cached['X'].shape}")
                    continue

                if X_vect is None:
                    vect = build_vectorizer(vtype, min_df)
                    X_vect = vect.fit_transform(texts)

                # Cap k below the vocabulary size
                k = min(k_val, X_vect.shape[1] - 1)
                chi = SelectKBest(chi2, k=k)
                X_red = chi.fit_transform(X_vect, labels)
                print(f"Output shape: {X_red.shape}")

                # Make sure the dataset's feature directory exists before writing
                f_path.parent.mkdir(parents=True, exist_ok=True)
                joblib.dump({"vect": vect, "chi2": chi, "X": X_red, "y": labels}, f_path)

Extract features from each dataset

In [8]:
# Generate (or report cached) features for every dataset
for ds, df in datasets.items():
    cleaned_texts = df["cleaned"].values
    label_values = df["label"].values
    generate_features(dataset=ds, texts=cleaned_texts, labels=label_values)


Generating SVD features: dataset=ISOT, vtype=bow, min_df=30, n_comp=300
Output shape: (35918, 300)

Generating SVD features: dataset=ISOT, vtype=bow, min_df=30, n_comp=500
Output shape: (35918, 500)

Generating Chi2 features: dataset=ISOT, vtype=bow, min_df=30, k_val=500
Output shape: (35918, 500)

Generating Chi2 features: dataset=ISOT, vtype=bow, min_df=30, k_val=700
Output shape: (35918, 700)

Generating SVD features: dataset=ISOT, vtype=bow, min_df=40, n_comp=300
Output shape: (35918, 300)

Generating SVD features: dataset=ISOT, vtype=bow, min_df=40, n_comp=500
Output shape: (35918, 500)

Generating Chi2 features: dataset=ISOT, vtype=bow, min_df=40, k_val=500
Output shape: (35918, 500)

Generating Chi2 features: dataset=ISOT, vtype=bow, min_df=40, k_val=700
Output shape: (35918, 700)

Generating SVD features: dataset=ISOT, vtype=tfidf, min_df=30, n_comp=300
Output shape: (35918, 300)

Generating SVD features: dataset=ISOT, vtype=tfidf, min_df=30, n_comp=500
Output shape: (35918, 5

Models and parameters

In [9]:
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.svm import LinearSVC


# Per-classifier configuration: an unfitted estimator plus the hyper-parameter
# values that the randomized search samples from
models = dict(
    # Gradient Boosting Machine
    gbm=dict(
        estimator=HistGradientBoostingClassifier(random_state=42),
        param_dist=dict(learning_rate=[0.05, 0.1], max_depth=[2, 3]),
    ),

    # Support Vector Machine (C: five log-spaced values from 1e-2 to 1e1)
    svm=dict(
        estimator=LinearSVC(class_weight="balanced", random_state=42),
        param_dist=dict(C=np.logspace(-2, 1, 5)),
    ),
)

A function to train models

In [None]:
from scipy import sparse
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold


# Train a model with RandomizedSearchCV
def train_model(X, y, ftr_tag: str, model_key: str):
    """Tune one classifier on one feature set and persist the best estimator.

    The result is stored at saved_models/method1/<ftr_tag>_<model_key>.joblib.
    If that file already exists the function returns immediately, so reruns
    only train the missing combinations.

    Args:
        X: Feature matrix (dense or scipy sparse).
        y: Target labels aligned with X.
        ftr_tag: Identifier of the feature variant, used in the file name.
        model_key: Key into the module-level `models` dict ("gbm" or "svm").

    Returns:
        None. The best estimator found by the search is written to disk.

    Raises:
        KeyError: If model_key is not present in `models`.
    """
    mfile = Path("saved_models/method1") / f"{ftr_tag}_{model_key}.joblib"
    if mfile.exists():
        return

    # Create the output directory up front so the search result is not lost
    # to a missing-directory error at save time
    mfile.parent.mkdir(parents=True, exist_ok=True)

    model = models[model_key]
    cv  = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # 5 folds

    # Make matrix dense if it is sparse for GBM
    if model_key in {"gbm"} and sparse.issparse(X):
        X = X.toarray()

    # Only n_iter=2 parameter settings are sampled per search
    search = RandomizedSearchCV(
        estimator=model["estimator"],
        param_distributions=model["param_dist"],
        n_iter=2,
        scoring="f1_macro",
        n_jobs=4,
        cv=cv,
        verbose=0,
        random_state=42
    )
    search.fit(X, y)

    joblib.dump(search.best_estimator_, mfile)

Train models

In [11]:
# Fit every configured model on every cached feature variant of every dataset;
# train_model returns early for combinations whose model file already exists
for ds in datasets:
    for vtype in VECT_TYPES:
        for min_df in MIN_DFS:

            # Features reduced with TruncatedSVD
            for n in SVD_COMPONENTS:
                ftr_tag = f"{ds.lower()}_{vtype}_min{min_df}_svd{n}"
                ftr_obj = joblib.load(feature_path(ds, vtype, min_df, "svd", n))
                X = ftr_obj["X"]
                y = ftr_obj["y"]
                for model in models:
                    print(f"\nTraining on: dataset={ds}, vtype={vtype}, min_df={min_df}, reduction=svd, n_comp={n}, model={model}")
                    train_model(X, y, ftr_tag, model)

            # Features selected with the chi-squared test
            for k in CHI2_K_VALS:
                ftr_tag = f"{ds.lower()}_{vtype}_min{min_df}_chi2{k}"
                ftr_obj = joblib.load(feature_path(ds, vtype, min_df, "chi2", k))
                X = ftr_obj["X"]
                y = ftr_obj["y"]
                for model in models:
                    print(f"\nTraining on: dataset={ds}, vtype={vtype}, min_df={min_df}, reduction=chi2, k_val={k}, model={model}")
                    train_model(X, y, ftr_tag, model)


Training on: dataset=ISOT, vtype=bow, min_df=30, reduction=svd, n_comp=300, model=gbm

Training on: dataset=ISOT, vtype=bow, min_df=30, reduction=svd, n_comp=300, model=svm

Training on: dataset=ISOT, vtype=bow, min_df=30, reduction=svd, n_comp=500, model=gbm

Training on: dataset=ISOT, vtype=bow, min_df=30, reduction=svd, n_comp=500, model=svm

Training on: dataset=ISOT, vtype=bow, min_df=30, reduction=chi2, k_val=500, model=gbm

Training on: dataset=ISOT, vtype=bow, min_df=30, reduction=chi2, k_val=500, model=svm

Training on: dataset=ISOT, vtype=bow, min_df=30, reduction=chi2, k_val=700, model=gbm

Training on: dataset=ISOT, vtype=bow, min_df=30, reduction=chi2, k_val=700, model=svm

Training on: dataset=ISOT, vtype=bow, min_df=40, reduction=svd, n_comp=300, model=gbm

Training on: dataset=ISOT, vtype=bow, min_df=40, reduction=svd, n_comp=300, model=svm

Training on: dataset=ISOT, vtype=bow, min_df=40, reduction=svd, n_comp=500, model=gbm

Training on: dataset=ISOT, vtype=bow, min_