# **Research Notebook 02: ISOT Data Preparation**

**Objective:** This notebook applies our final, rigorous preprocessing logic to the ISOT dataset.
1. Shared `base_article_cleaner`: Removes leakage (headers, footers, metadata) contextually.
2. DistilBERT gets "rich" text (linguistic structure intact).
3. TF-IDF gets "clean" text (lowercased; punctuation, digits, and clickbait markers removed).

In [1]:
# Import Libraries

import pandas as pd
import re
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Define Text Cleaning Functions

def base_article_cleaner(text):
    """
    Strip dataset-specific headers, footers, and boilerplate artifacts that
    are not part of the journalistic content.

    Shared first stage of BOTH the TF-IDF and the DistilBERT pipelines.

    Args:
        text: Raw article body. Any non-string value (e.g. NaN) is treated
            as an empty article.

    Returns:
        The cleaned article as a single-spaced, stripped string; "" when
        `text` is not a string.

    Note:
        The footer patterns end in `$` and are compiled without
        re.MULTILINE / re.DOTALL, so they only fire on the last line of the
        input.
    """
    if not isinstance(text, str):
        return ""

    # --- Remove Reuters-style datelines (headers) ---
    # Examples:
    # WASHINGTON (Reuters) -
    # SEATTLE/WASHINGTON (Reuters) -
    # (Reuters) -
    # The location prefix is optional (`*`, not `+`): with `+` the bare
    # "(Reuters) -" form was never matched and leaked the source marker.
    text = re.sub(
        r'^[A-Z\s\/,.\-]*\s*\(reuters\)\s*[—-]*\s*',
        ' ',
        text,
        flags=re.IGNORECASE
    )

    # --- Remove Reuters source link footers ---
    # Examples:
    # [1024 EST] -- Source link: (bit.ly/...)
    text = re.sub(
        r'\[\d+\s+est\].*$',
        ' ',
        text,
        flags=re.IGNORECASE
    )

    # --- Remove trailing attribution footers ---
    # Captures: "Via: WND", "Read more: WSJ", "Via: Washington Times"
    text = re.sub(
        r'(?:via|read more):\s*[A-Za-z0-9\s\.]+$',
        ' ',
        text,
        flags=re.IGNORECASE
    )

    # --- Remove trailing image-credit boilerplate ---
    # Captures:
    # Featured image via Getty Images
    # Read more:Featured image via Handout/Getty Images
    # Photo by Andrew Burton/Getty Images
    # Image via video screen capture
    # Everything from the credit to the end of the line is dropped.
    text = re.sub(
        r'(read more:)?\s*(featured image|image|photo)\s*(by|via)\s*.*$',
        ' ',
        text,
        flags=re.IGNORECASE
    )

    # --- Remove media credit providers (metadata only) ---
    # Applied anywhere in the text, not just at the end.
    text = re.sub(
        r'\b(getty images?|ap images?|afp|stringer)\b',
        ' ',
        text,
        flags=re.IGNORECASE
    )

    # --- Remove URLs and short links ---
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    text = re.sub(r'bit\.ly/\S+', ' ', text)

    # --- Normalize whitespace ---
    # Also collapses newlines, so the result is always a single line.
    text = re.sub(r'\s+', ' ', text).strip()

    return text


def preprocess_for_tfidf(text):
    """
    Aggressive preprocessing for TF-IDF + Logistic Regression.

    Runs the shared `base_article_cleaner`, lowercases, strips clickbait /
    boilerplate markers, and reduces the text to lowercase ASCII letters
    separated by single spaces.

    Args:
        text: Raw article body (non-strings are handled by
            `base_article_cleaner`, which returns "" for them).

    Returns:
        A string containing only `a-z` tokens separated by single spaces.
    """
    text = base_article_cleaner(text)

    # Lowercase so that "Trump" and "trump" share one vocabulary entry
    text = text.lower()

    # Remove colon-terminated clickbait markers BEFORE punctuation is
    # stripped. Once ':' has been replaced by a space these patterns can no
    # longer match, which previously made the removals dead code.
    text = re.sub(r'\bread more\s*:', ' ', text)
    text = re.sub(r'\bwatch\s*:', ' ', text)

    # Remove punctuation and digits (anything that is not a-z or whitespace)
    text = re.sub(r'[^a-z\s]', ' ', text)

    # "click here" has no punctuation anchor, so it is removed after
    # punctuation stripping (this also catches forms like "click-here")
    text = re.sub(r'\bclick here\b', ' ', text)

    # Normalize whitespace again
    text = re.sub(r'\s+', ' ', text).strip()

    return text


def preprocess_for_bert(text):
    """
    Light-touch preprocessing for DistilBERT.

    Only the shared boilerplate removal is applied. Case, punctuation and
    stopwords are deliberately left untouched so the linguistic structure
    of the article survives; the DistilBERT tokenizer takes care of the
    rest.
    """
    return base_article_cleaner(text)

In [3]:
# Mount Google Drive
# Colab-only step: exposes Google Drive under /content/drive, which is the
# root of every dataset path used by the cells below.

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Dataset locations on the mounted Drive

base_path = '/content/drive/MyDrive/Research/datasets/'

# Raw inputs
true_path = base_path + 'True.csv'
fake_path = base_path + 'Fake.csv'

# Outputs written by the final cell
train_output_path = base_path + 'train_dataset_clean.csv'
test_output_path = base_path + 'test_dataset_clean.csv'

In [5]:
# Read the two raw CSV files (real articles first, then fake)

print("--- ISOT Data Preparation ---")

print("\nLoading datasets...")
df_true, df_fake = (pd.read_csv(csv_path) for csv_path in (true_path, fake_path))
print("Datasets loaded successfully.")

--- ISOT Data Preparation ---

Loading datasets...
Datasets loaded successfully.


In [6]:
# Tag every row with the class implied by its source file

df_true = df_true.assign(label='real')
df_fake = df_fake.assign(label='fake')

In [7]:
# Stack real and fake articles into one frame with a fresh 0..n-1 index,
# so that de-duplication can also catch texts present in both files

df_combined = pd.concat([df_true, df_fake], ignore_index=True)
print(f"Combined dataset size (before duplicate removal): {len(df_combined)}")

Combined dataset size (before duplicate removal): 44898


In [8]:
# Handle Duplicates & Nulls (from EDA)

print("Removing duplicates and null values...")

# Take the baseline BEFORE any rows are dropped, so the reported number
# covers both null and duplicate removals (as the message claims).
rows_before_cleanup = len(df_combined)

# Drop rows where text is missing
df_combined = df_combined.dropna(subset=['text'])

# Drop exact duplicate rows based on 'text'; the first occurrence is kept.
# Running this on the combined frame is what finds cross-dataset duplicates.
df_combined = df_combined.drop_duplicates(subset=['text'])

print(f"\nRemoved {rows_before_cleanup - len(df_combined)} duplicate/null articles.")
print(f"Combined dataset size (after duplicate removal): {len(df_combined)}")

Removing duplicates and null values...

Removed 6252 duplicate/null articles.
Combined dataset size (after duplicate removal): 38646


In [9]:
# Keep only political news from the combined, de-duplicated frame.
# The subject value is 'politicsNews' for the rows kept as real and
# 'politics' for the rows kept as fake.

print("Filtering articles by subject...")

df_true_filtered = df_combined.loc[df_combined['subject'].eq('politicsNews')].copy()
df_fake_filtered = df_combined.loc[df_combined['subject'].eq('politics')].copy()

print(f"\nFiltered 'real' Political News: {len(df_true_filtered)} articles")
print(f"Filtered 'fake' Political News: {len(df_fake_filtered)} articles")

Filtering articles by subject...

Filtered 'real' Political News: 11214 articles
Filtered 'fake' Political News: 6424 articles


In [10]:
# Merge the two political subsets back into one frame with a fresh index

df_combined_filtered = pd.concat([df_true_filtered, df_fake_filtered], ignore_index=True)
print(f"Combined filtered dataset size (before cleaning): {len(df_combined_filtered)} articles")

Combined filtered dataset size (before cleaning): 17638 articles


In [11]:
# Derive the two model-specific text columns from the raw 'text' column

print("Applying the two cleaning pipelines...")

# DistilBERT input: boilerplate removed, linguistic structure kept
print("\nApplying text cleaning for DistilBERT...")
df_combined_filtered['preprocessed_text_bert'] = df_combined_filtered['text'].map(preprocess_for_bert)

# TF-IDF input: aggressively normalized bag-of-words text
print("Applying text cleaning for TF-IDF...")
df_combined_filtered['cleaned_text_tfidf'] = df_combined_filtered['text'].map(preprocess_for_tfidf)

print("\nText cleaning complete.")

Applying the two cleaning pipelines...

Applying text cleaning for DistilBERT...
Applying text cleaning for TF-IDF...

Text cleaning complete.


In [12]:
# Drop articles that are empty or too short after cleaning

print("Filtering articles by length...")

# Minimum number of whitespace-separated tokens an article must keep
min_word_count = 50

# Length is measured on the TF-IDF text because it is the stricter of the two.
df_combined_filtered['word_count_tfidf'] = df_combined_filtered['cleaned_text_tfidf'].map(
    lambda cleaned: len(cleaned.split())
)

# Stage 1: empty / null articles
rows_before = len(df_combined_filtered)
df_combined_filtered = df_combined_filtered.loc[
    df_combined_filtered['cleaned_text_tfidf'].str.len().gt(0)
]
df_combined_filtered = df_combined_filtered.dropna(subset=['cleaned_text_tfidf'])
print(f"\nRemoved {rows_before - len(df_combined_filtered)} empty/null articles.")

# Stage 2: articles below the minimum word count
rows_before = len(df_combined_filtered)
df_combined_filtered = df_combined_filtered.loc[
    df_combined_filtered['word_count_tfidf'].ge(min_word_count)
]
print(f"Removed {rows_before - len(df_combined_filtered)} articles with fewer than {min_word_count} words.")
print(f"\nCleaned and filtered dataset size: {len(df_combined_filtered)} articles")

Filtering articles by length...

Removed 64 empty/null articles.
Removed 880 articles with fewer than 50 words.

Cleaned and filtered dataset size: 16694 articles


In [13]:
# Balance the dataset by downsampling the majority class to the size of
# the minority class.

print("Balancing the dataset...")
print("\nClass counts before balancing:")
class_counts = df_combined_filtered['label'].value_counts()
print(class_counts)

if len(class_counts) < 2:
    raise ValueError("Balancing requires at least two classes in 'label'.")

# Determine minority and majority.
# idxmax() and idxmin() both return the FIRST matching label, so when the
# class counts tie they would name the same class. That class would then be
# duplicated and the other one silently dropped. Picking the majority from
# the remaining classes keeps the two labels distinct.
minority_label = class_counts.idxmin()
minority_size = class_counts.min()
majority_label = class_counts.drop(minority_label).idxmax()

df_majority = df_combined_filtered[df_combined_filtered['label'] == majority_label]
df_minority = df_combined_filtered[df_combined_filtered['label'] == minority_label]

# Downsample the majority class (fixed seed for reproducibility)
df_majority_downsampled = df_majority.sample(
    n=minority_size,
    random_state=101
)

# Combine and shuffle so the classes are interleaved
df_balanced = pd.concat([df_majority_downsampled, df_minority])
df_balanced = df_balanced.sample(frac=1, random_state=101).reset_index(drop=True)

print(f"\nFinal balanced dataset size: {len(df_balanced)} articles")
print("\nClass counts after balancing:")
print(df_balanced['label'].value_counts())


Balancing the dataset...

Class counts before balancing:
label
real    11054
fake     5640
Name: count, dtype: int64

Final balanced dataset size: 11280 articles

Class counts after balancing:
label
fake    5640
real    5640
Name: count, dtype: int64


In [14]:
# Stratified 80/20 train-test split

print("Splitting into training and testing sets...")

# Only the label and the two model-specific text columns are carried forward
final_columns = ['label', 'preprocessed_text_bert', 'cleaned_text_tfidf']
df_final = df_balanced.loc[:, final_columns]

# Stratifying on the label keeps both splits class-balanced
train_df, test_df = train_test_split(
    df_final,
    stratify=df_final['label'],
    test_size=0.2,
    random_state=101,
)
print(f"\nTraining set size: {len(train_df)}")
print(f"Testing set size: {len(test_df)}")

Splitting into training and testing sets...

Training set size: 9024
Testing set size: 2256


In [15]:
# Overview of the final cleaned training dataset
# The bare `head()` call must stay the last expression so the notebook
# renders it as a table.

print("Final training dataset overview:\n")
train_df.head()

Final training dataset overview:



Unnamed: 0,label,preprocessed_text_bert,cleaned_text_tfidf
2347,fake,What Catholic college campus would be complete...,what catholic college campus would be complete...
8664,fake,In asking KT McFarland to become his Deputy Na...,in asking kt mcfarland to become his deputy na...
294,real,U.S. Defense Secretary Ash Carter called his J...,u s defense secretary ash carter called his ja...
5606,real,Puerto Rico Governor Ricardo Rossello said on ...,puerto rico governor ricardo rossello said on ...
8231,fake,"""Fake news"" was born in August 2014 in Ferguso...",fake news was born in august in ferguson mo wh...


In [16]:
# Overview of the final cleaned test dataset
# The bare `head()` call must stay the last expression so the notebook
# renders it as a table.

print("Final test dataset overview:\n")
test_df.head()

Final test dataset overview:



Unnamed: 0,label,preprocessed_text_bert,cleaned_text_tfidf
10128,fake,Joe Piscopo is hysterical! He was on with Neil...,joe piscopo is hysterical he was on with neil ...
6690,real,The United States on Monday described as “trou...,the united states on monday described as troub...
902,real,Staff at the U.S. Environmental Protection Age...,staff at the u s environmental protection agen...
8456,fake,This announcement seems to indicate that the O...,this announcement seems to indicate that the o...
7304,real,Members of the Democratic Party’s liberal wing...,members of the democratic party s liberal wing...


In [17]:
# Persist both splits as CSV (row index omitted)

print("Saving the final training and testing datasets...")

for split_df, out_path in (
    (train_df, train_output_path),
    (test_df, test_output_path),
):
    split_df.to_csv(out_path, index=False)

print(f"\nTraining data saved to: {train_output_path}")
print(f"Testing data saved to: {test_output_path}")

print("\n--- Data Preparation Complete ---")

Saving the final training and testing datasets...

Training data saved to: /content/drive/MyDrive/Research/datasets/train_dataset_clean.csv
Testing data saved to: /content/drive/MyDrive/Research/datasets/test_dataset_clean.csv

--- Data Preparation Complete ---
