# **Research Notebook 02: ISOT Data Preparation**

**Objective:** This notebook loads, combines, de-duplicates, and subject-filters the ISOT dataset, cleans the article text into our two final model-ready text columns, filters out very short articles, balances the classes, and splits the result into train/test sets.

In [1]:
# Import Libraries

import pandas as pd
import re
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Define Text Cleaning Functions (Two Pipelines)
# Each function below is applied to every article: one feeds DistilBERT, the other feeds TF-IDF.

def minimal_preprocess_for_bert(text):
    """
    Apply only the essential preprocessing needed for the DistilBERT pipeline.

    Case, punctuation and numbers are left untouched. The steps are:

    1. Strip a leading Reuters dateline (to prevent data leakage).
    2. Remove URLs.
    3. Collapse the whitespace left behind and trim both ends.

    Args:
        text: Raw article body. Any non-string value (e.g. ``None`` or a
            float NaN coming from pandas) is treated as missing.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is not a string.
    """
    # Ensure text is a string
    if not isinstance(text, str):
        return ""

    # 1. Remove Reuters dateline (to prevent data leakage).
    # Matches "WASHINGTON (Reuters) -", a bare "(Reuters) -", and also
    # datelines containing punctuation such as
    # "WEST PALM BEACH, Fla./WASHINGTON (Reuters) -", which a letters-only
    # prefix would miss. The prefix is capped at 100 characters so a
    # "(Reuters)" mention deep inside the body never swallows real content.
    text = re.sub(
        r"^[\w\s.,/'’-]{0,100}\(reuters\)\s*[-–—]*\s*",
        '',
        text,
        flags=re.IGNORECASE,
    )

    # 2. Remove URLs. "http\S+" already covers "https...", so no separate
    # alternative is needed.
    text = re.sub(r'(?:http|www)\S+', '', text)

    # 3. Remove extra whitespace that might result from the removals above
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def clean_for_tfidf(text):
    """
    Run the "classic" NLP cleaning pipeline used for the TF-IDF model.

    The steps are:

    1. Lowercase.
    2. Remove URLs.
    3. Strip a leading Reuters dateline (to prevent data leakage).
    4. Replace everything that is not an ASCII letter or whitespace
       (punctuation, digits, symbols) with a space.
    5. Collapse whitespace and trim both ends.

    Args:
        text: Raw article body. Any non-string value (e.g. ``None`` or a
            float NaN coming from pandas) is treated as missing.

    Returns:
        str: The cleaned, lowercase text, or ``""`` when ``text`` is not a
        string.
    """
    # Ensure text is a string
    if not isinstance(text, str):
        return ""

    # 1. Convert to lowercase
    text = text.lower()

    # 2. Remove URLs. "http\S+" already covers "https...", so no separate
    # alternative is needed.
    text = re.sub(r'(?:http|www)\S+', '', text)

    # 3. Remove Reuters dateline (to prevent data leakage).
    # This must run BEFORE punctuation is stripped, because it relies on the
    # literal "(reuters)" parentheses. The prefix accepts dateline punctuation
    # ("west palm beach, fla./washington (reuters) -"), which a letters-only
    # prefix would miss, and is capped at 100 characters so a later
    # "(reuters)" mention never swallows real content.
    text = re.sub(
        r"^[\w\s.,/'’-]{0,100}\(reuters\)\s*[-–—]*\s*",
        '',
        text,
        flags=re.IGNORECASE,
    )

    # 4. Remove special characters, punctuation, and numbers
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # 5. Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [3]:
# Attach Google Drive to this Colab runtime.
# The dataset paths defined further down live under this mount point.

from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [4]:
# File locations (everything lives in one folder on the mounted Drive)

base_path = '/content/drive/MyDrive/Research/datasets/'

# Inputs: the two source CSVs read by the loading step
true_path = base_path + 'True.csv'
fake_path = base_path + 'Fake.csv'

# Outputs: the prepared splits written at the end of the notebook
train_output_path = base_path + 'train_dataset.csv'
test_output_path = base_path + 'test_dataset.csv'

In [5]:
# Read both source CSVs into DataFrames

print("--- ISOT Data Preparation ---")
print("\nLoading datasets...")

# True.csv is read first, then Fake.csv.
df_true, df_fake = (pd.read_csv(csv_path) for csv_path in (true_path, fake_path))

print("Datasets loaded successfully.")

--- ISOT Data Preparation ---

Loading datasets...
Datasets loaded successfully.


In [6]:
# Tag every row with the class of the file it came from

for source_frame, class_name in ((df_true, 'real'), (df_fake, 'fake')):
    # Column is added in place on the already-loaded frame.
    source_frame['label'] = class_name

In [7]:
# Stack real + fake into one frame so duplicates can be detected across both sources

# ignore_index=True gives the stacked frame a fresh 0..n-1 index.
df_combined = pd.concat([df_true, df_fake], ignore_index=True)
print(f"Combined dataset size (before duplicate removal): {len(df_combined)}")

Combined dataset size (before duplicate removal): 44898


In [8]:
# Handle Duplicates & Nulls (from EDA)

print("Removing duplicates and null values...")

# Take the baseline BEFORE any row is dropped, so the summary printed below
# really counts null rows as well as duplicates (it is labelled
# "duplicate/null").
rows_before_duplicates = len(df_combined)

# Drop rows where text is missing
df_combined = df_combined.dropna(subset=['text'])

# Drop exact duplicate rows based on 'text'.
# This is the step that finds cross-dataset duplicates. drop_duplicates keeps
# the first occurrence by default; the real articles were concatenated first,
# so a text present in both sources survives with the 'real' label.
df_combined = df_combined.drop_duplicates(subset=['text'])
print(f"\nRemoved {rows_before_duplicates - len(df_combined)} duplicate/null articles.")
print(f"Combined dataset size (after duplicate removal): {len(df_combined)}")

Removing duplicates and null values...

Removed 6252 duplicate/null articles.
Combined dataset size (after duplicate removal): 38646


In [9]:
# Keep only the political-news subjects

print("Filtering articles by subject...")

# Both selections read from the combined, de-duplicated dataframe.
# NOTE(review): only 'subject' is checked here, not 'label' - this presumably
# relies on 'politicsNews' occurring only in the real source and 'politics'
# only in the fake source; verify against the raw files.
is_politics_news = df_combined['subject'].eq('politicsNews')
is_politics = df_combined['subject'].eq('politics')

# .copy() so the filtered frames are independent of df_combined.
df_true_filtered = df_combined.loc[is_politics_news].copy()
df_fake_filtered = df_combined.loc[is_politics].copy()

print(f"\nFiltered 'real' Political News: {len(df_true_filtered)} articles")
print(f"Filtered 'fake' Political News: {len(df_fake_filtered)} articles")

Filtering articles by subject...

Filtered 'real' Political News: 11214 articles
Filtered 'fake' Political News: 6424 articles


In [11]:
# Stack the two subject-filtered frames back into a single frame

# ignore_index=True gives the stacked frame a fresh 0..n-1 index.
df_combined_filtered = pd.concat(
    [df_true_filtered, df_fake_filtered],
    ignore_index=True,
)
print(f"Combined filtered dataset size (before cleaning): {len(df_combined_filtered)} articles")

Combined filtered dataset size (before cleaning): 17638 articles


In [12]:
# Derive the two model-specific text columns from the raw 'text' column

print("Applying the two cleaning pipelines...")

# Both pipelines start from the same original article text; neither one
# consumes the other's output.
raw_text = df_combined_filtered['text']

# Pipeline 1: DistilBERT - minimal preprocessing only.
print("\nApplying minimal preprocessing for DistilBERT...")
df_combined_filtered['preprocessed_text_bert'] = raw_text.map(minimal_preprocess_for_bert)

# Pipeline 2: TF-IDF - the *full* cleaning.
print("Applying full cleaning for TF-IDF...")
df_combined_filtered['cleaned_text_tfidf'] = raw_text.map(clean_for_tfidf)

print("\nText cleaning complete.")

Applying the two cleaning pipelines...

Applying minimal preprocessing for DistilBERT...
Applying full cleaning for TF-IDF...

Text cleaning complete.


In [13]:
# Drop articles that are empty or too short after cleaning

print("Filtering articles by length...")

# Length is measured on the TF-IDF text, as it's the stricter of the two
# cleaned columns.
df_combined_filtered['word_count_tfidf'] = (
    df_combined_filtered['cleaned_text_tfidf'].map(lambda doc: len(doc.split()))
)

# Pass 1: rows whose cleaned text is empty (or missing)
rows_before = len(df_combined_filtered)
has_text = df_combined_filtered['cleaned_text_tfidf'].str.len().gt(0)
df_combined_filtered = (
    df_combined_filtered
    .loc[has_text]
    .dropna(subset=['cleaned_text_tfidf'])
)
print(f"\nRemoved {rows_before - len(df_combined_filtered)} empty/null articles.")

# Pass 2: rows below the minimum word count
rows_before = len(df_combined_filtered)
min_word_count = 50
is_long_enough = df_combined_filtered['word_count_tfidf'].ge(min_word_count)
df_combined_filtered = df_combined_filtered.loc[is_long_enough]
print(f"Removed {rows_before - len(df_combined_filtered)} articles with fewer than {min_word_count} words.")
print(f"\nCleaned and filtered dataset size: {len(df_combined_filtered)} articles")

Filtering articles by length...

Removed 54 empty/null articles.
Removed 884 articles with fewer than 50 words.

Cleaned and filtered dataset size: 16700 articles


In [14]:
# Balance the dataset by downsampling the larger class to the size of the
# smaller one.

print("Balancing the dataset...")

# value_counts() sorts by frequency in descending order, so the first entry
# is the largest class and the last entry is the smallest.
class_counts = df_combined_filtered['label'].value_counts()
print("\nClass counts before balancing:")
print(class_counts)

# With fewer than two classes there is nothing to balance; continuing would
# silently emit a single-class dataset.
if len(class_counts) < 2:
    raise ValueError(
        "Balancing needs at least two classes in 'label', "
        f"found: {list(class_counts.index)}"
    )

# Determine majority and minority by POSITION rather than idxmax()/idxmin():
# when the counts are tied, idxmax() and idxmin() both return the same label,
# which would duplicate one class and drop the other entirely.
majority_label = class_counts.index[0]
minority_label = class_counts.index[-1]
minority_size = class_counts.min()

df_majority = df_combined_filtered[df_combined_filtered['label'] == majority_label]
df_minority = df_combined_filtered[df_combined_filtered['label'] == minority_label]

# Downsample the majority class (fixed seed for reproducibility)
df_majority_downsampled = df_majority.sample(
    n=minority_size,
    random_state=42
)

# Combine and shuffle so the two classes are interleaved
df_balanced = pd.concat([df_majority_downsampled, df_minority])
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"\nFinal balanced dataset size: {len(df_balanced)} articles")
print("\nClass counts after balancing:")
print(df_balanced['label'].value_counts())


Balancing the dataset...

Class counts before balancing:
label
real    11057
fake     5643
Name: count, dtype: int64

Final balanced dataset size: 11286 articles

Class counts after balancing:
label
fake    5643
real    5643
Name: count, dtype: int64


In [15]:
# Stratified 80/20 split into train and test sets

print("Splitting into training and testing sets...")

# Only the label and the two model-ready text columns are carried forward.
final_columns = ['label', 'preprocessed_text_bert', 'cleaned_text_tfidf']
df_final = df_balanced.loc[:, final_columns]

# Stratifying on the label keeps the class ratio the same in both splits;
# the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    df_final,
    stratify=df_final['label'],
    test_size=0.2,
    random_state=42,
)
print(f"\nTraining set size: {len(train_df)}")
print(f"Testing set size: {len(test_df)}")

Splitting into training and testing sets...

Training set size: 9028
Testing set size: 2258


In [16]:
# Peek at the first rows of the training split

print("Final training dataset overview:\n")
# Left as the cell's last expression so the notebook renders it as a table.
train_df.head(n=5)

Final training dataset overview:



Unnamed: 0,label,preprocessed_text_bert,cleaned_text_tfidf
755,fake,"The Republican Party should be so fortunate, a...",the republican party should be so fortunate as...
571,fake,"Glenn Beck has just proven once again, (to any...",glenn beck has just proven once again to anyon...
2941,fake,The Democrat s war on mostly ugly women Former...,the democrat s war on mostly ugly women former...
861,fake,GREG GUTFELD IS SO PASSIONATE and so on target...,greg gutfeld is so passionate and so on target...
2189,fake,"When asked by the other hosts on the show, Whe...",when asked by the other hosts on the show wher...


In [17]:
# Peek at the first rows of the test split

print("Final test dataset overview:\n")
# Left as the cell's last expression so the notebook renders it as a table.
test_df.head(n=5)

Final test dataset overview:



Unnamed: 0,label,preprocessed_text_bert,cleaned_text_tfidf
6122,fake,President Trump is scheduled to attend the Wre...,president trump is scheduled to attend the wre...
7627,fake,The breaking point is fast approaching in a di...,the breaking point is fast approaching in a di...
5075,fake,Andrew McCarthy is just awesome! He goes throu...,andrew mccarthy is just awesome he goes throug...
10144,real,The U.S. Department of Defense and Lockheed Ma...,the u s department of defense and lockheed mar...
6094,real,Chancellor Angela Merkel is working to set a d...,chancellor angela merkel is working to set a d...


In [18]:
# Write both splits to CSV on the mounted Drive

print("Saving the final training and testing datasets...")

# index=False: the row index is not written to the files.
for split_frame, destination in (
    (train_df, train_output_path),
    (test_df, test_output_path),
):
    split_frame.to_csv(destination, index=False)

print(f"\nTraining data saved to: {train_output_path}")
print(f"Testing data saved to: {test_output_path}")

print("\n--- Data Preparation Complete ---")

Saving the final training and testing datasets...

Training data saved to: /content/drive/MyDrive/Research/datasets/train_dataset.csv
Testing data saved to: /content/drive/MyDrive/Research/datasets/test_dataset.csv

--- Data Preparation Complete ---
