In [4]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

In [7]:
# ----------------------------
# 1. Load dataset
# ----------------------------
df = pd.read_csv("dataset.csv")  # replace with your CSV path
print(f"Original dataset size: {df.shape[0]} rows")

# ----------------------------
# 2. Binary classification
# ----------------------------
# Lower-case the class names so the lookup below is case-insensitive.
df['text_type'] = df['text_type'].str.lower()
before_filter = df.shape[0]

# Single source of truth for both the row filter and the target encoding.
# Every class that survives the filter must have an entry here; otherwise
# `.map` would leave NaN labels for it. 'smishing' is folded into the positive
# class alongside 'spam' (assumed intent of the binary setup -- confirm).
label_map = {'ham': 0, 'spam': 1, 'smishing': 1}

# Keep only rows whose class is known; rows with a missing text_type are
# dropped here too. `.copy()` makes the result an independent frame so the
# column assignments that follow do not write to a slice of the original.
df = df[df['text_type'].isin(list(label_map))].copy()
after_filter = df.shape[0]
print(f"After filtering text_type: {after_filter} rows ({before_filter - after_filter} rows dropped)")

# All remaining classes are keys of label_map, so the result has no NaN and
# can be stored as integers.
df['label_encoded'] = df['text_type'].map(label_map).astype(int)

# ----------------------------
# 3. FAIR Text Cleaning Function
# ----------------------------
def clean_text(text):
    """Normalise one message while keeping phishing-relevant signals.

    The text is lower-cased, URLs / e-mail addresses / long digit runs are
    replaced by the upper-case placeholders ``<URL>``, ``<EMAIL>`` and
    ``<PHONE>``, every remaining character other than ASCII letters, digits,
    whitespace and angle brackets is turned into a space, and runs of
    whitespace are collapsed.

    Parameters
    ----------
    text : str or None or float
        Raw message. ``None`` and NaN are treated as an empty message; any
        other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned message (possibly the empty string).
    """
    # Missing values (None / float NaN) carry no text; NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()

    # Normalize phishing features (KEEP them).
    # Links are matched both with a scheme ("http...", "https...") and in the
    # bare "www." form; without the second alternative "www.site.com" would be
    # split into ordinary words by the punctuation step below.
    text = re.sub(r"http\S+|\bwww\.\S+", "<URL>", text)
    text = re.sub(r"\S+@\S+", "<EMAIL>", text)
    text = re.sub(r"\b\d{8,}\b", "<PHONE>", text)  # long numbers, phones

    # Remove other punctuation/noise. '<', '>' and A-Z stay in the allowed set
    # so the placeholders inserted above survive this step.
    text = re.sub(r"[^a-zA-Z0-9\s<>]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Count missing messages on the raw column. This has to happen before the
# string cast below: astype(str) would turn NaN into the literal text 'nan',
# after which a missing-value check can no longer see them.
missing_count = df['text'].isna().sum()

# Fill missing messages with '' first, then clean. `assign` builds a new frame
# instead of writing a column into a possibly filtered view.
df = df.assign(cleaned_text=df['text'].fillna('').astype(str).apply(clean_text))

# Check duplicates (on the cleaned text; the first occurrence is kept)
before_dup = df.shape[0]
df = df.drop_duplicates(subset='cleaned_text')
after_dup = df.shape[0]
print(f"After dropping duplicates: {after_dup} rows ({before_dup - after_dup} duplicates removed)")

# Report missing text (counted above, before it was filled with '')
print(f"Missing text before filling: {missing_count}")

# ----------------------------
# 4. Train/test split
# ----------------------------
# 80/20 split, stratified on the encoded label so both parts keep the same
# class proportions; the fixed random_state makes the split repeatable.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['label_encoded'],
    test_size=0.2,
    random_state=42,
    stratify=df['label_encoded']
)
print(f"Train size: {X_train_text.shape[0]} rows, Test size: {X_test_text.shape[0]} rows")

# Save the full cleaned, de-duplicated frame (not the individual splits)
output_file = "Cleaned_Dataset_new.csv"
df.to_csv(output_file, index=False)
print(f"Cleaned dataset saved to '{output_file}' with {df.shape[0]} rows")


Original dataset size: 20348 rows
After filtering text_type: 20348 rows (0 rows dropped)
After dropping duplicates: 20104 rows (244 duplicates removed)
Missing text before filling: 0
Train size: 16083 rows, Test size: 4021 rows
Cleaned dataset saved to 'Cleaned_Dataset_new.csv' with 20104 rows
