In [5]:
import pandas as pd
import os

# Directory layout: v2 metadata is read from one folder, cleaned files go to another
raw_meta_dir = "../data/processed/metadata_v2"
clean_meta_dir = "../data/processed/metadata_cleaned"

# Create the output folder up front; exist_ok makes re-running this cell harmless
os.makedirs(clean_meta_dir, exist_ok=True)

# Map each client id (client_1 .. client_4) to its v2 metadata CSV file name
clients = {
    f"client_{n}": f"client_{n}_meta_v2.csv"
    for n in range(1, 5)
}


In [2]:
def engineer_metadata_features(df, client_id):
    """Add derived URL ratio features and drop redundant metadata columns.

    The input frame is not modified; all work happens on a copy, so the
    caller's raw frame stays available and re-running a cell is idempotent.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata for one client. Every column this function touches is
        optional: derived features are only computed when both of their
        source columns are present, and only existing columns are dropped.
    client_id : str
        Client identifier. For "client_4" the "is_weekend" column is always
        dropped (when present); for other clients it is dropped only when it
        holds a single distinct non-null value.

    Returns
    -------
    pandas.DataFrame
        A new frame with "url_length_ratio" / "url_density" appended (when
        computable) and the redundant columns removed.
    """
    # Small constant added to denominators so a zero length never divides by zero.
    eps = 1e-5

    # Copy first: the original implementation mutated the caller's frame.
    out = df.copy()

    # Derived features, each guarded on the presence of both source columns.
    if {"url_avg_length", "url_max_length"}.issubset(out.columns):
        out["url_length_ratio"] = out["url_avg_length"] / (out["url_max_length"] + eps)

    if {"url_count_in_body", "body_length"}.issubset(out.columns):
        out["url_density"] = out["url_count_in_body"] / (out["body_length"] + eps)

    # Known redundant or highly correlated features.
    drop_cols = [
        "url_avg_length", "url_max_length", "url_count_in_body",
        "has_valid_date", "is_midnight_hour", "email_weekday"
    ]

    # Client-specific constant feature. The membership check comes first so a
    # frame without "is_weekend" no longer raises KeyError on the nunique() lookup.
    if "is_weekend" in out.columns and (
        client_id == "client_4" or out["is_weekend"].nunique() == 1
    ):
        drop_cols.append("is_weekend")

    # Only drop columns that actually exist in this client's frame.
    return out.drop(columns=[col for col in drop_cols if col in out.columns])


In [3]:
for client_id, filename in clients.items():
    print(f"🔧 Processing metadata for {client_id}...")

    # Work out where this client's file is read from and where it will be written
    source_path = os.path.join(raw_meta_dir, filename)
    new_filename = filename.replace("_meta_v2", "_meta_cleaned")
    save_path = os.path.join(clean_meta_dir, new_filename)

    # Read the v2 metadata and run it through feature engineering
    df = pd.read_csv(source_path)
    df_clean = engineer_metadata_features(df, client_id)

    # Write the cleaned frame to disk without the index column
    df_clean.to_csv(save_path, index=False)
    print(f"✅ Saved cleaned metadata: {save_path}\n")


🔧 Processing metadata for client_1...
✅ Saved cleaned metadata: ../data/processed/metadata_cleaned/client_1_meta_cleaned.csv

🔧 Processing metadata for client_2...
✅ Saved cleaned metadata: ../data/processed/metadata_cleaned/client_2_meta_cleaned.csv

🔧 Processing metadata for client_3...
✅ Saved cleaned metadata: ../data/processed/metadata_cleaned/client_3_meta_cleaned.csv

🔧 Processing metadata for client_4...
✅ Saved cleaned metadata: ../data/processed/metadata_cleaned/client_4_meta_cleaned.csv



In [4]:
# Re-read each cleaned file from disk and list the columns it contains
print("📊 Cleaned Feature Columns Per Client:")
for client_id, filename in clients.items():
    new_filename = filename.replace("_meta_v2", "_meta_cleaned")
    cleaned_path = os.path.join(clean_meta_dir, new_filename)
    df_clean = pd.read_csv(cleaned_path)
    column_names = list(df_clean.columns)
    print(f"{client_id}: {column_names}")


📊 Cleaned Feature Columns Per Client:
client_1: ['sender_domain_length', 'sender_has_digits', 'sender_has_special_chars', 'sender_tld', 'sender_is_public_domain', 'receiver_is_undisclosed', 'receiver_is_public_domain', 'sender_equals_receiver', 'email_hour', 'is_weekend', 'url_present', 'url_has_ip', 'url_has_special_chars', 'url_has_redirect', 'url_suspicious_tld', 'subject_length', 'body_length', 'text_combined_length', 'uppercase_ratio', 'exclamation_count', 'label', 'url_length_ratio', 'url_density']
client_2: ['sender_domain_length', 'sender_has_digits', 'sender_has_special_chars', 'sender_tld', 'sender_is_public_domain', 'receiver_is_undisclosed', 'receiver_is_public_domain', 'sender_equals_receiver', 'email_hour', 'is_weekend', 'url_present', 'url_has_ip', 'url_has_special_chars', 'url_has_redirect', 'url_suspicious_tld', 'subject_length', 'body_length', 'text_combined_length', 'uppercase_ratio', 'exclamation_count', 'label', 'url_length_ratio', 'url_density']
client_3: ['sender