In [1]:
import pandas as pd
import numpy as np
import os
import re
from urllib.parse import urlparse
from sklearn.preprocessing import LabelEncoder

# Input / output locations, relative to the notebook's working directory
CLIENT_PATH = "../data/clients"
TEXT_OUT = "../data/processed/text"
META_OUT = "../data/processed/metadata"

# Make sure both output directories exist before anything is written to them
for out_dir in (TEXT_OUT, META_OUT):
    os.makedirs(out_dir, exist_ok=True)

# Client id -> CSV file name inside CLIENT_PATH (client_1 ... client_4)
client_files = {f"client_{i}": f"client_{i}.csv" for i in range(1, 5)}


In [2]:
def prepare_text_df(df):
    """Build the text-model view of an e-mail frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``subject``, ``body`` and ``label``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``text`` (subject and body joined by a single space,
        trimmed, whitespace runs collapsed to one space, lower-cased) and
        ``label`` (taken from ``df`` unchanged).
    """
    # Missing subject/body become empty strings so concatenation never yields NaN.
    subject = df["subject"].fillna("")
    body = df["body"].fillna("")

    combined = subject + " " + body
    trimmed = combined.str.strip()
    collapsed = trimmed.str.replace(r'\s+', ' ', regex=True)
    normalised = collapsed.str.lower()

    return pd.DataFrame({"text": normalised, "label": df["label"]})


In [5]:
def extract_domain(email):
    """Return the lower-cased domain of an e-mail address, or ``"unknown"``.

    Parameters
    ----------
    email : object
        Usually a string such as ``"user@host.tld"`` or a header-style value
        like ``"Name <user@host.tld>"``. Non-string values (for example the
        float NaN / None that pandas uses for missing cells) are accepted.

    Returns
    -------
    str
        The text after the last ``@``, with surrounding whitespace, angle
        brackets and quotes removed and converted to lower case. The sentinel
        ``"unknown"`` is returned when the value is not a string, contains no
        ``@``, or has nothing usable after the ``@``.
    """
    # Missing cells are not strings; handle them explicitly instead of relying
    # on a bare ``except`` to swallow the resulting AttributeError.
    if not isinstance(email, str) or "@" not in email:
        return "unknown"

    domain = email.rsplit("@", 1)[-1]
    # Header-style addresses ("Name <user@host>") leave a trailing '>' (and
    # possibly quotes/whitespace) on the domain; trim them so the same host
    # always maps to the same value.
    domain = domain.strip(" \t\r\n<>\"'").lower()

    return domain or "unknown"

def prepare_metadata_df(df):
    """Build the metadata-model view of an e-mail frame.

    Works on a copy, so the caller's frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``sender``, ``receiver``, ``date``, ``urls``
        and ``label``.

    Returns
    -------
    pandas.DataFrame
        Columns ``sender_domain``, ``receiver_domain``, ``hour``, ``weekday``,
        ``urls`` and ``label``. The two domain columns are integer codes
        produced by a ``LabelEncoder``; ``hour`` and ``weekday`` are ints with
        ``-1`` where the date could not be parsed; any remaining missing
        value is replaced by the string ``"unknown"``.

    Notes
    -----
    A fresh encoder is fit on every call, so the code assigned to a given
    domain depends on the frame passed in and is not guaranteed to match
    the code from another call.
    """
    frame = df.copy()

    # Derive the domain of both parties from their address columns.
    for side in ("sender", "receiver"):
        frame[f"{side}_domain"] = frame[side].apply(extract_domain)

    # Unparseable dates become NaT instead of raising; everything is UTC.
    frame["date"] = pd.to_datetime(frame["date"], errors='coerce', utc=True)

    # NaT rows produce NaN components; map those to -1 so the columns stay int.
    parsed = frame["date"].dt
    frame["hour"] = parsed.hour.fillna(-1).astype(int)
    frame["weekday"] = parsed.weekday.fillna(-1).astype(int)

    # Restrict to the feature columns and fill whatever is still missing.
    domain_cols = ["sender_domain", "receiver_domain"]
    keep = domain_cols + ["hour", "weekday", "urls", "label"]
    meta = frame[keep].fillna("unknown")

    # Replace each domain string by an integer code.
    for col in domain_cols:
        meta[col] = LabelEncoder().fit_transform(meta[col])

    return meta



In [6]:
# Run both preprocessing pipelines for every client and save the results
for client_id, filename in client_files.items():
    # Load this client's raw CSV
    path = os.path.join(CLIENT_PATH, filename)
    df = pd.read_csv(path)

    # Text view -> <client_id>_text.csv (written before metadata is computed)
    text_path = os.path.join(TEXT_OUT, f"{client_id}_text.csv")
    text_df = prepare_text_df(df)
    text_df.to_csv(text_path, index=False)

    # Metadata view -> <client_id>_meta.csv
    meta_path = os.path.join(META_OUT, f"{client_id}_meta.csv")
    meta_df = prepare_metadata_df(df)
    meta_df.to_csv(meta_path, index=False)

    print(f"✅ Processed {client_id}: text + metadata")


✅ Processed client_1: text + metadata
✅ Processed client_2: text + metadata
✅ Processed client_3: text + metadata
✅ Processed client_4: text + metadata


  df["date"] = pd.to_datetime(df["date"], errors='coerce', utc=True)
