In [1]:
import pandas as pd
import numpy as np
import os
import re
from urllib.parse import urlparse
from sklearn.preprocessing import LabelEncoder
import warnings
from dateutil.parser import UnknownTimezoneWarning

warnings.filterwarnings("ignore", category=UnknownTimezoneWarning)


In [2]:
def extract_domain(email):
    """Return the lower-cased domain part of an e-mail address.

    Handles bare addresses ("user@host.com") as well as display-name forms
    ("Name <user@host.com>"), whose trailing ">" would otherwise end up in
    the domain and break TLD / public-domain lookups.

    Parameters
    ----------
    email : Any
        Raw sender/receiver value; may be a non-string such as NaN or None.

    Returns
    -------
    str
        The text after the last "@", lower-cased, with surrounding
        whitespace, angle brackets and quotes removed. A string without "@"
        is returned whole (cleaned the same way). Non-string input yields
        "unknown".
    """
    # Non-strings (NaN, None, numbers) have no domain to extract; checking the
    # type explicitly replaces the former catch-all ``except``.
    if not isinstance(email, str):
        return "unknown"
    domain = email.rsplit("@", 1)[-1]
    return domain.strip(" \t\r\n<>\"'").lower()

def is_public_domain(domain):
    """Flag whether ``domain`` exactly matches a listed free webmail provider.

    The comparison is case-sensitive. Returns 1 on a match, otherwise 0.
    """
    free_mail_providers = frozenset((
        "gmail.com",
        "yahoo.com",
        "outlook.com",
        "hotmail.com",
        "aol.com",
        "protonmail.com",
    ))
    if domain in free_mail_providers:
        return 1
    return 0

def count_suspicious_chars(s):
    """Count how many times any of ``- _ ~ % =`` occurs in ``s``."""
    total = 0
    for marker in ("-", "_", "~", "%", "="):
        total += s.count(marker)
    return total

def has_digits(s):
    """Return 1 if ``s`` contains at least one digit character, else 0."""
    for ch in s:
        if ch.isdigit():
            return 1
    return 0

def get_tld(domain):
    """Return the text after the last "." in ``domain``.

    A domain that contains no "." at all yields "unknown".
    """
    _, dot, suffix = domain.rpartition(".")
    if not dot:
        return "unknown"
    return suffix


In [3]:
def extract_url_features(text):
    """Summarise the http(s) URLs found in ``text`` as seven numeric features.

    Parameters
    ----------
    text : Any
        E-mail body; converted with ``str()`` before matching, so a missing
        value such as NaN simply yields no URLs.

    Returns
    -------
    pandas.Series
        Positional values, in order:
        0. number of URLs matched,
        1. mean URL length (0 when there are none),
        2. maximum URL length (0 when there are none),
        3. 1 if any URL's host starts with a dotted-quad IPv4 pattern, else 0,
        4. total count of ``? & = @ %`` characters across all URLs,
        5. 1 if any URL contains "//" or "http" more than once, else 0,
        6. 1 if any URL's host ends in one of the suspicious TLDs, else 0.
    """
    urls = re.findall(r"(https?://[^\s]+)", str(text))
    url_count = len(urls)
    url_lengths = [len(u) for u in urls]
    avg_length = np.mean(url_lengths) if url_lengths else 0
    max_length = np.max(url_lengths) if url_lengths else 0
    has_ip = int(any(re.search(r"http[s]?://(?:\d{1,3}\.){3}\d{1,3}", u) for u in urls))
    has_special_chars = sum(u.count(ch) for u in urls for ch in "?&=@%")
    has_redirect = int(any(u.count("//") > 1 or u.count("http") > 1 for u in urls))

    suspicious_tlds = {"tk", "xyz", "ru", "top", "ml", "ga", "cf", "gq"}
    tlds = []
    for u in urls:
        try:
            # ``hostname`` (unlike ``netloc``) is lower-cased and has any
            # "user@" prefix and ":port" suffix removed.
            host = urlparse(u).hostname
        except ValueError:
            # urlparse rejects some malformed hosts (e.g. an unbalanced "[").
            continue
        # The whitespace-delimited URL regex can drag in trailing punctuation
        # or markup (">", ",", '"'); keep only the leading run of host chars.
        host_match = re.match(r"[\w.-]+", host or "")
        if host_match is None:
            continue
        labels = host_match.group(0).rstrip(".").split(".")
        # Same rule as for sender domains: a host without any "." has no TLD.
        tlds.append(labels[-1] if len(labels) > 1 else "unknown")
    has_suspicious_tld = int(any(tld in suspicious_tlds for tld in tlds))

    return pd.Series([url_count, avg_length, max_length, has_ip,
                      has_special_chars, has_redirect, has_suspicious_tld])


In [6]:
def build_metadata_features(df):
    """Derive the numeric metadata feature table for a frame of e-mails.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "sender", "receiver", "date", "subject",
        "body", "urls" and "label". The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per input row with sender/receiver domain features, date
        features, URL features extracted from the body, text statistics,
        "url_present" (a copy of the input "urls" column) and "label".

    Raises
    ------
    KeyError
        If any required input column is missing.
    """
    required_columns = ("sender", "receiver", "date", "subject", "body", "urls", "label")
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise KeyError(f"build_metadata_features: missing required column(s): {missing}")

    df = df.copy()
    df["sender_domain"] = df["sender"].apply(extract_domain)
    df["receiver_domain"] = df["receiver"].apply(extract_domain)

    df["sender_domain_length"] = df["sender_domain"].apply(len)
    df["sender_has_digits"] = df["sender_domain"].apply(has_digits)
    df["sender_has_special_chars"] = df["sender_domain"].apply(count_suspicious_chars)
    df["sender_tld"] = df["sender_domain"].apply(get_tld)
    df["sender_is_public_domain"] = df["sender_domain"].apply(is_public_domain)
    df["receiver_is_undisclosed"] = df["receiver"].fillna("").str.contains("undisclosed", case=False).astype(int)
    df["receiver_is_public_domain"] = df["receiver_domain"].apply(is_public_domain)
    df["sender_equals_receiver"] = (df["sender_domain"] == df["receiver_domain"]).astype(int)

    # Date-based features. utc=True normalises every timestamp to UTC, so the
    # hour/weekday below are UTC values; unparseable dates become NaT and are
    # encoded as -1.
    df["date"] = pd.to_datetime(df["date"], errors='coerce', utc=True)
    df["email_hour"] = df["date"].dt.hour.fillna(-1).astype(int)
    df["email_weekday"] = df["date"].dt.weekday.fillna(-1).astype(int)
    df["is_weekend"] = df["email_weekday"].isin([5, 6]).astype(int)
    df["is_midnight_hour"] = df["email_hour"].isin(range(0, 6)).astype(int)
    df["has_valid_date"] = df["date"].notna().astype(int)

    # URL-based features from body
    url_columns = ["url_count_in_body", "url_avg_length", "url_max_length",
                   "url_has_ip", "url_has_special_chars", "url_has_redirect", "url_suspicious_tld"]
    url_feats = df["body"].apply(extract_url_features)
    if isinstance(url_feats, pd.DataFrame):
        url_feats.columns = url_columns
    else:
        # On an empty frame apply() returns an empty Series instead of a
        # seven-column DataFrame; build the empty columns explicitly.
        url_feats = pd.DataFrame(columns=url_columns, index=df.index, dtype=float)
    df = pd.concat([df, url_feats], axis=1)

    # Text statistics
    def _uppercase_ratio(value):
        # Share of upper-case characters in str(value); 0 for an empty string.
        text = str(value)
        return sum(1 for c in text if c.isupper()) / len(text) if text else 0

    df["subject_length"] = df["subject"].fillna("").apply(len)
    df["body_length"] = df["body"].fillna("").apply(len)
    df["text_combined_length"] = df["subject_length"] + df["body_length"]
    df["uppercase_ratio"] = df["body"].apply(_uppercase_ratio)
    df["exclamation_count"] = df["body"].fillna("").str.count("!")

    # presumably "urls" is a precomputed URL-presence flag - TODO confirm
    df["url_present"] = df["urls"]

    # NOTE(review): the encoder is fitted on this frame only, so the integer
    # assigned to a given TLD depends on which TLDs occur in `df`; codes from
    # separate calls are not comparable with each other.
    le = LabelEncoder()
    df["sender_tld"] = le.fit_transform(df["sender_tld"].astype(str))

    selected_columns = [
        "sender_domain_length", "sender_has_digits", "sender_has_special_chars",
        "sender_tld", "sender_is_public_domain", "receiver_is_undisclosed",
        "receiver_is_public_domain", "sender_equals_receiver", "email_hour",
        "email_weekday", "is_weekend", "is_midnight_hour", "has_valid_date",
        "url_present", "url_count_in_body", "url_avg_length", "url_max_length",
        "url_has_ip", "url_has_special_chars", "url_has_redirect", "url_suspicious_tld",
        "subject_length", "body_length", "text_combined_length", "uppercase_ratio",
        "exclamation_count", "label"
    ]
    return df[selected_columns]


In [7]:
# Input CSV for each client, keyed by the client name used in the output file name.
client_paths = {
    "client_1": "../data/clients/client_1.csv",
    "client_2": "../data/clients/client_2.csv",
    "client_3": "../data/clients/client_3.csv",
    "client_4": "../data/clients/client_4.csv"
}

# Destination folder for the per-client feature tables; created if it does not exist.
output_dir = "../data/processed/metadata_v2"
os.makedirs(output_dir, exist_ok=True)

# Read each client's CSV, build its feature table, and write the result to
# <output_dir>/<client>_meta_v2.csv without the index column.
for client, path in client_paths.items():
    df = pd.read_csv(path)
    processed = build_metadata_features(df)
    out_path = os.path.join(output_dir, f"{client}_meta_v2.csv")
    processed.to_csv(out_path, index=False)
    print(f"✅ Saved enhanced metadata for {client} → {out_path}")


✅ Saved enhanced metadata for client_1 → ../data/processed/metadata_v2/client_1_meta_v2.csv
✅ Saved enhanced metadata for client_2 → ../data/processed/metadata_v2/client_2_meta_v2.csv
✅ Saved enhanced metadata for client_3 → ../data/processed/metadata_v2/client_3_meta_v2.csv


  df["date"] = pd.to_datetime(df["date"], errors='coerce', utc=True)


✅ Saved enhanced metadata for client_4 → ../data/processed/metadata_v2/client_4_meta_v2.csv
