In [1]:
import re
from email.utils import parseaddr
from urllib.parse import urlparse

def contains_random_string(parts):
    """Return True if any element of ``parts`` looks like a random token.

    An element qualifies when it is longer than 10 characters and, after
    lowercasing, contains none of the letters a, e, i, o, u.
    """
    for segment in parts:
        # Short segments never qualify, whatever they contain.
        if len(segment) <= 10:
            continue
        if re.search(r'[aeiou]', segment.lower()) is None:
            return True
    return False

def extract_url_features(urls):
    """Aggregate lexical URL features over a list of URLs.

    Every feature is a running total across all entries of ``urls`` (counts
    and lengths are summed, boolean indicators count how many URLs matched).

    Args:
        urls: iterable of URL strings.

    Returns:
        dict mapping feature name to its summed integer value. All values are
        0 when ``urls`` is empty.
    """
    features = {
        "NumDots": 0, "SubdomainLevel": 0, "PathLevel": 0, "UrlLength": 0,
        "NumDash": 0, "NumDashInHostname": 0, "AtSymbol": 0, "TildeSymbol": 0,
        "NumUnderscore": 0, "NumPercent": 0, "NumQueryComponents": 0,
        "NumAmpersand": 0, "NumHash": 0, "NumNumericChars": 0, "NoHttps": 0,
        "RandomString": 0, "IpAddress": 0, "HttpsInHostname": 0,
        "HostnameLength": 0, "PathLength": 0, "QueryLength": 0,
        "DoubleSlashInPath": 0,
    }

    for url in urls:
        try:
            parsed = urlparse(url)
            hostname = parsed.hostname or ""
            path = parsed.path or ""
            query = parsed.query or ""
        except ValueError:
            # urlparse rejects some malformed netlocs (e.g. an unbalanced
            # "["). One bad URL must not abort the whole batch: keep the
            # raw-string features and treat the structured parts as empty.
            hostname = path = query = ""

        features["NumDots"] += url.count(".")
        features["SubdomainLevel"] += max(0, hostname.count(".") - 1)
        features["PathLevel"] += path.count("/")
        features["UrlLength"] += len(url)
        features["NumDash"] += url.count("-")
        features["NumDashInHostname"] += hostname.count("-")
        features["AtSymbol"] += url.count("@")
        features["TildeSymbol"] += url.count("~")
        features["NumUnderscore"] += url.count("_")
        features["NumPercent"] += url.count("%")
        features["NumQueryComponents"] += len(query.split("&")) if query else 0
        features["NumAmpersand"] += url.count("&")
        features["NumHash"] += url.count("#")
        features["NumNumericChars"] += sum(c.isdigit() for c in url)
        features["NoHttps"] += int(not url.startswith("https"))
        # Vowel test is done on the lowercased segment so that upper-case
        # words such as "DOWNLOADS..." are not mistaken for random strings.
        features["RandomString"] += int(any(
            len(part) > 10 and not re.search(r'[aeiou]', part.lower())
            for part in path.split("/")
        ))
        # fullmatch: the whole hostname must be a dotted quad; a prefix match
        # would also flag names like "1.2.3.4.example.com".
        features["IpAddress"] += int(bool(re.fullmatch(r"\d+\.\d+\.\d+\.\d+", hostname)))
        features["HttpsInHostname"] += int("https" in hostname)
        features["HostnameLength"] += len(hostname)
        features["PathLength"] += len(path)
        features["QueryLength"] += len(query)
        features["DoubleSlashInPath"] += path.count("//")

    return features

def calculate_external_link_ratios(body: str, sender: str):
    """Compute the share of links and resource URLs that leave the sender's domain.

    Args:
        body: message body (plain text and/or HTML).
        sender: value of the From header; may include a display name and
            angle brackets (``"Name <user@example.com>"``) or be empty/None.

    Returns:
        Tuple ``(pct_ext_links, pct_ext_resources)``: percentage (0-100) of
        http(s) URLs found anywhere in ``body``, and of http(s) URLs found in
        ``src=``/``href=`` attributes, whose host is neither the sender
        domain nor a subdomain of it. Each value is 0 when no URL of that
        kind exists. When no sender domain can be determined, no URL is
        counted as external.
    """
    # Extract the sender domain. parseaddr strips display name and angle
    # brackets; a plain split on "@" would yield e.g. "example.com>".
    address = parseaddr(sender or "")[1]
    if "@" not in address:
        address = sender or ""
    sender_domain = (
        address.rsplit("@", 1)[-1].strip(" <>\"'").lower() if "@" in address else ""
    )

    # All links in the body
    urls = re.findall(r'https?://[^\s"<>()]+', body)

    # Resource URLs (src= / href=)
    resource_urls = re.findall(r'(?:src|href)=["\'](https?://[^"\']+)["\']', body.lower())

    def is_external(url, ref_domain):
        """Return True if ``url``'s host is outside ``ref_domain``."""
        if not ref_domain:
            return False
        try:
            hostname = urlparse(url).hostname or ""
        except ValueError:
            # Unparsable URL: no usable host, so it cannot belong to the sender.
            hostname = ""
        # The permissive URL regex can pick up trailing sentence punctuation.
        hostname = hostname.lower().rstrip(".,;:!?'")
        # Label-aware comparison: a substring test would accept
        # "paypal.com.evil.ru" as belonging to "paypal.com".
        return not (hostname == ref_domain or hostname.endswith("." + ref_domain))

    # Ratios
    num_ext_links = sum(is_external(u, sender_domain) for u in urls)
    pct_ext_links = (num_ext_links / len(urls)) * 100 if urls else 0

    num_ext_resources = sum(is_external(u, sender_domain) for u in resource_urls)
    pct_ext_resources = (num_ext_resources / len(resource_urls)) * 100 if resource_urls else 0

    return pct_ext_links, pct_ext_resources

def domain_from_email(email_address):
    """Return the lowercased domain part of an e-mail address or address header.

    Accepts bare addresses (``user@example.com``) as well as header-style
    values with a display name (``"Name <user@example.com>"``).

    Returns:
        The domain as a lowercase string, or ``""`` when ``email_address`` is
        empty/None or contains no ``@``.
    """
    if not email_address or "@" not in email_address:
        return ""
    # parseaddr removes the display name and angle brackets; without it a
    # header like "Name <user@example.com>" would yield "example.com>".
    address = parseaddr(email_address)[1]
    if "@" not in address:
        # Parser could not isolate an address; fall back to the raw value.
        address = email_address
    return address.rsplit("@", 1)[-1].strip(" <>\"'").lower()

def get_main_domain(hostname):
    """Return the last two dot-separated labels of ``hostname``.

    The result is lowercased and a trailing root dot is ignored, so
    ``"www.Example.com."`` gives ``"example.com"``. A single-label name is
    returned as is (lowercased); an empty or None hostname gives ``""``.

    NOTE(review): this is a purely positional rule; multi-part public
    suffixes such as "co.uk" are not recognised.
    """
    if not hostname:
        return ""
    # Drop the optional trailing dot of a fully-qualified name, otherwise the
    # last "label" would be empty and the result would be e.g. "com.".
    labels = hostname.strip().rstrip(".").lower().split(".")
    return ".".join(labels[-2:]) if len(labels) >= 2 else labels[0]

def extract_features_from_eml(subject, sender, recipient, body, filename=""):
    """Build the feature dictionary for a single e-mail.

    Args:
        subject: Subject header (currently not used by any feature).
        sender: From header; may contain a display name.
        recipient: To header; may contain a display name or be None.
        body: message body text (plain text and/or HTML).
        filename: file name used to derive ``CLASS_LABEL`` ("phish" -> 1,
            "legit" -> 0, otherwise "unknown").

    Returns:
        dict of feature name -> value. It contains the aggregated URL
        features from ``extract_url_features`` plus body/HTML indicators.

    NOTE(review): HttpsInHostname, HostnameLength, PathLength, QueryLength
    and DoubleSlashInPath are first filled with totals over all URLs and then
    overwritten below with values of the first URL only - confirm which of
    the two is intended.
    """
    features = {}

    # Lowercase once; most HTML indicators below are case-insensitive checks.
    body_lower = body.lower()

    urls = re.findall(r'https?://[^\s"<>()]+', body)

    known_brands = ["paypal", "apple", "sparkasse", "amazon"]

    sensitive_words = [
        "account", "verify", "secure", "login", "update", "confirm",
        "password", "click", "access", "billing", "bank", "credit", "ssn",
        "social", "security", "alert", "unusual", "attempt", "fraud",
        "locked", "expired", "immediately", "urgent", "attention",
        "suspend", "important", "reset", "re-enter",

        "konto", "verifizieren", "sicher", "einloggen", "anmelden",
        "passwort", "aktualisieren", "bestätigen", "zahlung", "abrechnung",
        "kreditkarte", "sofort", "dringend", "wichtig", "gesperrt",
        "abgelaufen", "zugang", "identität", "prüfung", "eingabe",
        "sicherheitsüberprüfung", "reaktivieren", "informationen", "bank",
        "onlinebanking", "pin", "tan", "sicherheitscode", "freischalten"
    ]

    def _hostname_of(url):
        """Lowercased host of ``url``; "" if missing or the URL is unparsable."""
        try:
            return (urlparse(url).hostname or "").lower()
        except ValueError:
            return ""

    url_features = extract_url_features(urls)
    features.update(url_features)

    # Components of the first URL. hostname/path/query are always strings:
    # urlparse may report hostname as None and may raise on a malformed netloc.
    hostname = path = query = ""
    if urls:
        try:
            parsed = urlparse(urls[0])
        except ValueError:
            parsed = None
        if parsed is not None:
            hostname = parsed.hostname or ""
            path = parsed.path or ""
            query = parsed.query or ""

    # Local part of the recipient address (everything before the "@").
    # parseaddr drops a display name such as 'Name <user@example.com>'.
    recipient_address = parseaddr(recipient or "")[1]
    if "@" not in recipient_address:
        recipient_address = recipient or ""
    recipient_local = recipient_address.split("@")[0] if "@" in recipient_address else ""

    # dict.fromkeys de-duplicates while keeping order: "bank" is listed in
    # both the English and the German group and must only count once.
    features["NumSensitiveWords"] = sum(
        bool(re.search(rf'\b{re.escape(word)}\b', body, flags=re.IGNORECASE))
        for word in dict.fromkeys(sensitive_words)
    )

    sender_domain = domain_from_email(sender)
    link_domain = get_main_domain(hostname)
    # Compare main domain with main domain so that a sender such as
    # "mail.example.com" is not reported as mismatching "example.com".
    sender_main_domain = get_main_domain(sender_domain)
    # bool(...) is required: the "and" chain yields "" when a domain is
    # missing, and int("") raises ValueError.
    features["FrequentDomainNameMismatch"] = int(bool(
        sender_main_domain and link_domain and sender_main_domain != link_domain
    ))

    pct_ext_links, pct_ext_resources = calculate_external_link_ratios(body, sender)
    features["PctExtHyperlinks"] = pct_ext_links
    features["PctExtResourceUrls"] = pct_ext_resources

    form_actions = re.findall(r'action=["\'](https?://[^"\']+)["\']', body_lower)
    features["AbnormalFormAction"] = int(any(
        bool(host) and sender_domain not in host
        for host in map(_hostname_of, form_actions)
    ))

    features["HttpsInHostname"] = int("https" in hostname)
    # An empty recipient_local must not match: "" is a substring of everything.
    features["DomainInSubdomains"] = int(
        bool(recipient_local) and recipient_local in ".".join(hostname.split(".")[:-2])
    )
    features["DomainInPaths"] = int(bool(recipient_local) and recipient_local in path)
    features["HostnameLength"] = len(hostname)
    features["PathLength"] = len(path)
    features["QueryLength"] = len(query)
    features["DoubleSlashInPath"] = path.count("//")
    features["EmbeddedBrandName"] = int(any(b in body_lower for b in known_brands))
    features["ExtFavicon"] = int("favicon" in body_lower and "http" in body_lower)
    features["InsecureForms"] = int("<form" in body_lower and "http:" in body_lower)
    features["RelativeFormAction"] = int('action="/' in body_lower)
    features["ExtFormAction"] = int('action="http' in body_lower)
    features["PctNullSelfRedirectHyperlinks"] = int('href="#"' in body_lower) / len(urls) * 100 if urls else 0
    features["FakeLinkInStatusBar"] = int("onmouseover" in body_lower and "status" in body_lower)
    features["RightClickDisabled"] = int("event.button==2" in body_lower)
    features["PopUpWindow"] = int("window.open" in body_lower)
    features["SubmitInfoToEmail"] = int("mailto:" in body_lower)
    features["IframeOrFrame"] = int("<iframe" in body_lower or "<frame" in body_lower)
    features["MissingTitle"] = int("<title" not in body_lower)
    features["ImagesOnlyInForm"] = int("<form" in body_lower and "<img" in body_lower)
    # The *RT columns mirror the value of their base feature.
    features["SubdomainLevelRT"] = features["SubdomainLevel"]
    features["UrlLengthRT"] = features["UrlLength"]
    features["PctExtResourceUrlsRT"] = features["PctExtResourceUrls"]
    features["AbnormalExtFormActionR"] = features["AbnormalFormAction"]
    features["ExtMetaScriptLinkRT"] = int("<script" in body_lower or "<meta" in body_lower or "<link" in body_lower)
    features["PctExtNullSelfRedirectHyperlinksRT"] = features["PctNullSelfRedirectHyperlinks"]

    # Label is derived from the file name only.
    filename_lower = filename.lower()
    if "phish" in filename_lower:
        features["CLASS_LABEL"] = 1
    elif "legit" in filename_lower:
        features["CLASS_LABEL"] = 0
    else:
        features["CLASS_LABEL"] = "unknown"

    return features

In [2]:
import os
import email
from email import policy
import pandas as pd

def process_eml_folder(folder_path):
    """Extract features from every ``.eml`` file directly inside ``folder_path``.

    Files are processed in sorted name order so the resulting row order is
    reproducible. For multipart messages the contents of all ``text/plain``
    and ``text/html`` parts are concatenated into one body string; for a
    single-part message the content is used only if it is textual.

    Args:
        folder_path: directory containing the ``.eml`` files.

    Returns:
        pandas.DataFrame with one row per file: the features returned by
        ``extract_features_from_eml`` plus a ``FILENAME`` column.
    """
    all_features = []

    # os.listdir returns entries in arbitrary order; sort for stable output.
    for file in sorted(os.listdir(folder_path)):
        if not file.lower().endswith(".eml"):
            continue
        file_path = os.path.join(folder_path, file)

        # Read the .eml file as bytes so the parser can decode each part with
        # its declared charset/transfer encoding instead of forcing UTF-8 on
        # the raw message and silently dropping undecodable bytes.
        with open(file_path, 'rb') as f:
            msg = email.message_from_binary_file(f, policy=policy.default)

        subject = msg['subject']
        sender = msg['from']
        recipient = msg['to']
        body = ""

        if msg.is_multipart():
            for part in msg.walk():
                content_type = part.get_content_type()
                if content_type in ["text/plain", "text/html"]:
                    body += part.get_content()
        elif msg.get_content_maintype() == "text":
            # Non-text single-part messages yield bytes from get_content(),
            # which the string-based feature extraction cannot handle.
            body = msg.get_content()

        # `file` comes from os.listdir and is therefore already a bare name.
        features = extract_features_from_eml(subject, sender, recipient, body, filename=file)

        features["FILENAME"] = file

        # Collect the result
        all_features.append(features)

    df = pd.DataFrame(all_features)
    return df

In [3]:
# Run the extraction over all .eml files in ../data/mails; `df` holds one
# feature row per file and is inspected in the following cells.
folder = "../data/mails"
df = process_eml_folder(folder)

In [4]:
df['PctExtHyperlinks']

0    100.0
1    100.0
2    100.0
3      0.0
4    100.0
5    100.0
6     50.0
Name: PctExtHyperlinks, dtype: float64

In [5]:
df['PctExtResourceUrls']

0    100.0
1    100.0
2    100.0
3      0.0
4    100.0
5    100.0
6     50.0
Name: PctExtResourceUrls, dtype: float64

In [6]:
df['FrequentDomainNameMismatch']

0    1
1    1
2    1
3    0
4    1
5    1
6    0
Name: FrequentDomainNameMismatch, dtype: int64

In [7]:
df['NumSensitiveWords']

0    3
1    2
2    5
3    0
4    3
5    2
6    5
Name: NumSensitiveWords, dtype: int64

In [8]:
df

Unnamed: 0,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,...,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,CLASS_LABEL,FILENAME
0,65,25,87,2833,50,13,0,22,46,0,...,0,0,25,2833,100.0,0,1,0.0,0,legit_DELTARUNE.eml
1,403,113,228,16869,206,87,0,0,155,0,...,0,0,113,16869,100.0,0,1,0.0,0,legit_fitbit.eml
2,233,150,290,12083,135,0,0,174,149,0,...,0,0,150,12083,100.0,0,1,0.0,0,legit_ikea.eml
3,1,0,1,34,0,0,0,0,0,0,...,0,0,0,34,0.0,0,0,0.0,0,legit_test1.eml
4,30,10,26,955,0,0,0,0,0,0,...,0,0,10,955,100.0,0,1,0.0,1,phish_mail1.eml
5,2,1,4,67,0,0,0,0,0,0,...,0,0,1,67,100.0,0,0,0.0,1,phish_sogo-bitpanda.eml
6,7,1,4,128,3,3,0,0,0,0,...,1,0,1,128,50.0,0,0,0.0,1,phish_test1.eml
