# === Needed imports ===

In [None]:
# Data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Heatmap
import seaborn as sns

# Model training and evaluation utilities
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix

# Show up to 100 columns when a dataframe is displayed
pd.set_option('display.max_columns', 100)

from xgboost import XGBClassifier

# Base dataset used throughout the notebook (path is relative to the notebook location)
df = pd.read_csv('../data/Phishing_Legitimate_full.csv')

# === Analyzing the dataset ===

In [None]:
# Preview the first 5 rows of the dataframe
df.head()

In [None]:
# Overview of all columns (names, dtypes, non-null counts)
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Label column = classification (phishing / no phishing); renamed in place to 'labels'
df.rename(columns={'CLASS_LABEL': 'labels'}, inplace=True)

# Class balance: bar chart of the number of rows per label value
df['labels'].value_counts().plot(kind='bar')

In [None]:
# Number of null values per column
print(df.isnull().sum())

In [None]:
# Column "id" is only a row identifier and is not useful for training a model
df = df.drop(columns=['id'])

In [None]:
# Spot outliers in "UrlLength" with a box plot
df['UrlLength'].plot(kind='box')

In [None]:
# Interquartile range of "UrlLength"
Q1 = df['UrlLength'].quantile(0.25)
Q3 = df['UrlLength'].quantile(0.75)
IQR = Q3 - Q1
print(f"IQR: {IQR}")

# Outliers = values more than 1.5 * IQR below Q1 or above Q3
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = df[(df['UrlLength'] < lower_bound) | (df['UrlLength'] > upper_bound)]
print(f"Count of Outliers (UrlLength): {len(outliers)}")

In [None]:
# Spot outliers in "NumDots" with a box plot
df['NumDots'].plot(kind='box')

In [None]:
# Interquartile range of "NumDots"
Q1 = df['NumDots'].quantile(0.25)
Q3 = df['NumDots'].quantile(0.75)
IQR = Q3 - Q1
print(f"IQR: {IQR}")

# Outliers = values more than 1.5 * IQR below Q1 or above Q3
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = df[(df['NumDots'] < lower_bound) | (df['NumDots'] > upper_bound)]
print(f"Count of Outliers (NumDots): {len(outliers)}")

In [None]:
# Correlation of every feature with 'labels', sorted in descending order
# (the label's correlation with itself is dropped)
correlation = df.corr()['labels'].drop('labels').sort_values(ascending=False)

# Plotting

# Tall, narrow figure: one heatmap row per feature
plt.figure(figsize=(8, 15))

# Single-column heatmap of the sorted correlations, annotated with two decimals
sns.heatmap(correlation.to_frame(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=1, cbar=False)

plt.title("Correlation of Features with 'labels'")
plt.xticks(rotation=45)

plt.show()

## == Analysis of Empty Column: "HttpsInHostname" ==

In [None]:
# Distribution of the values in "HttpsInHostname"
print(df['HttpsInHostname'].value_counts())  

In [None]:
# "HttpsInHostname" is dropped because its correlation with the label cannot be calculated
df = df.drop(columns=['HttpsInHostname'])

# === RandomForest for Calculation of importance of the features ===

In [None]:
# Feature matrix (all columns except the label) and target vector
X = df.drop(columns=['labels'])
y = df['labels']

# Random forest fitted on the complete dataset; used here only to rank the features.
# NOTE(review): the ranking is computed on all rows, including those that later end up
# in test splits, so scores of models built on `top_features` may be optimistic.
model = RandomForestClassifier(n_estimators=100, random_state=77)
model.fit(X, y)
feature_importance = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)

# Bar chart of the importances, most important feature first
plt.figure(figsize=(10, 6))
feature_importance.plot(kind='bar', color='steelblue')
plt.title("Feature Importance from Random Forest")
plt.show()

# === RandomForest with reduced amount of features ===

In [None]:
# Top 15 most important features according to the full-feature forest
top_features = feature_importance[:15].index

# 80/20 train/test split on the reduced feature set
X_reduced = df[top_features]
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=77)

model_reduced = RandomForestClassifier(n_estimators=100, random_state=77)
model_reduced.fit(X_train, y_train)

y_pred = model_reduced.predict(X_test)

# Importances of the reduced model are stored under their own name. Overwriting
# `feature_importance` would make a re-run of this cell pick `top_features` from the
# 15-feature ranking (different order) instead of the full-feature ranking.
feature_importance_reduced = pd.Series(
    model_reduced.feature_importances_, index=X_reduced.columns
).sort_values(ascending=False)

plt.figure(figsize=(10, 6))
feature_importance_reduced.plot(kind='bar', color='steelblue')
plt.title("Feature Importance for reduced features from Random Forest")
plt.show()

print(classification_report(y_test, y_pred))

In [None]:
# Cross-Validation with 5 folds on the reduced feature set
# (cross_val_score's default scoring for a classifier is the estimator's score, i.e. accuracy)

scores = cross_val_score(model_reduced, X_reduced, y, cv=5)

# Mean and standard deviation of the fold scores
print(f"Cross-Validation Accuracy: {np.mean(scores):.2f} ± {np.std(scores):.2f}")

In [None]:
# Pairwise correlation between the selected top features (heatmap without annotations)
plt.figure(figsize=(12, 10))
sns.heatmap(df[top_features].corr(), cmap="coolwarm", annot=False)
plt.title("Correlation between Features")

plt.show()

### == Check correlation for the 15 most important features ==

In [None]:
# Correlation of each top feature with the label, highest first
print(X_reduced.corrwith(y).sort_values(ascending=False))

# === XGBoost with Top-Features ===

In [None]:
# Same 80/20 split (same seed) as used for the reduced random forest
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=77)

# subsample / colsample_bytree < 1 make training stochastic, so the model is seeded
# (77, like every other model in this notebook) to keep the results reproducible.
model_xgb = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=77,
)

model_xgb.fit(X_train, y_train)

y_pred_xgb = model_xgb.predict(X_test)

print(classification_report(y_test, y_pred_xgb))

# === Reserve 5% of the original data for testing ===

In [None]:
# Hold-out experiment: feature selection and training use only the 95% training part,
# the remaining 5% is used exclusively for evaluation.

# Prepare full feature matrix and labels
X_full_all = df.drop("labels", axis=1)
y_full_all = df["labels"]

# 95% train, 5% test (stratified)
X_train_full, X_eval_full, y_train_full, y_eval_full = train_test_split(
    X_full_all, y_full_all, test_size=0.05, random_state=77, stratify=y_full_all
)

# Train full-feature model on 95% of data
model_full_split = RandomForestClassifier(n_estimators=100, random_state=77)
model_full_split.fit(X_train_full, y_train_full)

# Feature importances from model_full_split; the 15 highest-ranked features are kept
importances_split = pd.Series(model_full_split.feature_importances_, index=X_train_full.columns)
top_features_split = importances_split.sort_values(ascending=False)[:15].index

# Reduce datasets to top features
X_train_reduced = X_train_full[top_features_split]
X_eval_reduced = X_eval_full[top_features_split]

# Train reduced model
model_reduced_split = RandomForestClassifier(n_estimators=100, random_state=77)
model_reduced_split.fit(X_train_reduced, y_train_full)

# Predictions on the held-out 5%
y_pred_full_split = model_full_split.predict(X_eval_full)
y_pred_reduced_split = model_reduced_split.predict(X_eval_reduced)

# Evaluation: Full-feature model
print("Evaluation (Full Feature Model, 5% test split)")
print(classification_report(y_eval_full, y_pred_full_split))
print(confusion_matrix(y_eval_full, y_pred_full_split))

# Evaluation: Top-feature model
print("\nEvaluation (Top Feature Model, 5% test split)")
print(classification_report(y_eval_full, y_pred_reduced_split))
print(confusion_matrix(y_eval_full, y_pred_reduced_split))

# === Test the model (all features) with generated data (10 rows) ===

In [None]:
# Test set of 10 rows generated by ChatGPT to resemble the original data
df_testset = pd.read_csv("../data/phishing_mini_testset_ten-rows.csv")

# Drop the label and the column that was removed from the training data.
# NOTE: this overwrites the X_test / y_test / y_pred variables of the earlier cells.
X_test = df_testset.drop(columns=["CLASS_LABEL", "HttpsInHostname"])
y_test = df_testset["CLASS_LABEL"]

# `model` is the full-feature random forest fitted on the complete dataset
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Rows whose prediction differs from the true label
print("\nMisclassifications")
df_errors = df_testset[y_pred != y_test]
display(df_errors)

# === Test the model with generated data (100 rows) - but many duplicates ===

In [None]:
# Generated test set with 100 rows (contains many duplicates, see section title)
df_testset_100_rows_with_duplicates = pd.read_csv("../data/phishing_testset_hundred-rows.csv")

# Full feature input and the top-feature subset of it
X_test_full = df_testset_100_rows_with_duplicates.drop(columns=["CLASS_LABEL", "HttpsInHostname"])
X_test_top = X_test_full[top_features]
y_test = df_testset_100_rows_with_duplicates["CLASS_LABEL"]

# Predictions of the full-feature and the reduced model
y_pred_full = model.predict(X_test_full)
y_pred_top = model_reduced.predict(X_test_top)

# Reports: first the full-feature model, then the top-feature model
print(classification_report(y_test, y_pred_full))
print(confusion_matrix(y_test, y_pred_full))
print(classification_report(y_test, y_pred_top))
print(confusion_matrix(y_test, y_pred_top))

### == Test the model with generated data (100 rows) - but many duplicates V2 ==

Although the models used (`model` and `model_reduced`) were not modified, a phishing detection rate of 0% is suspicious.

For this reason, two new models were trained specifically for this case.

In [None]:
# Retrieve the new X from the full csv
X_full_100 = df.drop(columns=["labels"])
y_full_100 = df["labels"]

# Retrieve the new X but only with the 15 top features
X_top_100 = df[top_features]

# Train full model again
# NOTE(review): same data, hyper-parameters and seed as `model` above, so an identical
# forest is expected — confirm that a fresh model is really needed here.
model_full_100 = RandomForestClassifier(n_estimators=100, random_state=77)
model_full_100.fit(X_full_100, y_full_100)

# Train reduced-feature model again (on all rows, unlike `model_reduced`, which saw 80%)
model_top_100 = RandomForestClassifier(n_estimators=100, random_state=77)
model_top_100.fit(X_top_100, y_full_100)

# Load generated test set (with duplicates)

df_testset_100_faulty = pd.read_csv("../data/phishing_testset_hundred-rows.csv")

X_test_full = df_testset_100_faulty.drop(columns=["CLASS_LABEL", "HttpsInHostname"])
X_test_top = X_test_full[top_features]
y_test = df_testset_100_faulty["CLASS_LABEL"]

# Evaluate full-feature model
y_pred_full = model_full_100.predict(X_test_full)

print("Evaluation: Full-Feature Model")
print(classification_report(y_test, y_pred_full))
print(confusion_matrix(y_test, y_pred_full))

# Evaluate top-feature model
y_pred_top = model_top_100.predict(X_test_top)

print("\nEvaluation: Top-Feature Model")
print(classification_report(y_test, y_pred_top))
print(confusion_matrix(y_test, y_pred_top))

These results indicate that the reduced model fails to identify generated phishing samples in this dataset, likely due to insufficient representation of relevant features.

# === Test the model with generated data (100 rows) - unique rows ===

In [None]:
# Test set of 100 unique rows generated by ChatGPT to resemble the original data
df_testset_100_unique = pd.read_csv("../data/phishing_testset_100_unique.csv")

# Input features and labels
X_test_full = df_testset_100_unique.drop(columns=["CLASS_LABEL", "HttpsInHostname"])
y_test = df_testset_100_unique["CLASS_LABEL"]

# Prepare reduced feature input
X_test_top = X_test_full[top_features]

# Predictions of the original full-feature and reduced models
y_pred_full = model.predict(X_test_full)
y_pred_top = model_reduced.predict(X_test_top)

# Full-feature model evaluation
print("Evaluation: Full-Feature Model")
print(classification_report(y_test, y_pred_full))
print(confusion_matrix(y_test, y_pred_full))

# Top-feature model evaluation
print("\nEvaluation: Top-Feature Model")
print(classification_report(y_test, y_pred_top))
print(confusion_matrix(y_test, y_pred_top))

### == Test the model with generated data (100 rows) - unique rows V2 ==

Again, two new models were trained.

In [None]:
# Train new models for the unique 100-row test set

# Full feature matrix / labels and the top-feature subset, taken from the base dataframe
X_full_unique = df.drop(columns=["labels"])
y_full_unique = df["labels"]

X_top_unique = df[top_features]

# Train full-feature model again
model_full_unique = RandomForestClassifier(n_estimators=100, random_state=77)
model_full_unique.fit(X_full_unique, y_full_unique)

# Train top-feature model again
model_top_unique = RandomForestClassifier(n_estimators=100, random_state=77)
model_top_unique.fit(X_top_unique, y_full_unique)

# Load generated test set (100 unique rows)

df_testset_100_unique = pd.read_csv("../data/phishing_testset_100_unique.csv")

X_test_full = df_testset_100_unique.drop(columns=["CLASS_LABEL", "HttpsInHostname"])
X_test_top = X_test_full[top_features]
y_test = df_testset_100_unique["CLASS_LABEL"]

# Evaluate full-feature model
y_pred_full = model_full_unique.predict(X_test_full)

print("Evaluation: Full-Feature Model")
print(classification_report(y_test, y_pred_full))
print(confusion_matrix(y_test, y_pred_full))

# Evaluate top-feature model
y_pred_top = model_top_unique.predict(X_test_top)

print("\nEvaluation: Top-Feature Model")
print(classification_report(y_test, y_pred_top))
print(confusion_matrix(y_test, y_pred_top))

# === Reading E-Mails from .eml-Files ===

In [None]:
import re
from urllib.parse import urlparse

# Limit a value to the closed range [min_val, max_val] (used to normalize feature values)
def clamp(value, min_val, max_val):
    """Return *value*, capped at *max_val* and then raised to at least *min_val*."""
    # Upper bound first, lower bound second: if the bounds are inverted, min_val wins.
    capped = max_val if max_val < value else value
    return min_val if min_val > capped else capped

# Extract technical URL-based features from url-list
def extract_url_features(urls):
    """Aggregate lexical URL features over all URLs in *urls*.

    Every feature is summed across the given URLs; afterwards the length and
    subdomain features are clamped to fixed ranges.

    Parameters:
        urls: Iterable of URL strings.

    Returns:
        dict mapping feature name to its aggregated integer value. With no URLs
        all counters stay 0, except the clamped features, which take their lower
        bound (HostnameLength 4, UrlLength 12).
    """
    features = {
        "NumDots": 0, "SubdomainLevel": 0, "PathLevel": 0, "UrlLength": 0,
        "NumDash": 0, "NumDashInHostname": 0, "AtSymbol": 0, "TildeSymbol": 0,
        "NumUnderscore": 0, "NumPercent": 0, "NumQueryComponents": 0,
        "NumAmpersand": 0, "NumHash": 0, "NumNumericChars": 0, "NoHttps": 0,
        "RandomString": 0, "IpAddress": 0, "HttpsInHostname": 0,
        "HostnameLength": 0, "PathLength": 0, "QueryLength": 0,
        "DoubleSlashInPath": 0,
    }

    for url in urls:
        parsed = urlparse(url)
        hostname = parsed.hostname or ""
        path = parsed.path or ""
        query = parsed.query or ""

        # Counts how often '//' appears inside the path (hidden redirects?)
        features["DoubleSlashInPath"] += path.count("//")

        # 0 / 1 if the hostname itself contains the string "https"
        features["HttpsInHostname"] += int("https" in hostname)

        # 0 / 1 if the hostname is an IP address instead of a domain name.
        # fullmatch is required: a prefix match would also flag hostnames such as
        # "1.2.3.4.example.com", which are ordinary domain names.
        features["IpAddress"] += int(bool(re.fullmatch(r"\d+\.\d+\.\d+\.\d+", hostname)))

        # Number of key=value components in the query string (e.g., ?a=1&b=2 -> 2 components)
        features["NumQueryComponents"] += len(query.split("&")) if query else 0

        # 0 / 1 if any path segment is unusually long and lacks vowels (often machine-generated)
        features["RandomString"] += int(any(
            len(part) > 10 and not re.search(r'[aeiou]', part)
            for part in path.split("/")
        ))

        # Number of dots in the hostname, minus one to exclude the top-level domain
        features["SubdomainLevel"] += max(0, hostname.count(".") - 1)

        features["AtSymbol"] += url.count("@")
        features["HostnameLength"] += len(hostname)
        # Use the parsed scheme (urlparse lower-cases it) instead of a case-sensitive
        # string prefix, so "HTTPS://..." is recognised as HTTPS.
        features["NoHttps"] += int(parsed.scheme != "https")
        features["NumAmpersand"] += url.count("&")
        features["NumDash"] += url.count("-")
        features["NumDashInHostname"] += hostname.count("-")
        features["NumDots"] += url.count(".")
        features["NumHash"] += url.count("#")
        features["NumNumericChars"] += sum(c.isdigit() for c in url)
        features["NumPercent"] += url.count("%")
        features["NumUnderscore"] += url.count("_")
        features["PathLength"] += len(path)
        features["PathLevel"] += path.count("/")
        features["QueryLength"] += len(query)
        features["TildeSymbol"] += url.count("~")
        features["UrlLength"] += len(url)

    # Clamp selected numeric features to keep them within expected ranges
    features["HostnameLength"] = clamp(features["HostnameLength"], 4, 137)
    features["PathLength"] = clamp(features["PathLength"], 0, 161)
    features["QueryLength"] = clamp(features["QueryLength"], 0, 188)
    features["UrlLength"] = clamp(features["UrlLength"], 12, 253)
    features["SubdomainLevel"] = clamp(features["SubdomainLevel"], 0, 14)

    return features

# Calculate the ratio of external hyperlinks and resources in the email body
def calculate_external_link_ratios(body: str, sender: str):
    """Return the share of external links and external resources in *body*.

    A URL is external when its hostname is neither the sender's domain nor a
    subdomain of it.

    Parameters:
        body: E-mail body text (HTML or plain).
        sender: "From" header value; may be None/empty, a bare address or
            "Display Name <user@domain>".

    Returns:
        Tuple (pct_ext_links, pct_ext_resources), each in [0, 1]; 0 when the body
        has no matching URLs. Without a usable sender domain no URL is counted as
        external.
    """
    from email.utils import parseaddr

    # Parse the header so that "Name <user@domain>" yields "domain" and not "domain>".
    raw_sender = str(sender) if sender else ""
    address = parseaddr(raw_sender)[1] or raw_sender
    snd_domain = address.rsplit("@", 1)[-1].strip(" \t<>").lower() if "@" in address else ""

    urls = re.findall(r'https?://[^\s"<>()]+', body)

    # Resource-URLs (src= / href=)
    resource_urls = re.findall(r'(?:src|href)=["\'](https?://[^"\']+)["\']', body.lower())

    # A link is considered external if its domain differs from the sender's domain
    def is_external(url, ref_domain):
        if not ref_domain:
            # No reference domain available: nothing can be classified as external.
            return False
        hostname = (urlparse(url).hostname or "").lower()
        # Compare on label boundaries; a plain substring test would treat
        # "example.com.evil.net" as internal for the sender domain "example.com".
        is_internal = hostname == ref_domain or hostname.endswith("." + ref_domain)
        return not is_internal

    num_ext_links = sum(is_external(u, snd_domain) for u in urls)
    pct_ext_links = (num_ext_links / len(urls)) if urls else 0

    num_ext_resources = sum(is_external(u, snd_domain) for u in resource_urls)
    pct_ext_resources = (num_ext_resources / len(resource_urls)) if resource_urls else 0

    return pct_ext_links, pct_ext_resources

def domain_from_email(email_address):
    """Return the lower-cased domain part of an e-mail address.

    Accepts bare addresses ("user@domain") as well as header values with a
    display name ("Name <user@domain>"). Returns "" for None, empty input or
    input without an "@".
    """
    from email.utils import parseaddr

    if not email_address:
        return ""

    raw = str(email_address)
    # parseaddr strips display name and angle brackets; fall back to the raw
    # text if it cannot extract an address.
    address = parseaddr(raw)[1] or raw
    if "@" not in address:
        return ""
    return address.rsplit("@", 1)[-1].strip(" \t<>").lower()

# Get the main domain (second-level + top-level) from a full hostname
def get_main_domain(hostname):
    """Return the last two labels of *hostname* (e.g. "a.b.example.com" -> "example.com").

    A hostname with a single label is returned unchanged; None or "" yields "".
    NOTE: multi-part public suffixes (e.g. "co.uk") are not special-cased.
    """
    if not hostname:
        # urlparse(...).hostname is None for URLs without a network location
        return ""
    # Ignore the trailing root dot of fully-qualified names ("example.com.")
    labels = hostname.rstrip(".").split(".")
    return ".".join(labels[-2:])

# Extracts large set of phishing-related features
def extract_features_from_eml(subject, sender, recipient, body, filename=""):
    """Build the feature dictionary for a single e-mail.

    Parameters:
        subject: Subject header; accepted for interface compatibility, currently unused.
        sender: "From" header value (may be None or contain a display name).
        recipient: "To" header value (may be None or contain a display name).
        body: E-mail body as text (HTML or plain).
        filename: Name of the source file; "phish" / "legit" in the name sets the label.

    Returns:
        dict of feature name -> value, including "CLASS_LABEL" (1 = phishing,
        0 = legitimate, "unknown" if the filename gives no hint).
    """
    from email.utils import parseaddr

    features = {}

    # Lower-cased copy, computed once, for all case-insensitive substring checks
    body_lower = body.lower()

    urls = re.findall(r'https?://[^\s"<>()]+', body)

    known_brands = [
        "paypal", "apple", "amazon", "sparkasse", "dhl", "deutschebank",
        "commerzbank", "ebay", "netflix", "microsoft", "google", "instagram",
        "facebook", "linkedin", "postbank", "visa", "mastercard", "americanexpress",
        "fedex", "ups", "klarna", "telekom", "vodafone", "o2", "spotify", "airbnb"
    ]

    sensitive_words = [
        "account", "verify", "secure", "login", "update", "confirm",
        "password", "click", "access", "billing", "bank", "credit", "ssn",
        "social", "security", "alert", "unusual", "attempt", "fraud",
        "locked", "expired", "immediately", "urgent", "attention",
        "suspend", "important", "reset", "re-enter",

        "konto", "verifizieren", "sicher", "einloggen", "anmelden",
        "passwort", "aktualisieren", "bestätigen", "zahlung", "abrechnung",
        "kreditkarte", "sofort", "dringend", "wichtig", "gesperrt",
        "abgelaufen", "zugang", "identität", "prüfung", "eingabe",
        "sicherheitsüberprüfung", "reaktivieren", "informationen", "bank",
        "onlinebanking", "pin", "tan", "sicherheitscode", "freischalten"
    ]

    # Aggregate technical URL-based features across all URLs in the body
    url_features = extract_url_features(urls)
    features.update(url_features)

    # Host and path of the first URL; urlparse may report hostname None, so normalise to ""
    parsed = urlparse(urls[0]) if urls else None
    hostname = (parsed.hostname or "") if parsed else ""
    path = parsed.path if parsed else ""

    # Extract local part of the recipient (e.g., "john" from "John <john@example.com>")
    raw_recipient = str(recipient) if recipient else ""
    recipient_address = parseaddr(raw_recipient)[1] or raw_recipient
    recipient_local = recipient_address.split("@")[0] if "@" in recipient_address else ""

    # Count how many distinct sensitive keywords occur (case-insensitive, whole words).
    # The set removes "bank", which is listed in both the English and the German part.
    features["NumSensitiveWords"] = sum(
        bool(re.search(rf'\b{re.escape(word)}\b', body, flags=re.IGNORECASE))
        for word in set(sensitive_words)
    )

    # Compare sender domain with the domain of the first link, both reduced to their
    # main domain so that "mail.example.com" and "www.example.com" count as a match
    sender_domain = domain_from_email(sender)
    link_domain = get_main_domain(hostname)
    features["FrequentDomainNameMismatch"] = (
        int(get_main_domain(sender_domain) != link_domain)
        if sender_domain and link_domain else 0
    )

    # Calculate percentage of external links and resources
    pct_ext_links, pct_ext_resources = calculate_external_link_ratios(body, sender)
    features["PctExtHyperlinks"] = clamp(pct_ext_links, 0, 1)
    features["PctExtResourceUrls"] = clamp(pct_ext_resources, 0, 1)

    # Check if form actions post to external domains (not sender's)
    form_actions = re.findall(r'action=["\'](https?://[^"\']+)["\']', body_lower)
    action_hosts = [urlparse(url).hostname for url in form_actions]
    features["AbnormalFormAction"] = int(any(
        host and sender_domain not in host.lower()
        for host in action_hosts
    ))

    # An empty local part must not match: "" is a substring of every string
    has_local = bool(recipient_local)
    features["DomainInSubdomains"] = (
        int(has_local and recipient_local in ".".join(hostname.split(".")[:-2]))
        if hostname else 0
    )
    features["DomainInPaths"] = int(has_local and recipient_local in path)
    # NOTE(review): plain substring test, so short names such as "ups" or "o2" also
    # match inside unrelated words — confirm whether that is intended.
    features["EmbeddedBrandName"] = int(any(b in body_lower for b in known_brands))

    # Technical "tricks"
    features["ExtFavicon"] = int("favicon" in body_lower and "http" in body_lower)
    features["InsecureForms"] = int("<form" in body_lower and "http:" in body_lower)
    features["RelativeFormAction"] = int('action="/' in body_lower)
    features["ExtFormAction"] = int('action="http' in body_lower)
    features["PctNullSelfRedirectHyperlinks"] = clamp(int('href="#"' in body_lower) / len(urls) if urls else 0, 0, 1)

    # Client-side deception techniques
    features["FakeLinkInStatusBar"] = int("onmouseover" in body_lower and "status" in body_lower)
    features["RightClickDisabled"] = int("event.button==2" in body_lower)
    features["PopUpWindow"] = int("window.open" in body_lower)
    features["SubmitInfoToEmail"] = int("mailto:" in body_lower)

    # HTML structure anomalies
    features["IframeOrFrame"] = int("<iframe" in body_lower or "<frame" in body_lower)
    features["MissingTitle"] = int("<title" not in body_lower)
    features["ImagesOnlyInForm"] = int("<form" in body_lower and "<img" in body_lower)

    # Redundant (clamped or grouped) features – required by model
    features["SubdomainLevelRT"] = features["SubdomainLevel"]
    features["UrlLengthRT"] = features["UrlLength"]
    features["PctExtResourceUrlsRT"] = features["PctExtResourceUrls"]
    features["AbnormalExtFormActionR"] = features["AbnormalFormAction"]
    features["PctExtNullSelfRedirectHyperlinksRT"] = features["PctNullSelfRedirectHyperlinks"]
    features["ExtMetaScriptLinkRT"] = int("<script" in body_lower or "<meta" in body_lower or "<link" in body_lower)

    # Assign label from filename if known (phish/legit), else mark unknown
    filename_lower = filename.lower()
    if "phish" in filename_lower:
        features["CLASS_LABEL"] = 1
    elif "legit" in filename_lower:
        features["CLASS_LABEL"] = 0
    else:
        features["CLASS_LABEL"] = "unknown"

    return features

In [None]:
import os
from email import policy
from email.parser import BytesParser

def process_eml_folder(folder_path):
    """Parse every ``.eml`` file in *folder_path* and return its features.

    Parameters:
        folder_path: Directory that contains the .eml files (not searched recursively).

    Returns:
        pandas.DataFrame with one row per e-mail: the features produced by
        extract_features_from_eml plus a "FILENAME" column. Rows are ordered by
        file name; the frame is empty if the folder holds no .eml files.
    """
    all_features = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order (and everything derived from it) reproducible between runs.
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith(".eml"):
            continue

        file_path = os.path.join(folder_path, file)

        with open(file_path, 'rb') as f:
            msg = BytesParser(policy=policy.default).parse(f)

        subject = msg['subject']
        sender = msg['from']
        recipient = msg['to']

        body = ""

        if msg.is_multipart():
            # Prefer the first HTML part; otherwise keep the last plain-text part seen
            for part in msg.walk():
                content_type = part.get_content_type()
                if content_type == "text/html":
                    body = part.get_content()
                    break
                if content_type == "text/plain":
                    body = part.get_content()
        else:
            body = msg.get_content()

        # Non-text payloads are returned as bytes; the text-based feature
        # extraction cannot use them, so treat such mails as having no body.
        if not isinstance(body, str):
            body = ""

        features = extract_features_from_eml(subject, sender, recipient, body, filename=file)
        features["FILENAME"] = file

        all_features.append(features)

    return pd.DataFrame(all_features)

# === Full output and information of eml-Data ===

In [None]:
# Extract the features of all .eml files in the training folder and display them
folder_path = "../data/mails/real_mails_train"
df_eml = process_eml_folder(folder_path)

df_eml

In [None]:
# Check the columns if everything is still correct
# NOTE(review): this prints the columns of `df` (the CSV-based dataframe), not of
# `df_eml` created in the previous cell — confirm which one is meant.

print(df.columns)
print(f"\nCount of features: {len(df.columns)}")

## == Full Feature Set Used – Performance decreased drastically ==

In [None]:
## Use all available features except the label column
#model_full_features = df.columns.drop("labels")
#X_train_full = df[model_full_features]
#y_train_full = df["labels"]

## Train Random Forest on full feature set
#model_full = RandomForestClassifier(n_estimators=100, random_state=77)
#model_full.fit(X_train_full, y_train_full)

#print("Full-feature model trained on", len(model_full_features), "features")

# === Test .eml-Data on Model (top features) ===

In [None]:
# Select the same top features the reduced model was trained on
X_eml_top_features = df_eml[top_features]

# Predict the class (phishing / legitimate) with the reduced random forest
y_pred_eml_top_features = model_reduced.predict(X_eml_top_features)

df_eml["Predicted"] = y_pred_eml_top_features

# NOTE(review): CLASS_LABEL is the string "unknown" for files whose name contains
# neither "phish" nor "legit"; presumably no such files exist in this folder — verify.
print(classification_report(df_eml["CLASS_LABEL"], df_eml["Predicted"]))
print(confusion_matrix(df_eml["CLASS_LABEL"], df_eml["Predicted"]))

# Legitimate mails predicted as phishing / phishing mails predicted as legitimate
false_positives = df_eml[(df_eml["CLASS_LABEL"] == 0) & (df_eml["Predicted"] == 1)]
false_negatives = df_eml[(df_eml["CLASS_LABEL"] == 1) & (df_eml["Predicted"] == 0)]

display(false_positives[["FILENAME", "Predicted"]])
display(false_negatives[["FILENAME", "Predicted"]])

## == Analysis of Unexpected Model Behavior ==

In [None]:
# Summary statistics of the extracted e-mail features
df_eml.describe()

## == Comparison of false-positives with the distribution of legitimate training data ==

In [None]:
# Select only legitimate samples from training data (used for reference distribution)
# NOTE: X_train / y_train are whatever the most recently executed split cell assigned.
X_legit = X_train[y_train == 0][top_features]

# Mean and standard deviation for each feature (based on the legitimate training samples)
mean_legit = X_legit.mean()
std_legit = X_legit.std()

# Extract false positives
false_positives = df_eml[(df_eml["CLASS_LABEL"] == 0) & (df_eml["Predicted"] == 1)]

# For every false positive, print each top feature with its z-score relative to the
# legitimate training distribution
for _, row in false_positives.iterrows():
    print(f"False-Positive: {row['FILENAME']}")
    print("## Feature (value) - Z-score vs legit mean ##\n")

    for feat in top_features:
        value = row[feat]
        mean = mean_legit[feat]
        std = std_legit[feat]
        # A feature without spread in the reference data gets z = 0 (avoids division by zero)
        z = (value - mean) / std if std > 0 else 0
        print(f"{feat}: {value:.2f} (z = {z:.2f})")

    print("-" * 50)

# === Integrating .eml-Data into the training set ===

In [None]:
# Features of the real training e-mails
df_real_mails = process_eml_folder("../data/mails/real_mails_train")

# Load original CSV-based dataset and drop unused columns
df_csv_base = pd.read_csv("../data/Phishing_Legitimate_full.csv")
df_csv_base = df_csv_base.drop(columns=["id", "HttpsInHostname"])

# Ensure matching columns (same set and order as the CSV; this also drops FILENAME)
df_real_mails = df_real_mails[df_csv_base.columns]
df_combined = pd.concat([df_csv_base, df_real_mails], ignore_index=True)

# Final training dataset
# NOTE: this replaces the global `df`; the label column here is still named
# "CLASS_LABEL", not "labels" as in the earlier cells.
df = df_combined

df

In [None]:
# Transposed summary statistics: one row per column of the original CSV data
desc = df_csv_base.describe().T
display(desc)

# More details of the original data - count of unique values and range of values
for col in df_csv_base.columns:
    print(f"{col}: unique={df_csv_base[col].nunique()}, min={df_csv_base[col].min()}, max={df_csv_base[col].max()}")

# === Training on combined dataset ===

# === CSV + full EML-Train-Data (20 legit, 20 phish) Training ===

In [None]:
# Reload the CSV dataset and drop the columns that are not used as features
df_csv = pd.read_csv("../data/Phishing_Legitimate_full.csv")
df_csv = df_csv.drop(columns=["id", "HttpsInHostname"])

df_real_train = process_eml_folder("../data/mails/real_mails_train")

# Ensure the same features (and column order) as the CSV data
df_real_train = df_real_train[df_csv.columns]

df_train_full_combined  = pd.concat([df_csv, df_real_train], ignore_index=True)

# Full Feature Training
X_train_full_combined = df_train_full_combined.drop(columns=["CLASS_LABEL"])
y_train_full_combined = df_train_full_combined["CLASS_LABEL"]

model_full_combined_csv_eml = RandomForestClassifier(n_estimators=100, random_state=77)
model_full_combined_csv_eml.fit(X_train_full_combined, y_train_full_combined)

# Reduced Feature Training
# Use the same top_features as in earlier analysis (from feature_importance[:15].index)
X_train_top_combined = df_train_full_combined[top_features]

model_top_combined_csv_eml = RandomForestClassifier(n_estimators=100, random_state=77)
model_top_combined_csv_eml.fit(X_train_top_combined, y_train_full_combined)

In [None]:
# Evaluate the random forests trained on CSV + EML data on the real EML test mails
df_test_real = process_eml_folder("../data/mails/real_mails_test")

# Full feature input (same columns and order as used for training)
X_test_full = df_test_real[X_train_full_combined.columns]

# Reduced feature input
X_test_top = df_test_real[top_features]
y_test = df_test_real["CLASS_LABEL"]

# Predictions
y_pred_full = model_full_combined_csv_eml.predict(X_test_full)
y_pred_top = model_top_combined_csv_eml.predict(X_test_top)

# Store the predictions next to the features so misclassified files can be listed
df_test_real["Predicted_Full"] = y_pred_full
df_test_real["Predicted_Top"] = y_pred_top

# Evaluation: Full model
print("Evaluation on Real EML Test Data (Full Feature Model)")
print(classification_report(y_test, y_pred_full))
print(confusion_matrix(y_test, y_pred_full))

# False positive = legitimate mail (0) predicted as phishing (1); false negative = the reverse
false_pos_full = df_test_real[(df_test_real["CLASS_LABEL"] == 0) & (df_test_real["Predicted_Full"] == 1)]
false_neg_full = df_test_real[(df_test_real["CLASS_LABEL"] == 1) & (df_test_real["Predicted_Full"] == 0)]

print("\nFalse Positives (Full Feature Model)")
display(false_pos_full[["FILENAME", "Predicted_Full"]])

print("\nFalse Negatives (Full Feature Model)")
display(false_neg_full[["FILENAME", "Predicted_Full"]])

# Evaluation: Top feature model
print("\nEvaluation on Real EML Test Data (Top Feature Model)")
print(classification_report(y_test, y_pred_top))
print(confusion_matrix(y_test, y_pred_top))

false_pos_top = df_test_real[(df_test_real["CLASS_LABEL"] == 0) & (df_test_real["Predicted_Top"] == 1)]
false_neg_top = df_test_real[(df_test_real["CLASS_LABEL"] == 1) & (df_test_real["Predicted_Top"] == 0)]

print("\nFalse Positives (Top Feature Model)")
display(false_pos_top[["FILENAME", "Predicted_Top"]])

print("\nFalse Negatives (Top Feature Model)")
display(false_neg_top[["FILENAME", "Predicted_Top"]])

# === CSV + full EML-Train-Data (20 legit, 20 phish) XGB Training ===

In [None]:
# Rebuild the combined training set (CSV rows + real training e-mails)
df_csv = pd.read_csv("../data/Phishing_Legitimate_full.csv")
df_csv = df_csv.drop(columns=["id", "HttpsInHostname"])

df_real_train = process_eml_folder("../data/mails/real_mails_train")
# Same columns and order as the CSV data
df_real_train = df_real_train[df_csv.columns]

df_train_combined = pd.concat([df_csv, df_real_train], ignore_index=True)

# XGBoost - Full Feature Model
X_train_full_combined = df_train_combined.drop(columns=["CLASS_LABEL"])
y_train_full_combined = df_train_combined["CLASS_LABEL"]

model_xgb_combined_csv_eml = XGBClassifier(
    n_estimators=100,
    eval_metric="logloss",
    random_state=77
)
model_xgb_combined_csv_eml.fit(X_train_full_combined, y_train_full_combined)

# XGBoost - Top Feature Model
X_train_top_combined = df_train_combined[top_features]

model_xgb_top_combined_csv_eml = XGBClassifier(
    n_estimators=100,
    eval_metric="logloss",
    random_state=77
)
model_xgb_top_combined_csv_eml.fit(X_train_top_combined, y_train_full_combined)

In [None]:
# Evaluate the XGBoost models trained on CSV + EML data on the real EML test mails
df_test_real = process_eml_folder("../data/mails/real_mails_test")

# Inputs with the training columns (full) and the top-feature subset
X_test_full = df_test_real[X_train_full_combined.columns]
X_test_top = df_test_real[top_features]
y_test = df_test_real["CLASS_LABEL"]

y_pred_full = model_xgb_combined_csv_eml.predict(X_test_full)
y_pred_top = model_xgb_top_combined_csv_eml.predict(X_test_top)

# Store the predictions next to the features so misclassified files can be listed
df_test_real["Predicted_Full_XGB"] = y_pred_full
df_test_real["Predicted_Top_XGB"] = y_pred_top

# Evaluation: Full Feature Model
print("Evaluation (XGBoost Full Feature Model)")
print(classification_report(y_test, y_pred_full))
print(confusion_matrix(y_test, y_pred_full))

# Evaluation: Top Feature Model
print("\nEvaluation (XGBoost Top Feature Model)")
print(classification_report(y_test, y_pred_top))
print(confusion_matrix(y_test, y_pred_top))

# False Positives / Negatives - Full Feature
fp_full = df_test_real[(df_test_real["CLASS_LABEL"] == 0) & (df_test_real["Predicted_Full_XGB"] == 1)]
fn_full = df_test_real[(df_test_real["CLASS_LABEL"] == 1) & (df_test_real["Predicted_Full_XGB"] == 0)]

print("\nFalse Positives (Full Feature)")
display(fp_full[["FILENAME", "Predicted_Full_XGB"]])

print("\nFalse Negatives (Full Feature)")
display(fn_full[["FILENAME", "Predicted_Full_XGB"]])

# False Positives / Negatives - Top Feature
fp_top = df_test_real[(df_test_real["CLASS_LABEL"] == 0) & (df_test_real["Predicted_Top_XGB"] == 1)]
fn_top = df_test_real[(df_test_real["CLASS_LABEL"] == 1) & (df_test_real["Predicted_Top_XGB"] == 0)]

print("\nFalse Positives (Top Feature)")
display(fp_top[["FILENAME", "Predicted_Top_XGB"]])

print("\nFalse Negatives (Top Feature)")
display(fn_top[["FILENAME", "Predicted_Top_XGB"]])

# === Train only on EML training data ===

In [None]:
# Training data: only the features extracted from the real training e-mails
df_train_real = process_eml_folder("../data/mails/real_mails_train")

# Full Feature Model (every extracted column except the label and the file name;
# unlike the CSV-based models this includes "HttpsInHostname")
X_train_real_full = df_train_real.drop(columns=["CLASS_LABEL", "FILENAME"])
y_train_real = df_train_real["CLASS_LABEL"]

model_real_full = RandomForestClassifier(n_estimators=100, random_state=77)
model_real_full.fit(X_train_real_full, y_train_real)

# Top Feature Model (top_features still comes from the CSV-based importance ranking)
X_train_real_top = df_train_real[top_features]

model_real_top = RandomForestClassifier(n_estimators=100, random_state=77)
model_real_top.fit(X_train_real_top, y_train_real)

In [None]:
# Evaluate the EML-only random forests on the real EML test mails
df_test_real = process_eml_folder("../data/mails/real_mails_test")

# Inputs with the training columns (full) and the top-feature subset
X_test_real_full = df_test_real[X_train_real_full.columns]
X_test_real_top = df_test_real[top_features]
y_test_real = df_test_real["CLASS_LABEL"]

# Predictions
y_pred_real_full = model_real_full.predict(X_test_real_full)
y_pred_real_top = model_real_top.predict(X_test_real_top)

# Store the predictions next to the features so misclassified files can be listed
df_test_real["Predicted_Full"] = y_pred_real_full
df_test_real["Predicted_Top"] = y_pred_real_top

# Evaluation Full Feature Model
print("Evaluation (Full Feature Model trained on real_mails_train only)")
print(classification_report(y_test_real, y_pred_real_full))
print(confusion_matrix(y_test_real, y_pred_real_full))

# Evaluation Top Feature Model
print("\nEvaluation (Top Feature Model trained on real_mails_train only)")
print(classification_report(y_test_real, y_pred_real_top))
print(confusion_matrix(y_test_real, y_pred_real_top))

# False Positives / Negatives: Full Model
fp_real_full = df_test_real[(df_test_real["CLASS_LABEL"] == 0) & (df_test_real["Predicted_Full"] == 1)]
fn_real_full = df_test_real[(df_test_real["CLASS_LABEL"] == 1) & (df_test_real["Predicted_Full"] == 0)]

print("\nFalse Positives (Full Model)")
display(fp_real_full[["FILENAME", "Predicted_Full"]])

print("\nFalse Negatives (Full Model)")
display(fn_real_full[["FILENAME", "Predicted_Full"]])

# False Positives / Negatives: Top Feature Model
fp_real_top = df_test_real[(df_test_real["CLASS_LABEL"] == 0) & (df_test_real["Predicted_Top"] == 1)]
fn_real_top = df_test_real[(df_test_real["CLASS_LABEL"] == 1) & (df_test_real["Predicted_Top"] == 0)]

print("\nFalse Positives (Top Feature Model)")
display(fp_real_top[["FILENAME", "Predicted_Top"]])

print("\nFalse Negatives (Top Feature Model)")
display(fn_real_top[["FILENAME", "Predicted_Top"]])

# === Train only on EML training data XGB ===

In [None]:
# Training data: only the features extracted from the real training e-mails
df_train_real = process_eml_folder("../data/mails/real_mails_train")

# Full features (every extracted column except the label and the file name)
X_train_real_full = df_train_real.drop(columns=["CLASS_LABEL", "FILENAME"])
y_train_real = df_train_real["CLASS_LABEL"]

model_real_full_xgb = XGBClassifier(
    n_estimators=100,
    eval_metric="logloss",
    random_state=77
)
model_real_full_xgb.fit(X_train_real_full, y_train_real)

# Top features (top_features still comes from the CSV-based importance ranking)
X_train_real_top = df_train_real[top_features]

model_real_top_xgb = XGBClassifier(
    n_estimators=100,
    eval_metric="logloss",
    random_state=77
)
model_real_top_xgb.fit(X_train_real_top, y_train_real)

In [None]:
# Evaluate the EML-only XGBoost models on the real EML test mails
df_test_real = process_eml_folder("../data/mails/real_mails_test")
y_test_real = df_test_real["CLASS_LABEL"]

# Inputs with the training columns (full) and the top-feature subset
X_test_real_full = df_test_real[X_train_real_full.columns]
X_test_real_top = df_test_real[top_features]

y_pred_real_full = model_real_full_xgb.predict(X_test_real_full)
y_pred_real_top = model_real_top_xgb.predict(X_test_real_top)

# Store the predictions next to the features so misclassified files can be listed
df_test_real["Predicted_Full_XGB"] = y_pred_real_full
df_test_real["Predicted_Top_XGB"] = y_pred_real_top

# Evaluation Full Feature Model
print("Evaluation (XGBoost Full Feature Model)")
print(classification_report(y_test_real, y_pred_real_full))
print(confusion_matrix(y_test_real, y_pred_real_full))

# Evaluation Top Feature Model
print("\nEvaluation (XGBoost Top Feature Model)")
print(classification_report(y_test_real, y_pred_real_top))
print(confusion_matrix(y_test_real, y_pred_real_top))

# False Positives / Negatives: Full Feature
fp_full = df_test_real[(y_test_real == 0) & (df_test_real["Predicted_Full_XGB"] == 1)]
fn_full = df_test_real[(y_test_real == 1) & (df_test_real["Predicted_Full_XGB"] == 0)]

print("\nFalse Positives (Full Model)")
display(fp_full[["FILENAME", "Predicted_Full_XGB"]])

print("\nFalse Negatives (Full Model)")
display(fn_full[["FILENAME", "Predicted_Full_XGB"]])

# False Positives / Negatives: Top Feature
fp_top = df_test_real[(y_test_real == 0) & (df_test_real["Predicted_Top_XGB"] == 1)]
fn_top = df_test_real[(y_test_real == 1) & (df_test_real["Predicted_Top_XGB"] == 0)]

print("\nFalse Positives (Top Model)")
display(fp_top[["FILENAME", "Predicted_Top_XGB"]])

print("\nFalse Negatives (Top Model)")
display(fn_top[["FILENAME", "Predicted_Top_XGB"]])