<a href="https://colab.research.google.com/github/yeho/pishing-detection-AI/blob/master/Email_Phishing_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
ethancratchley_email_phishing_dataset_path = kagglehub.dataset_download('ethancratchley/email-phishing-dataset')

print('Data source import complete.')


Downloading from https://www.kaggle.com/api/v1/datasets/download/ethancratchley/email-phishing-dataset?dataset_version_number=1...


100%|██████████| 3.18M/3.18M [00:00<00:00, 103MB/s]

Extracting files...
Data source import complete.





In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data Loading and inference

In [4]:
data = pd.read_csv("/kaggle/input/email-phishing-dataset/email_phishing_data.csv")
df = data.copy()
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/email-phishing-dataset/email_phishing_data.csv'

In [None]:
df.info()

In [None]:
df.describe()

# Basic Data Analysis

In [None]:
for i in df.columns:
    print(f"{i}: Max: {df[i].max()}, Min: {df[i].min()}")

In [None]:
for i in df.columns:
    print(f"{i}: {df[i].nunique()}")

# Exploratory Data Analysis

In [None]:
print(df.isnull().sum())

In [None]:
sns.countplot(x="label", data=df, palette="Set2")
plt.title("Distribution of Email Labels")
plt.show()

In [None]:
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Matrix")
plt.show()

1. Words and spelling errors are positively correlated. (More words -> more spelling errors)
2. Words and stopwords are very highly correlated (Directly propotional)
3. More unique words leads to more spelling errors
4. More unique words leads to more number of urgent keywords

# Feature Importance

In [None]:
import sklearn
from sklearn.ensemble import RandomForestClassifier

X = df.drop("label", axis=1)
y = df["label"]

# model = RandomForestClassifier()
# model.fit(X, y)


# feat_importances = pd.Series(model.feature_importances_, index=X.columns)
# feat_importances.nlargest(10).plot(kind="barh")
# plt.title("Feature Importance")
# plt.show()

# Model Training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
new_cols = ["num_words", "num_unique_words", "num_stopwords", "num_links"]
X = df[new_cols]

In [None]:
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
scaler = MinMaxScaler()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
scale_pos_weight = (len(y_train) - sum(y_train)) / sum(y_train)

model = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    use_label_encoder=False,
    eval_metric="logloss"
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

print("Classification Report:\n", classification_report(y_test, y_pred))

plt.figure(figsize=(6, 4))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d")
plt.xlabel("Predicted Label")
plt.ylabel("Actual Label")
plt.title("Confusion Matrix")
plt.show()

In [None]:
import numpy as np
from sklearn.metrics import classification_report

y_probs = model.predict_proba(X_test)[:, 1]
optimal_thresh = 0.75

y_pred_thresh = (y_probs >= optimal_thresh).astype(int)
print(classification_report(y_test, y_pred_thresh))

✅ Achieved 96% accuracy and improved phishing detection F1-score to 0.23, reducing false positives by 2.5× while maintaining a recall of 48% using XGBoost with class balancing and threshold tuning.

In [None]:
# prompt: como puedo integrar un pishing a la prueba para que el modelo lo evalue y me diga si es pishing o no, toma el contexto del codigo de arriba (nombres de variables que ya existen etc), quiero que pueda admitir el codigo html del correo o solo el texto cualquiera de los dos

import pandas as pd
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize # Import sent_tokenize explicitly

# Ensure necessary NLTK data is downloaded
try:
    # Attempt to access a resource to check if punkt is downloaded
    sent_tokenize("This is a test sentence.")
    stopwords = stopwords.words('english')
except LookupError:
    print("NLTK data not found. Downloading necessary resources...")
    nltk.download('stopwords')
    nltk.download('punkt')
    nltk.download('punkt_tab') # Explicitly download punkt_tab
    stopwords = stopwords.words('english')
    print("NLTK data download complete.")


def extract_text_from_html(html_content):
    """Extracts plain text from HTML content."""
    if not html_content:
        return ""
    soup = BeautifulSoup(html_content, 'html.parser')
    return soup.get_text(separator=' ', strip=True)

def count_words(text):
    """Counts the total number of words in a text."""
    if not text:
        return 0
    words = word_tokenize(text)
    return len(words)

def count_unique_words(text):
    """Counts the number of unique words in a text."""
    if not text:
        return 0
    words = word_tokenize(text.lower())
    return len(set(words))

def count_stopwords(text):
    """Counts the number of stopwords in a text."""
    if not text:
        return 0
    words = word_tokenize(text.lower())
    return len([word for word in words if word in stopwords])

def count_links(text):
    """Counts the number of links (URLs) in a text (simplified)."""
    if not text:
        return 0
    # A simple regex to find URLs
    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    return len(urls)


def predict_phishing(email_content, model, scaler):
    """
    Predicts if an email is phishing using the trained model.

    Args:
        email_content (str): The content of the email (can be plain text or HTML).
        model: The trained XGBoost model.
        scaler: The fitted MinMaxScaler.

    Returns:
        str: 'Phishing' or 'Not Phishing'.
        float: The probability of being phishing.
    """
    # Try to extract text if it seems like HTML
    if bool(BeautifulSoup(email_content, "html.parser").find()):
         text_content = extract_text_from_html(email_content)
    else:
         text_content = email_content

    # Extract features
    num_words = count_words(text_content)
    num_unique_words = count_unique_words(text_content)
    num_stopwords = count_stopwords(text_content)
    num_links = count_links(text_content)

    # Create a DataFrame for the input features
    input_data = pd.DataFrame([[num_words, num_unique_words, num_stopwords, num_links]],
                               columns=["num_words", "num_unique_words", "num_stopwords", "num_links"])

    # Scale the input data using the same scaler trained on the training data
    scaled_input_data = scaler.transform(input_data)

    # Predict the probability
    phishing_prob = model.predict_proba(scaled_input_data)[:, 1][0]

    # Make a prediction based on the optimal threshold (or 0.5 if threshold tuning wasn't used)
    # We'll use the optimal_thresh identified in the original notebook
    prediction = 'Phishing' if phishing_prob >= optimal_thresh else 'Not Phishing'

    return prediction, phishing_prob

# Example Usage:

# Define some example email content (replace with actual email content)
# Example of a potentially phishing email (simplified)
phishing_email_html = """
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"><table dir="ltr" style="height: 323px;">
<tbody>
<tr style="height: 22px;">
<td id="i1" style="padding: 0px; font-family: 'Segoe UI Semibold', 'Segoe UI Bold', 'Segoe UI', 'Helvetica Neue Medium', Arial, sans-serif; font-size: 17px; color: #707070; height: 22px; width: 696px;">Microsoft account</td>
</tr>
<tr style="height: 53px;">
<td id="i2" style="padding: 0px; font-family: 'Segoe UI Light', 'Segoe UI', 'Helvetica Neue Medium', Arial, sans-serif; font-size: 41px; color: #2672ec; height: 53px; width: 696px;">Unusual sign.in activity</td>
</tr>
<tr style="height: 18px;">
<td id="i3" style="padding: 25px 0px 0px; font-size: 14px; font-family: 'Segoe UI', Tahoma, Verdana, Arial, sans-serif; color: #2a2a2a; height: 18px; width: 696px;">We detected something unusual about a recent sign-in to the Microsoft account <a id="iAccount" class="link" dir="ltr" style="color: #2672ec; text-decoration: none;" href="mailto:sotrecognizd@gmail.com?&amp;cc=sotrecognizd@gmail.com&amp;Subject=Report+The+User">phishing@pot</a>.</td>
</tr>
<tr style="height: 18px;">
<td id="i4" style="padding: 25px 0px 0px; font-family: 'Segoe UI Bold', 'Segoe UI Semibold', 'Segoe UI', 'Helvetica Neue Medium', Arial, sans-serif; font-size: 14px; font-weight: bold; color: #2a2a2a; height: 18px; width: 696px;"><strong>Sign-in details</strong></td>
</tr>
<tr style="height: 18px;">
<td id="i5" style="padding: 6px 0px 0px; font-family: 'Segoe UI', Tahoma, Verdana, Arial, sans-serif; font-size: 14px; color: #2a2a2a; height: 18px; width: 696px;">Country/region: <strong>Russia/Moscow</strong></td>
</tr>
<tr style="height: 18px;">
<td id="i6" style="padding: 6px 0px 0px; font-family: 'Segoe UI', Tahoma, Verdana, Arial, sans-serif; font-size: 14px; color: #2a2a2a; height: 18px; width: 696px;">IP address: <strong>103.225.77.255</strong></td>
</tr>
<tr style="height: 18px;">
<td id="i7" style="padding: 6px 0px 0px; font-family: 'Segoe UI', Tahoma, Verdana, Arial, sans-serif; font-size: 14px; color: #2a2a2a; height: 18px; width: 696px;">Date: <strong>Fri, 08 Sep 2023 05:46:57 +0000</strong></td>
</tr>
<tr style="height: 18px;">
<td id="i8" style="padding: 6px 0px 0px; font-family: 'Segoe UI', Tahoma, Verdana, Arial, sans-serif; font-size: 14px; color: #2a2a2a; height: 18px; width: 696px;">Platform: <strong>Windows 10</strong></td>
</tr>
<tr style="height: 18px;">
<td id="i9" style="padding: 6px 0px 0px; font-family: 'Segoe UI', Tahoma, Verdana, Arial, sans-serif; font-size: 14px; color: #2a2a2a; height: 18px; width: 696px;">Browser: <strong>Firefox</strong></td>
</tr>
<tr style="height: 36px;">
<td id="i10" style="padding: 25px 0px 0px; font-family: 'Segoe UI', Tahoma, Verdana, Arial, sans-serif; font-size: 14px; color: #2a2a2a; height: 36px; width: 696px;">A user from <strong>Russia/Moscow</strong> just logged into your account from a new device, If this wasn't you, please report the user. If this was you, we'll trust similar activity in the future.</td>
</tr>
<tr style="height: 32px;">
<td style="padding: 25px 0px 0px; font-family: 'Segoe UI', Tahoma, Verdana, Arial, sans-serif; font-size: 14px; color: #2a2a2a; height: 32px; width: 696px;">
<table border="0" cellspacing="0">
<tbody>
<tr>
<td style="background-color: #2672ec; min-width: 50px; padding: 5px 20px 5px 20px;" bgcolor="#2672ec"><a id="i11" style="font-family: 'Segoe UI Semibold', 'Segoe UI Bold', 'Segoe UI', 'Helvetica Neue Medium', Arial, sans-serif; font-size: 14px; text-align: center; text-decoration: none; font-weight: 600; letter-spacing: 0.02em; color: #fff;" href="mailto:sotrecognizd@gmail.com?&amp;cc=sotrecognizd@gmail.com&amp;subject=unusual signin activity&amp;body=Report The User">Report The User</a></td>
</tr>
</tbody>
</table>
</td>
</tr>
<tr style="height: 18px;">
<td id="i12" style="padding: 25px 0px 0px; font-family: 'Segoe UI', Tahoma, Verdana, Arial, sans-serif; font-size: 14px; color: #2a2a2a; height: 18px; width: 696px;">To opt out or change where you receive security notifications, <a id="iLink5" class="link" style="color: #2672ec; text-decoration: none;" href="mailto:sotrecognizd@gmail.com?&amp;cc=sotrecognizd@gmail.com&amp;Subject=Unsubscribe+me">click here</a>.</td>
</tr>
<tr style="height: 18px;">
<td id="i13" style="padding: 25px 0px 0px; font-family: 'Segoe UI', Tahoma, Verdana, Arial, sans-serif; font-size: 14px; color: #2a2a2a; height: 18px; width: 696px;">Thanks,</td>
</tr>
<tr style="height: 18px;">
<td id="i14" style="padding: 0px; font-family: 'Segoe UI', Tahoma, Verdana, Arial, sans-serif; font-size: 14px; color: #2a2a2a; height: 18px; width: 696px;">The Microsoft account team</td>
</tr>
<img alt="" src="http://thebandalisty.com/track/o43062rdzGz18708448Gdrw1821750fYo33632dSjh176" width="1px" height="1px" style="visibility:hidden">
</tbody>
</table>

"""

# Example of a non-phishing email (simplified)
non_phishing_email_text = """
Microsoft account
Unusual sign.in activity
We detected something unusual about a recent sign-in to the Microsoft account phishing@pot.
Sign-in details
Country/region: Russia/Moscow
IP address: 103.225.77.255
Date: Fri, 08 Sep 2023 05:46:57 +0000
Platform: Windows 10
Browser: Firefox
A user from Russia/Moscow just logged into your account from a new device, If this wasn't you, please report the user. If this was you, we'll trust similar activity in the future.
Report The User
To opt out or change where you receive security notifications, click here.
Thanks,
The Microsoft account team
"""

# Predict on the example emails
phishing_prediction, phishing_prob = predict_phishing(phishing_email_html, model, scaler)
print(f"Phishing Email Prediction: {phishing_prediction}, Probability: {phishing_prob:.4f}")

non_phishing_prediction, non_phishing_prob = predict_phishing(non_phishing_email_text, model, scaler)
print(f"Non-Phishing Email Prediction: {non_phishing_prediction}, Probability: {non_phishing_prob:.4f}")

# You can now use the predict_phishing function with any email content (HTML or text)
# For instance, if you had email content in a variable called `email_to_evaluate`:
# email_to_evaluate = "..." # Your email content here
# prediction, probability = predict_phishing(email_to_evaluate, model, scaler)
# print(f"The email is predicted as: {prediction} with probability: {probability:.4f}")
