In [1]:
# Cell 1: Introduction
"""
# PhishPatrol Demonstration
This notebook demonstrates the following:
1. Training a phishing detection model using a sample dataset.
2. Testing the model with sample phishing and legitimate emails.
3. Instructions for running the Flask server to enable frontend functionality.

**Authors:** Varad Rasalkar
"""


'\n# PhishPatrol Demonstration\nThis notebook demonstrates the following:\n1. Training a phishing detection model using a sample dataset.\n2. Testing the model with sample phishing and legitimate emails.\n3. Instructions for running the Flask server to enable frontend functionality.\n\n**Authors:** Varad Rasalkar, Megan Kerni, Khushi Raju, Fiza Khwaja\n'

In [2]:
# Cell 2: Setup
# Install required libraries (uncomment the lines below if needed)
# !pip install pandas scikit-learn flask
# !pip3 install scikit-learn

# Import required libraries
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import re
import string




In [4]:
# Cell 3: Preprocessing Function
def preprocess_email(email_text):
    email_text = email_text.lower()
    email_text = re.sub(r'http\S+|www\.\S+', '', email_text)
    email_text = email_text.translate(str.maketrans('', '', string.punctuation))
    email_text = re.sub(r'\d+', '', email_text)
    return email_text

In [5]:
# Cell 4: Load Dataset
# Specify the dataset path
dataset_path = 'emails_dataset.csv'  # Ensure this file is in the same directory

# Load the dataset
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset not found at {dataset_path}")

data = pd.read_csv(dataset_path)
print(f"Loaded dataset with {len(data)} emails.")

# Preprocess the email content
data['email'] = data['email'].apply(preprocess_email)

# Display a sample of the dataset
data.head()


Loaded dataset with 33715 emails.


Unnamed: 0,email,label
0,subject ena sales on hpl\njust to update you o...,0
1,subject for ua issues \nthe above re...,0
2,subject hpl nominations for december \n see ...,0
3,subject revised nom kcs resources\ndaren \nit...,0
4,subject new production sitara deals needed\nd...,0


In [6]:
# Cell 5: Train the Model
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    data['email'], data['label'], test_size=0.2, random_state=42
)

# Extract features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_tfidf, y_train)

print("Model training completed.")

Model training completed.


In [10]:
# Cell 6: Test the Model
# Define sample emails
phishing_email = """
Congrats! You've won a free lottery ticket. Click here to claim your prize.
"""
legitimate_email = """
Hi team, let's schedule a meeting to discuss the project timeline.
"""

# Preprocess and transform the sample emails
phishing_tfidf = vectorizer.transform([preprocess_email(phishing_email)])
legitimate_tfidf = vectorizer.transform([preprocess_email(legitimate_email)])

# Predict probabilities
phishing_pred = model.predict_proba(phishing_tfidf)[0]
legitimate_pred = model.predict_proba(legitimate_tfidf)[0]

# Display predictions
print("Phishing Email Prediction:")
print(f"Phishing Likelihood: {phishing_pred[1] * 100:.2f}%")
print(f"Legitimate Likelihood: {phishing_pred[0] * 100:.2f}%\n")

print("Legitimate Email Prediction:")
print(f"Phishing Likelihood: {legitimate_pred[1] * 100:.2f}%")
print(f"Legitimate Likelihood: {legitimate_pred[0] * 100:.2f}%")


Phishing Email Prediction:
Phishing Likelihood: 86.00%
Legitimate Likelihood: 14.00%

Legitimate Email Prediction:
Phishing Likelihood: 3.00%
Legitimate Likelihood: 97.00%


In [11]:
# Cell 7: Running the Flask Server
"""
## Running the Flask Server
To run the frontend, follow these steps:
1. Open a terminal in the project directory.
2. Start the Flask server by running the following command: python3 app.py
3. Open your browser and navigate to `http://127.0.0.1:5000/` to access the frontend.
"""
## OR ---

# Uncomment line below if needed
#!pip install flask
import subprocess

# Start Flask server in a non-blocking way
print("Starting the Flask server. Access the frontend at http://127.0.0.1:5000/")
process = subprocess.Popen(["python3", "app.py"])

# Optionally, stop the Flask server if needed
# process.terminate()


Starting the Flask server. Access the frontend at http://127.0.0.1:5000/


Model training completed.
 * Serving Flask app 'app'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
[33mPress CTRL+C to quit[0m
 * Restarting with stat


Model training completed.


 * Debugger is active!
 * Debugger PIN: 100-450-595
127.0.0.1 - - [08/Dec/2024 17:02:40] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [08/Dec/2024 17:02:40] "[36mGET /static/styles.css HTTP/1.1[0m" 304 -
127.0.0.1 - - [08/Dec/2024 17:02:40] "[36mGET /static/script.js HTTP/1.1[0m" 304 -
127.0.0.1 - - [08/Dec/2024 17:02:40] "[36mGET /static/images/logo.svg HTTP/1.1[0m" 304 -
127.0.0.1 - - [08/Dec/2024 17:02:46] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Dec/2024 17:02:56] "POST /predict HTTP/1.1" 200 -
