In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Load the dataset
df = pd.read_csv('CEAS_08.csv')

In [5]:
def preprocess_email(email):
    # Remove HTML tags
    email = re.sub(r'<.*?>', '', email)
    # Remove punctuation and numbers
    email = re.sub(r'[^a-zA-Z\s]', '', email)
    # Convert to lowercase
    email = email.lower()
    # Tokenize
    tokens = word_tokenize(email)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Apply preprocessing to the email content
df['body'] = df['body'].astype(str).fillna('')
df['cleaned_email'] = df['body'].apply(preprocess_email)

In [6]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_email'])

In [7]:
# Labels: 1 for phishing, 0 for legitimate
labels = df['label']
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Train the model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9853147746137148
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      3490
           1       1.00      0.98      0.99      4341

    accuracy                           0.99      7831
   macro avg       0.98      0.99      0.99      7831
weighted avg       0.99      0.99      0.99      7831



In [8]:
joblib.dump(model, 'phishing_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']