In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import re

In [15]:
pd.set_option("display.max_row",None)
pd.set_option("display.max_column",None)

In [16]:
df=pd.read_csv('Phishing_Email.csv')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


# Dataset Preprocessing

In [17]:
df.isna().sum()

Unnamed: 0     0
Email Text    16
Email Type     0
dtype: int64

In [18]:
df.dropna(thresh=1*df.shape[1],inplace=True)

In [19]:
df=df.rename(columns={'Email Text' : 'Body' , 'Email Type' : 'Type'})

In [20]:
df=df.drop('Unnamed: 0', axis=1)

In [153]:
emails_df = df.copy()

In [154]:
emails_df['Type'] = emails_df['Type'].replace({'Safe Email': 0, 'Phishing Email': 1}).astype(int)

  emails_df['Type'] = emails_df['Type'].replace({'Safe Email': 0, 'Phishing Email': 1}).astype(int)


In [155]:
emails_df.head()

Unnamed: 0,Body,Type
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",0
1,the other side of * galicismos * * galicismo *...,0
2,re : equistar deal tickets are you still avail...,0
3,\nHello I am your hot lil horny toy.\n I am...,1
4,software at incredibly low prices ( 86 % lower...,1


# Detect Malicious Email Content

In [156]:
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters and spaces
    text = re.sub(r'\s+', ' ', text)         # Remove extra spaces
    return text.lower()                      # Convert to lowercase

In [157]:
emails_df['Body'] = emails_df['Body'].apply(preprocess_text)

In [158]:
def word_count(text):
    return len(text.split())

In [159]:
def count_suspicious_words(text):
    suspicious_words = ['urgent', 'click here', 'limited time', 'verify account', 'password reset', 
                         'act now', 'suspicious activity', 'bank', 'invoice', 'free', 'credit card']
    return sum(word in text for word in suspicious_words)

In [160]:
def malicious_content(text):
    text = text.lower()
    
    malicious_phrases = ['urgent', 'click here', 'limited time', 'verify account', 'password reset', 
                         'act now', 'suspicious activity', 'bank', 'invoice', 'free', 'credit card']
    
    # Count occurrences of malicious phrases
    phrase_count = sum([1 for phrase in malicious_phrases if phrase in text])
    
    # Detect if the email contains HTML tags
    html_tag_pattern = r'<[^>]+>'
    contains_html = int(bool(re.search(html_tag_pattern, text)))

    # Combine all detected suspicious elements into a feature score
    malicious_score = phrase_count + contains_html
    
    # Return 1 if malicious content is detected, 0 otherwise
    return int(malicious_score > 0)


In [161]:
emails_df['word_count'] = emails_df['Body'].apply(word_count)
emails_df['suspicious_word_count'] = emails_df['Body'].apply(count_suspicious_words)
emails_df['malicious_email_content'] = emails_df['Body'].apply(malicious_content)

In [162]:
emails_df.sample(5)

Unnamed: 0,Body,Type,word_count,suspicious_word_count,malicious_email_content
2144,folks my first time posting have a bit of unix...,0,220,0,0
17115,url httpboingboingnet date not suppliedmodern ...,0,71,0,0
9610,save your money buy getting this thing here yo...,1,80,0,0
8845,re netco items you did it was part of another ...,0,607,0,0
10552,url httpdiveintomarkorgarchiveshtmladvancedcss...,0,42,0,0


In [144]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from scipy.sparse import hstack, csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming emails_df is your DataFrame
X_text = emails_df['Body']
y = emails_df['Type']  # Target variable (0 = Safe, 1 = Phishing)

# Split the dataset into training and testing sets
X_train_text, X_test_text, y_train, y_test = train_test_split(X_text, y, train_size=0.7, shuffle=True, random_state=1)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

# Reduce dimensions of TF-IDF features
svd = TruncatedSVD(n_components=100, random_state=42)
X_train_tfidf_reduced = svd.fit_transform(X_train_tfidf)
X_test_tfidf_reduced = svd.transform(X_test_tfidf)

# Get the indices for the training and testing sets
train_indices = X_train_text.index
test_indices = X_test_text.index

# Extract additional features
train_additional_features = emails_df.loc[train_indices, ['word_count', 'malicious_email_content']].values
test_additional_features = emails_df.loc[test_indices, ['word_count', 'malicious_email_content']].values

# Convert additional features to sparse matrices
train_additional_features_sparse = csr_matrix(train_additional_features)
test_additional_features_sparse = csr_matrix(test_additional_features)

# Combine the TF-IDF features with the additional features
X_train_features = hstack([X_train_tfidf_reduced, train_additional_features_sparse])
X_test_features = hstack([X_test_tfidf_reduced, test_additional_features_sparse])

# Optional: Check the shape of the combined features
print("X_train_features shape:", X_train_features.shape)  # Should be (num_samples, 102)
print("X_test_features shape:", X_test_features.shape)    # Should be (num_samples, 102)


X_train_features shape: (13043, 102)
X_test_features shape: (5591, 102)


# Feature Engineering: Combining TF-IDF and Additional Email Features

In [163]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from scipy.sparse import hstack, csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

In [164]:
X_text = emails_df['Body']
y = emails_df['Type']  # Target variable (0 = Safe, 1 = Phishing)

# Split the dataset into training and testing sets
X_train_text, X_test_text, y_train, y_test = train_test_split(X_text, y, train_size=0.7, shuffle=True, random_state=1)

In [165]:
# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

In [166]:
# Reduce dimensions of TF-IDF features
svd = TruncatedSVD(n_components=100, random_state=42)
X_train_tfidf_reduced = svd.fit_transform(X_train_tfidf)
X_test_tfidf_reduced = svd.transform(X_test_tfidf)

In [167]:
# Get the indices for the training and testing sets
train_indices = X_train_text.index
test_indices = X_test_text.index

# Extract additional features
train_additional_features = emails_df.loc[train_indices, ['word_count', 'malicious_email_content']].values
test_additional_features = emails_df.loc[test_indices, ['word_count', 'malicious_email_content']].values

# Convert additional features to sparse matrices
train_additional_features_sparse = csr_matrix(train_additional_features)
test_additional_features_sparse = csr_matrix(test_additional_features)


# Combine the TF-IDF features with the additional features
X_train_features = hstack([X_train_tfidf_reduced, train_additional_features_sparse])
X_test_features = hstack([X_test_tfidf_reduced, test_additional_features_sparse])

# Training the Model

In [168]:
# Decision tree algorithms
classifiers = {
    "J48_DecisionTree": DecisionTreeClassifier(criterion='entropy', random_state=42),
    "Simple_CART": DecisionTreeClassifier(criterion='gini', random_state=42),
    "Random_Forest": RandomForestClassifier(class_weight='balanced', random_state=42),
    "Random_Tree": DecisionTreeClassifier(random_state=42),
    "ADTree": DecisionTreeClassifier(random_state=42),
    "REPTree": DecisionTreeClassifier(random_state=42)
}

In [169]:
for name, clf in classifiers.items():
    clf.fit(X_train_features, y_train)
    y_pred = clf.predict(X_test_features)
    print(f"Results for {name}:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1-Score: {f1_score(y_test, y_pred):.4f}")
    print(f"ROC AUC: {roc_auc_score(y_test, y_pred):.4f}")
    print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}\n")

Results for J48_DecisionTree:
Accuracy: 0.9252
Precision: 0.9027
Recall: 0.9108
F1-Score: 0.9067
ROC AUC: 0.9228
Confusion Matrix: 
[[3142  219]
 [ 199 2031]]

Results for Simple_CART:
Accuracy: 0.9170
Precision: 0.8876
Recall: 0.9067
F1-Score: 0.8971
ROC AUC: 0.9153
Confusion Matrix: 
[[3105  256]
 [ 208 2022]]

Results for Random_Forest:
Accuracy: 0.9530
Precision: 0.9448
Recall: 0.9368
F1-Score: 0.9408
ROC AUC: 0.9502
Confusion Matrix: 
[[3239  122]
 [ 141 2089]]

Results for Random_Tree:
Accuracy: 0.9170
Precision: 0.8876
Recall: 0.9067
F1-Score: 0.8971
ROC AUC: 0.9153
Confusion Matrix: 
[[3105  256]
 [ 208 2022]]

Results for ADTree:
Accuracy: 0.9170
Precision: 0.8876
Recall: 0.9067
F1-Score: 0.8971
ROC AUC: 0.9153
Confusion Matrix: 
[[3105  256]
 [ 208 2022]]

Results for REPTree:
Accuracy: 0.9170
Precision: 0.8876
Recall: 0.9067
F1-Score: 0.8971
ROC AUC: 0.9153
Confusion Matrix: 
[[3105  256]
 [ 208 2022]]



In [170]:
# Multi-Model Majority Voting Classification System
voting_clf = VotingClassifier(estimators=[(name, clf) for name, clf in classifiers.items()], voting='hard')
voting_clf.fit(X_train_features, y_train)
y_pred_voting = voting_clf.predict(X_test_features)

In [152]:
print("Results for Majority Voting Model:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_voting):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_voting):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_voting):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred_voting):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_voting):.4f}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred_voting)}\n")

Results for Majority Voting Model:
Accuracy: 0.9170
Precision: 0.8876
Recall: 0.9067
F1-Score: 0.8971
ROC AUC: 0.9153
Confusion Matrix: 
[[3105  256]
 [ 208 2022]]

