<a href="https://colab.research.google.com/github/uthumss/Phishing_Email_Detection/blob/main/Phishing_Email_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd

df = pd.read_csv('Phishing_Email.csv')
df.head()


Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [12]:
df['Email Type'].value_counts()

Unnamed: 0_level_0,count
Email Type,Unnamed: 1_level_1
Safe Email,11322
Phishing Email,7328


In [17]:
sampled_dfs = []

# Iterate through each unique class in the 'Email Type' column
for email_type in df['Email Type'].unique():
    # Sample 1000 rows from each class
    sampled_df = df[df['Email Type'] == email_type].sample(n=1000, random_state=42, replace=True)
    # Append the sampled dataframe to the list
    sampled_dfs.append(sampled_df)

# Concatenate all the sampled dataframes into a new dataframe
df = pd.concat(sampled_dfs)

# Print the value counts of the new dataframe to verify the sampling
df['Email Type'].value_counts()


Unnamed: 0_level_0,count
Email Type,Unnamed: 1_level_1
Safe Email,1000
Phishing Email,1000


In [18]:
df['label_num'] = df['Email Type'].map({'Phishing Email': 0, 'Safe Email': 1})
df.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type,label_num
4294,4294,Sean O'Donnell wrote:\n> Doesnt answer your qu...,Safe Email,1
7354,7355,empty,Safe Email,1
16347,16348,"Hi folks,I have just taken delivery of three A...",Safe Email,1
11810,11811,re : seismic data via satellite fyi - - - - - ...,Safe Email,1
15993,15994,enron mentions enron discusses credit line of ...,Safe Email,1


In [19]:
df['label_num'].value_counts()

Unnamed: 0_level_0,count
label_num,Unnamed: 1_level_1
1,1000
0,1000


In [20]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [21]:
df['vector'] = df['Email Text'].astype(str).apply(lambda x: nlp(x).vector)
df.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type,label_num,vector
4294,4294,Sean O'Donnell wrote:\n> Doesnt answer your qu...,Safe Email,1,"[-0.0021050689, -0.3438314, 0.043865297, -0.10..."
7354,7355,empty,Safe Email,1,"[0.94832253, -0.40132424, 0.8501252, -0.468703..."
16347,16348,"Hi folks,I have just taken delivery of three A...",Safe Email,1,"[-0.13039935, -0.33465102, -0.037675023, -0.06..."
11810,11811,re : seismic data via satellite fyi - - - - - ...,Safe Email,1,"[-0.008120069, -0.21134396, 0.08685339, 0.0239..."
15993,15994,enron mentions enron discusses credit line of ...,Safe Email,1,"[0.002534582, -0.1568654, 0.037821356, 0.01606..."


In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.vector.values,
    df.label_num,
    test_size=0.2,
    random_state=2022
)

In [24]:
# Converting training and testing data to 2d arrays
import numpy as np

X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

In [25]:
# USing Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler


scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)


clf = MultinomialNB()
clf.fit(scaled_train_embed, y_train)

In [27]:
from sklearn.metrics import classification_report

y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.79      0.78       195
           1       0.79      0.77      0.78       205

    accuracy                           0.78       400
   macro avg       0.78      0.78      0.78       400
weighted avg       0.78      0.78      0.78       400



In [28]:
# Using knn

from  sklearn.neighbors import KNeighborsClassifier

#1. creating a KNN model object
clf = KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean')

#2. fit with all_train_embeddings and y_train
clf.fit(X_train_2d, y_train)

#3. get the predictions for all_test_embeddings and store it in y_pred
y_pred = clf.predict(X_test_2d)

#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.87      0.85       195
           1       0.87      0.84      0.86       205

    accuracy                           0.85       400
   macro avg       0.86      0.86      0.85       400
weighted avg       0.86      0.85      0.86       400



Confusion Matrix

In [22]:
#finally print the confusion matrix for the best model
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

from matplotlib import pyplot as plt
import seaborn as sn
plt.figure(figsize = (10,7))
sn.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Prediction')
plt.ylabel('Truth')