In [3]:
#imports
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold
import lime
import lime.lime_text
from sklearn.pipeline import make_pipeline

In [4]:
# Data cleaning and processing
def clean_email(text):
    """Normalise one raw email body into lowercase text for vectorizing.

    Steps, in order: delete ``<...>`` HTML-style tags, delete URL-like tokens
    that start with ``http`` (in any letter case), delete digit runs, strip
    ASCII punctuation, then lowercase and trim surrounding whitespace.
    Removed spans are deleted rather than replaced, so interior whitespace
    is left exactly as it was.

    Args:
        text: The email body as a ``str``.

    Returns:
        The cleaned string.
    """
    text = re.sub(r'<[^>]+>', '', text)
    # URLs are stripped before lowercasing, so the match has to ignore case;
    # otherwise "HTTP://..." survives and is later squashed into a junk token
    # once its punctuation is removed.
    text = re.sub(r'http\S+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower().strip()
    return text

def preprocess(df):
    """Add a ``clean_email`` column holding the cleaned text of each ``body``.

    The frame is modified in place and also returned, so both
    ``preprocess(df)`` and ``df = preprocess(df)`` work.

    Args:
        df: DataFrame with a ``body`` column of email text.

    Returns:
        The same DataFrame object, now with a ``clean_email`` column.
    """
    # Fill missing bodies first: astype(str) on its own would turn a missing
    # value into the literal text "nan"/"None", which would then be treated
    # as a real word downstream.
    df['clean_email'] = df['body'].fillna('').astype(str).apply(clean_email)
    return df

In [5]:
# Data cleaning: read the raw email dataset and stop immediately if either
# of the two columns the rest of the notebook relies on is absent.
df = pd.read_csv('./data/emails.csv')  # change path as needed
assert {'body', 'label'}.issubset(df.columns), "Missing required columns."

# ========== 2. Text Preprocessing ==========
def clean_email(text):
    """Strip markup, links, digits and punctuation from one email body.

    This redefines the function of the same name from the earlier cell, so
    after a top-to-bottom run this is the definition in effect.

    Args:
        text: The email body as a ``str``.

    Returns:
        The lowercased, whitespace-trimmed text with the spans above deleted.
    """
    text = re.sub(r'<[^>]+>', '', text)  # remove HTML tags
    # remove URLs; IGNORECASE because lowercasing only happens at the end,
    # so an upper-case "HTTP://..." link would otherwise slip through
    text = re.sub(r'http\S+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\d+', '', text)      # remove digits
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    return text.lower().strip()

# Fill missing bodies before the string cast; astype(str) alone would turn a
# missing value into the literal text "nan" and feed it to the cleaner as a word.
df['clean_email'] = df['body'].fillna('').astype(str).apply(clean_email)


In [6]:
# Reload the dataset from disk, check its schema, then add the cleaned-text column.
df = pd.read_csv('./data/emails.csv')  # Update path if necessary
assert all(col in df.columns for col in ('body', 'label')), "Missing required columns."
df = preprocess(df)

# Inputs are the cleaned email text; targets are the labels shipped with the data.
X, y = df['clean_email'], df['label']

In [7]:
# Vectorizing: TF-IDF over unigrams and bigrams, capped at 5000 features.
# NOTE(review): the vectorizer is fitted on every row, i.e. before the
# train/test splits made in the following cells, so its vocabulary and IDF
# weights also see the held-out emails — confirm this is acceptable.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
y = df['label']
X = vectorizer.fit_transform(df['clean_email'])

In [8]:
# Different models: hold out 20% of the rows (random_state=42) and fit a
# logistic-regression baseline on the remaining 80%.
# NOTE(review): `kf` is built and queried below, but the value returned by
# get_n_splits() is discarded and no visible cell uses `kf` afterwards —
# confirm whether 10-fold cross-validation was intended.
kf = KFold(n_splits=10)
kf.get_n_splits(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=1000)
# Left as the last expression so the notebook displays the fitted estimator.
model.fit(X_train, y_train)


In [9]:
# Score the logistic-regression model on the held-out rows and print
# per-class precision / recall / F1.
y_pred = model.predict(X_test)
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       496
           1       1.00      0.92      0.96        76

    accuracy                           0.99       572
   macro avg       0.99      0.96      0.98       572
weighted avg       0.99      0.99      0.99       572



In [10]:
# Gaussian Naive Bayes on the same 80/20 split settings; the TF-IDF matrix is
# converted to a dense array before splitting.
X_train, X_test, y_train, y_test = train_test_split(
    X.toarray(), y, test_size=0.2, random_state=42
)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       496
           1       0.93      0.87      0.90        76

    accuracy                           0.97       572
   macro avg       0.95      0.93      0.94       572
weighted avg       0.97      0.97      0.97       572



In [11]:
from sklearn import svm
# Support-vector classifier with default settings, trained and scored on the
# same 80/20 split settings as the other models.
X_train, X_test, y_train, y_test = train_test_split(
    X.toarray(), y, test_size=0.2, random_state=42
)
clf = svm.SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")


Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       496
           1       1.00      0.96      0.98        76

    accuracy                           0.99       572
   macro avg       1.00      0.98      0.99       572
weighted avg       0.99      0.99      0.99       572



In [19]:
from sklearn.ensemble import RandomForestClassifier
# Random forest (depth capped at 20, fixed seed) on the same 80/20 split
# settings as the other models.
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y, test_size=0.2, random_state=42)

# An SVC used to be constructed here and was overwritten before any use;
# dropping it also removes this cell's reliance on the `svm` import that
# lives in a different cell.
clf = RandomForestClassifier(max_depth=20, random_state=0)
y_pred = clf.fit(X_train, y_train).predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       496
           1       1.00      0.89      0.94        76

    accuracy                           0.99       572
   macro avg       0.99      0.95      0.97       572
weighted avg       0.99      0.99      0.99       572



In [13]:
# Rank the logistic-regression coefficients once: the ten largest are treated
# as phishing indicators, the ten smallest as legitimate indicators.
# NOTE(review): assumes the positive class (label 1) is phishing — confirm.
feature_names = np.array(vectorizer.get_feature_names_out())
coefs = model.coef_[0]
ranked_idx = np.argsort(coefs)  # ascending by coefficient
top_phishing_idx = ranked_idx[-10:]
top_legit_idx = ranked_idx[:10]

In [14]:
# Print each group of indicative n-grams with its coefficient, phishing first.
for heading, top_idx in (
    ("\nTop indicative words for phishing:", top_phishing_idx),
    ("\nTop indicative words for legitimate:", top_legit_idx),
):
    print(heading)
    for word, coef in zip(feature_names[top_idx], coefs[top_idx]):
        print(f"{word}: {coef:.4f}")


Top indicative words for phishing:
this: 1.3220
company: 1.4602
click here: 1.4798
http: 1.9205
our: 1.9692
click: 2.1652
com: 2.2415
free: 2.5656
your: 4.1485
you: 5.6629

Top indicative words for legitimate:
of: -3.1070
in: -1.9988
the: -1.9029
language: -1.8777
university: -1.7201
linguistics: -1.5473
edu: -1.4992
de: -1.3688
conference: -1.2213
english: -1.2133


In [15]:
# Split the cleaned text with the same test_size / random_state as the model
# split so the raw strings can be looked up for explanation.
raw_X_train, raw_X_test = train_test_split(df['clean_email'], test_size=0.2, random_state=42)

# Chain the already-fitted vectorizer and logistic-regression model so LIME
# can pass plain strings straight to predict_proba.
pipeline = make_pipeline(vectorizer, model)
# LIME builds explanations from randomly perturbed copies of the text; seeding
# the explainer makes the explanation reproducible across runs.
explainer = lime.lime_text.LimeTextExplainer(
    class_names=['Legitimate', 'Phishing'],
    random_state=42,
)

In [16]:
# Explain the pipeline's prediction for one held-out email, reporting the ten
# most influential features.
idx = 0
instance_text = raw_X_test.iloc[idx]
print("\nExplaining instance:", instance_text)
exp = explainer.explain_instance(
    instance_text,
    pipeline.predict_proba,
    num_features=10,
)


Explaining instance: call for papers  exhibits                                           appel aux communications  expositions                      international conference on natural language processing and industrial applications nlp  ia     special accent on computer assisted language learning    attention speciale portee a l  enseignement de la langue    august  aout      moncton  new  brunswick  canada come to canada this summer    iwnlg august    in niagara  onthe  lake coling  acl  workshops august  in montreal nlp  ia  call august  in moncton topics of interest  the nlp study group  gretal  at l  universite de moncton is organizing its second international conference on nlp and industrial applications  this year a special attention is given to computer assisted language learning  teaching  papers are invited on all aspects of natural language processing  including  but not limited to   computer assisted language learning  teaching   natural language understanding and generatio

In [21]:
import os
# Write the LIME explanation as a standalone HTML page, creating the output
# folder first if it does not exist yet.
output_dir = './generated'
os.makedirs(output_dir, exist_ok=True)
explanation_path = os.path.join(output_dir, 'lime_explanation.html')
exp.save_to_file(explanation_path)

In [22]:
import joblib
# Persist the fitted classifier and its vectorizer so they can be reloaded
# together for inference.
for artifact, target in (
    (model, './generated/phishing_model.pkl'),
    (vectorizer, './generated/tfidf_vectorizer.pkl'),
):
    joblib.dump(artifact, target)

print("\nModel and vectorizer saved.")


Model and vectorizer saved.
