In [2]:
#imports
import pandas as pd
import numpy as np
import re
import string
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn import svm
import lime
import lime.lime_text
from sklearn.pipeline import make_pipeline

In [3]:
import platform
platform.uname()

uname_result(system='Linux', node='scc-505', release='4.18.0-553.54.1.el8_10.x86_64', version='#1 SMP Tue May 27 22:49:52 EDT 2025', machine='x86_64')

In [4]:
def clean_email(text: str) -> str:
    text = re.sub(r'<[^>]+>', '', text) # remove HTML tags
    text = re.sub(r'http\S+', '', text) # remove URLs
    # TODO: add a count of URLs to email data
    text = re.sub(r'\d+', '', text) # remove numerical text
    text = text.translate(str.maketrans('', '', string.punctuation)) # remove punctuation
    text = text.lower().strip() # lowercase
    return text

def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """
    """
    df['clean_email'] = df['body'].astype(str).apply(clean_email)
    return df

In [5]:
df = pd.read_csv('./data/emails_augmented.csv')  # Update path if necessary
assert 'body' in df.columns and 'label' in df.columns, "Missing required columns."
df = preprocess(df)
X = df['clean_email']
y = df['label']

KeyboardInterrupt: 

In [5]:
#Vectorizing
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X = vectorizer.fit_transform(df['clean_email'])
y = df['label']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
results = []

NameError: name 'X' is not defined

In [7]:
#Different Models
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

NameError: name 'X_train' is not defined

In [1]:
#Logistic Regression with Grid Search and K-fold cross-validation
model = LogisticRegression()

param_grid = [
    {
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1, 1, 10],
        'solver': ['liblinear'],  # Efficient for sparse text features
        'max_iter': [500]
    },
    {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1, 10],
        'solver': ['saga'],  # Faster on large data
        'max_iter': [1000]
    }
]
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X, y) 

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)


best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

results.append({
    'Model': 'LogisticRegression()',
    'Best Params': grid_search.best_params_,
    'CV Accuracy': grid_search.best_score_,
    'Test Accuracy': accuracy_score(y_test, y_pred),
    'Test Precision': precision_score(y_test, y_pred, average='weighted'),
    'Test Recall': recall_score(y_test, y_pred, average='weighted'),
    'Test F1': f1_score(y_test, y_pred, average='weighted')
})
print(classification_report(y_test, y_pred))

NameError: name 'LogisticRegression' is not defined

In [9]:
y_pred = best_model.predict(X)
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     39527
           1       0.99      0.99      0.99     42611

    accuracy                           0.99     82138
   macro avg       0.99      0.99      0.99     82138
weighted avg       0.99      0.99      0.99     82138



In [10]:
"""
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y, test_size=0.2, random_state=42)
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
"""

'\nX_train, X_test, y_train, y_test = train_test_split(X.toarray(), y, test_size=0.2, random_state=42)\ngnb = GaussianNB()\ny_pred = gnb.fit(X_train, y_train).predict(X_test)\nprint("Classification Report:")\nprint(classification_report(y_test, y_pred))\n'

In [11]:
#Gaussian Naive Bayes with Grid Search and K-fold cross-validation
model = MultinomialNB()

param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1.0],  
    'fit_prior': [True, False]
}

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

results.append({
    'Model': 'MultinomialNB',
    'Best Params': grid_search.best_params_,
    'CV Accuracy': grid_search.best_score_,
    'Test Accuracy': accuracy_score(y_test, y_pred),
    'Test Precision': precision_score(y_test, y_pred, average='weighted'),
    'Test Recall': recall_score(y_test, y_pred, average='weighted'),
    'Test F1': f1_score(y_test, y_pred, average='weighted')
})

print(classification_report(y_test, y_pred))

Fitting 10 folds for each of 8 candidates, totalling 80 fits
Best parameters found:  {'alpha': 0.01, 'fit_prior': True}
Best cross-validation score:  0.9334652259929996
              precision    recall  f1-score   support

           0       0.89      0.98      0.93      7815
           1       0.98      0.89      0.93      8613

    accuracy                           0.93     16428
   macro avg       0.93      0.93      0.93     16428
weighted avg       0.94      0.93      0.93     16428



In [12]:
y_pred = best_model.predict(X)
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.98      0.93     39527
           1       0.98      0.89      0.93     42611

    accuracy                           0.93     82138
   macro avg       0.94      0.94      0.93     82138
weighted avg       0.94      0.93      0.93     82138



In [13]:
"""
from sklearn import svm
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y, test_size=0.2, random_state=42)
clf = svm.SVC()
y_pred = clf.fit(X_train, y_train).predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
"""

'\nfrom sklearn import svm\nX_train, X_test, y_train, y_test = train_test_split(X.toarray(), y, test_size=0.2, random_state=42)\nclf = svm.SVC()\ny_pred = clf.fit(X_train, y_train).predict(X_test)\nprint("Classification Report:")\nprint(classification_report(y_test, y_pred))\n'

In [14]:
#Support Vector Classification with Grid Search and K-fold cross-validation
model = svm.SVC()

param_grid = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],
    },
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10],
        'gamma': [1e-3, 1e-4]
    }
]

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X, y) 

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)


best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

results.append({
    'Model': 'SVC',
    'Best Params': grid_search.best_params_,
    'CV Accuracy': grid_search.best_score_,
    'Test Accuracy': accuracy_score(y_test, y_pred),
    'Test Precision': precision_score(y_test, y_pred, average='weighted'),
    'Test Recall': recall_score(y_test, y_pred, average='weighted'),
    'Test F1': f1_score(y_test, y_pred, average='weighted')
})

print(classification_report(y_test, y_pred))

Fitting 10 folds for each of 9 candidates, totalling 90 fits


KeyboardInterrupt: 

In [None]:
y_pred = best_model.predict(X)
print(classification_report(y, y_pred))

In [None]:
"""
#default random forest 
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(max_depth=20, random_state=0)
y_pred = clf.fit(X_train, y_train).predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
"""

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.92      0.94      7815
           1       0.93      0.97      0.95      8613

    accuracy                           0.95     16428
   macro avg       0.95      0.94      0.95     16428
weighted avg       0.95      0.95      0.95     16428



In [15]:
#Random forest with Grid Search and K-fold cross-validation
model = RandomForestClassifier()

param_grid = {
    'n_estimators': [100, 200],              
    'max_depth': [10, 20, None],             
    'max_features': ['sqrt', 'log2'],        
    'min_samples_leaf': [1, 2]               
}

kfold = KFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X, y) 

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)


best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

results.append({
    'Model': 'Random Forest Classifier',
    'Best Params': grid_search.best_params_,
    'CV Accuracy': grid_search.best_score_,
    'Test Accuracy': accuracy_score(y_test, y_pred),
    'Test Precision': precision_score(y_test, y_pred, average='weighted'),
    'Test Recall': recall_score(y_test, y_pred, average='weighted'),
    'Test F1': f1_score(y_test, y_pred, average='weighted')
})

print(classification_report(y_test, y_pred))

Fitting 10 folds for each of 252 candidates, totalling 2520 fits


KeyboardInterrupt: 

In [None]:
y_pred = best_model.predict(X)
print(classification_report(y, y_pred))

In [None]:
feature_names = np.array(vectorizer.get_feature_names_out())
coefs = model.coef_[0]
top_phishing_idx = np.argsort(coefs)[-10:]
top_legit_idx = np.argsort(coefs)[:10]

In [None]:
results = pd.DataFrame(results)
print(results)

In [None]:


# Melt the DataFrame for seaborn
metrics_df = results.melt(id_vars='Model', 
                             value_vars=['Test Accuracy', 'Test Precision', 'Test Recall', 'Test F1'], 
                             var_name='Metric', 
                             value_name='Score')

plt.figure(figsize=(10,6))
sns.barplot(data=metrics_df, x='Model', y='Score', hue='Metric')
plt.title('Model Performance Comparison')
plt.ylim(0, 1)
plt.xticks(rotation=15)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:

import torch
from transformers import BertTokenizer, BertForSequenceClassification
import shap


label_map = {
    0: "Not Phishing",
    1: "Bank Scam",
    2: "Credential Harvesting",
    3: "Fake Invoice Scam",
    4: "Tech Support Scam"
}


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_map)
)
model.eval()

test_email = """
Hello,

We have detected unusual activity on your bank account. 
Please verify your login details immediately to prevent suspension.

Click here to verify now.

Thank you,
Fake Bank Support
"""



def predict_proba(texts):
    encodings = tokenizer(
        texts,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128
    )
    with torch.no_grad():
        logits = model(**encodings).logits
        probs = torch.nn.functional.softmax(logits, dim=1).numpy()
    return probs


probs = predict_proba([test_email])[0]
predicted_label = probs.argmax()
confidence = probs[predicted_label]

print(f"Prediction: {label_map[predicted_label]}")
print(f"Confidence: {confidence:.2%}")


explainer = shap.Explainer(
    predict_proba,
    masker=shap.maskers.Text(tokenizer)
)

shap_values = explainer([test_email])

shap.plots.text(shap_values[0])


In [None]:
print("\nTop indicative words for phishing:")
for word, coef in zip(feature_names[top_phishing_idx], coefs[top_phishing_idx]):
    print(f"{word}: {coef:.4f}")

print("\nTop indicative words for legitimate:")
for word, coef in zip(feature_names[top_legit_idx], coefs[top_legit_idx]):
    print(f"{word}: {coef:.4f}")


Top indicative words for phishing:
remove: 3.8784
our: 3.9179
investment: 3.9707
men: 4.1906
money: 4.2702
http: 4.6352
you: 5.0106
love: 5.3526
josemonkeyorg: 6.0801
your: 6.5893

Top indicative words for legitimate:
enron: -11.0770
wrote: -10.2384
thanks: -8.1074
url: -6.6347
vince: -6.1347
pm: -6.1271
louise: -5.7562
date: -5.6297
feb: -4.7627
university: -4.7434


In [None]:
pipeline = make_pipeline(vectorizer, model)
explainer = lime.lime_text.LimeTextExplainer(class_names=['Legitimate', 'Phishing'])

In [None]:
import random
idx = random.randint(0, len(df) - 1)
print("\nExplaining instance:", df['clean_email'].iloc[idx])
exp = explainer.explain_instance(df['clean_email'].iloc[idx], pipeline.predict_proba, num_features=10)


Explaining instance: cnn alerts my custom alert






 



alert name my custom alert
girl surives  storey fall

fri  aug   

full story



you have agreed to receive this email from cnncom as a result of your cnncom preference settings
to manage your settings click here
to alter your alert criteria or frequency or to unsubscribe from receiving custom email alerts click here


cable news network one cnn center atlanta georgia 
©  cable news network
a time warner company
all rights reserved
view our privacy policy and terms


In [None]:
import os
output_dir = './generated'
os.makedirs(output_dir, exist_ok=True)
exp.save_to_file(os.path.join(output_dir, 'lime_explanation.html'))

In [None]:
import joblib
joblib.dump(model, './generated/phishing_model.pkl')
joblib.dump(vectorizer, './generated/tfidf_vectorizer.pkl')

print("\nModel and vectorizer saved.")


Model and vectorizer saved.
