In [1]:
#imports
import pandas as pd
import numpy as np
import re
import string
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn import svm
import lime
import lime.lime_text
from sklearn.pipeline import make_pipeline

In [2]:
# Record the operating system / hardware this notebook was executed on.
# NOTE(review): the displayed uname_result includes the machine's network node name;
# consider clearing this output before sharing the notebook.
import platform
platform.uname()

uname_result(system='Darwin', node='crc-dot1x-nat-10-239-237-84.bu.edu', release='24.2.0', version='Darwin Kernel Version 24.2.0: Fri Dec  6 18:40:14 PST 2024; root:xnu-11215.61.5~2/RELEASE_ARM64_T8103', machine='arm64')

In [3]:
def clean_email(text: str) -> str:
    """Normalise raw email text for bag-of-words style vectorisation.

    The steps run in a fixed order: HTML tags, then URLs (anything starting
    with ``http`` up to the next whitespace), then digit runs are deleted;
    ASCII punctuation is dropped; finally the text is lower-cased and
    surrounding whitespace is trimmed.

    Args:
        text: Raw email body.

    Returns:
        The cleaned, lower-cased text.
    """
    # Order matters: tags must go before punctuation stripping removes the
    # angle brackets, and URLs before the ':' and '/' characters disappear.
    for pattern in (r'<[^>]+>', r'http\S+', r'\d+'):
        text = re.sub(pattern, '', text)
    # TODO: add a count of URLs to email data
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return text.translate(strip_punctuation).lower().strip()

def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with an added ``clean_email`` column.

    The new column holds ``clean_email`` applied to every value of the
    ``body`` column. The input frame is left untouched, so re-running the
    calling cell always starts from the same data.

    Args:
        df: Frame containing a ``body`` column with raw email text.

    Returns:
        A new DataFrame with all original columns plus ``clean_email``.

    Raises:
        KeyError: If ``df`` has no ``body`` column.
    """
    # Missing bodies become empty strings; a plain astype(str) would turn NaN
    # into the literal text "nan", which then shows up as a spurious token.
    bodies = df['body'].fillna('').astype(str)
    return df.assign(clean_email=bodies.apply(clean_email))

In [4]:
# Load the augmented email corpus (update the path if the file lives elsewhere).
df = pd.read_csv('./data/emails_augmented.csv')

# Both columns are needed below: `body` feeds the text cleaner, `label` is the target.
assert {'body', 'label'}.issubset(df.columns), "Missing required columns."

df = preprocess(df)
X, y = df['clean_email'], df['label']

In [5]:
# Vectorizing: TF-IDF over unigrams and bigrams, keeping at most 5000 terms.
# NOTE(review): the vectorizer is fitted on the whole corpus before the train/test
# split, so vocabulary and IDF weights are computed from the held-out rows as well.
y = df['label']
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X = vectorizer.fit_transform(df['clean_email'])

In [8]:
# Collector for one summary dict per tuned model (filled by the grid-search cells).
results = []

# Fixed-seed 80/20 hold-out split shared by every model below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

NameError: name 'X' is not defined

In [7]:
# Baseline: untuned logistic regression (max_iter raised to 1000) scored on the hold-out split.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)  # fit() returns the estimator itself
y_pred = model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

NameError: name 'X_train' is not defined

In [1]:
#Logistic Regression with Grid Search and K-fold cross-validation
model = LogisticRegression()

# Two sub-grids because the l1 penalty is only paired with the liblinear solver here.
param_grid = [
    {
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1, 1, 10],
        'solver': ['liblinear'],  # Efficient for sparse text features
        'max_iter': [500]
    },
    {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1, 10],
        'solver': ['saga'],  # Faster on large data
        'max_iter': [1000]
    }
]
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
# Tune on the training split only. Fitting on the full X, y would train the selected
# model on the very rows in X_test that the "Test ..." metrics below are computed on.
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)


# best_estimator_ is the winning configuration refitted on everything passed to fit().
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

results.append({
    'Model': 'LogisticRegression()',
    'Best Params': grid_search.best_params_,
    'CV Accuracy': grid_search.best_score_,
    'Test Accuracy': accuracy_score(y_test, y_pred),
    'Test Precision': precision_score(y_test, y_pred, average='weighted'),
    'Test Recall': recall_score(y_test, y_pred, average='weighted'),
    'Test F1': f1_score(y_test, y_pred, average='weighted')
})
print(classification_report(y_test, y_pred))

NameError: name 'LogisticRegression' is not defined

In [9]:
# Score the tuned model on every row of X / y. X is the full TF-IDF matrix, so this
# report includes rows the model was fitted on and is not a hold-out estimate.
y_pred = best_model.predict(X)
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     39527
           1       0.99      0.99      0.99     42611

    accuracy                           0.99     82138
   macro avg       0.99      0.99      0.99     82138
weighted avg       0.99      0.99      0.99     82138



In [10]:
"""
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y, test_size=0.2, random_state=42)
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
"""

'\nX_train, X_test, y_train, y_test = train_test_split(X.toarray(), y, test_size=0.2, random_state=42)\ngnb = GaussianNB()\ny_pred = gnb.fit(X_train, y_train).predict(X_test)\nprint("Classification Report:")\nprint(classification_report(y_test, y_pred))\n'

In [11]:
# Multinomial Naive Bayes with Grid Search and K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

model = MultinomialNB()
# 4 smoothing strengths x 2 prior settings = 8 candidates.
param_grid = dict(
    alpha=[0.01, 0.1, 0.5, 1.0],
    fit_prior=[True, False],
)

# fit() returns the search object itself, so construction and fitting can be chained.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=kfold,
    verbose=1,
    n_jobs=-1,
).fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# Winning configuration, already refitted on the whole training split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# One summary row per model; the plotting cell reads the "Test ..." keys.
results.append({
    'Model': 'MultinomialNB',
    'Best Params': grid_search.best_params_,
    'CV Accuracy': grid_search.best_score_,
    'Test Accuracy': accuracy_score(y_test, y_pred),
    'Test Precision': precision_score(y_test, y_pred, average='weighted'),
    'Test Recall': recall_score(y_test, y_pred, average='weighted'),
    'Test F1': f1_score(y_test, y_pred, average='weighted'),
})

print(classification_report(y_test, y_pred))

Fitting 10 folds for each of 8 candidates, totalling 80 fits
Best parameters found:  {'alpha': 0.01, 'fit_prior': True}
Best cross-validation score:  0.9334652259929996
              precision    recall  f1-score   support

           0       0.89      0.98      0.93      7815
           1       0.98      0.89      0.93      8613

    accuracy                           0.93     16428
   macro avg       0.93      0.93      0.93     16428
weighted avg       0.94      0.93      0.93     16428



In [12]:
# Score the tuned model on every row of X / y. X is the full TF-IDF matrix, so this
# report includes rows the model was fitted on and is not a hold-out estimate.
y_pred = best_model.predict(X)
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.98      0.93     39527
           1       0.98      0.89      0.93     42611

    accuracy                           0.93     82138
   macro avg       0.94      0.94      0.93     82138
weighted avg       0.94      0.93      0.93     82138



In [13]:
"""
from sklearn import svm
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y, test_size=0.2, random_state=42)
clf = svm.SVC()
y_pred = clf.fit(X_train, y_train).predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
"""

'\nfrom sklearn import svm\nX_train, X_test, y_train, y_test = train_test_split(X.toarray(), y, test_size=0.2, random_state=42)\nclf = svm.SVC()\ny_pred = clf.fit(X_train, y_train).predict(X_test)\nprint("Classification Report:")\nprint(classification_report(y_test, y_pred))\n'

In [14]:
#Support Vector Classification with Grid Search and K-fold cross-validation
model = svm.SVC()

# 3 linear-kernel candidates + 3 x 2 RBF candidates = 9 configurations.
param_grid = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],
    },
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10],
        'gamma': [1e-3, 1e-4]
    }
]

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# NOTE(review): the recorded run of this search (90 fits) ended in a KeyboardInterrupt;
# consider a smaller grid, fewer folds or a subsample if it does not finish.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
# Tune on the training split only. Fitting on the full X, y would train the selected
# model on the very rows in X_test that the "Test ..." metrics below are computed on.
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)


# best_estimator_ is the winning configuration refitted on everything passed to fit().
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

results.append({
    'Model': 'SVC',
    'Best Params': grid_search.best_params_,
    'CV Accuracy': grid_search.best_score_,
    'Test Accuracy': accuracy_score(y_test, y_pred),
    'Test Precision': precision_score(y_test, y_pred, average='weighted'),
    'Test Recall': recall_score(y_test, y_pred, average='weighted'),
    'Test F1': f1_score(y_test, y_pred, average='weighted')
})

print(classification_report(y_test, y_pred))

Fitting 10 folds for each of 9 candidates, totalling 90 fits


KeyboardInterrupt: 

In [None]:
# Score `best_model` on every row of X / y (the full TF-IDF matrix, training rows
# included, so not a hold-out estimate).
# NOTE(review): if the grid search in the preceding cell did not finish, `best_model`
# still refers to the estimator selected by an earlier cell.
y_pred = best_model.predict(X)
print(classification_report(y, y_pred))

In [None]:
"""
#default random forest 
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(max_depth=20, random_state=0)
y_pred = clf.fit(X_train, y_train).predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
"""

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.92      0.94      7815
           1       0.93      0.97      0.95      8613

    accuracy                           0.95     16428
   macro avg       0.95      0.94      0.95     16428
weighted avg       0.95      0.95      0.95     16428



In [15]:
#Random forest with Grid Search and K-fold cross-validation
# Seeded so repeated runs grow the same forests and select the same configuration.
model = RandomForestClassifier(random_state=42)

# 2 x 3 x 2 x 2 = 24 candidate configurations.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 2]
}

kfold = KFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
# Tune on the training split only. Fitting on the full X, y would train the selected
# model on the very rows in X_test that the "Test ..." metrics below are computed on.
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)


# best_estimator_ is the winning configuration refitted on everything passed to fit().
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

results.append({
    'Model': 'Random Forest Classifier',
    'Best Params': grid_search.best_params_,
    'CV Accuracy': grid_search.best_score_,
    'Test Accuracy': accuracy_score(y_test, y_pred),
    'Test Precision': precision_score(y_test, y_pred, average='weighted'),
    'Test Recall': recall_score(y_test, y_pred, average='weighted'),
    'Test F1': f1_score(y_test, y_pred, average='weighted')
})

print(classification_report(y_test, y_pred))

Fitting 10 folds for each of 252 candidates, totalling 2520 fits


KeyboardInterrupt: 

In [None]:
# Score `best_model` on every row of X / y (the full TF-IDF matrix, training rows
# included, so not a hold-out estimate).
# NOTE(review): if the grid search in the preceding cell did not finish, `best_model`
# still refers to the estimator selected by an earlier cell.
y_pred = best_model.predict(X)
print(classification_report(y, y_pred))

In [None]:
# Per-term weights of the linear TF-IDF classifier.
# The grid-search cells rebind `model` to fresh, unfitted estimators that expose no
# `coef_`; in that case refit the baseline logistic regression so this cell (and the
# later cells that reuse `model`) work when the notebook is run top to bottom.
if not hasattr(model, 'coef_'):
    model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

feature_names = np.array(vectorizer.get_feature_names_out())
coefs = model.coef_[0]

# argsort is ascending: the tail holds the ten largest weights, the head the ten most
# negative ones (reported further down as phishing / legitimate indicators).
coef_order = np.argsort(coefs)
top_phishing_idx = coef_order[-10:]
top_legit_idx = coef_order[:10]

In [None]:
# Turn the accumulated per-model summary dicts into a DataFrame for display and plotting.
# NOTE(review): this rebinds `results` from a list to a DataFrame, so re-running an
# earlier cell that calls results.append(...) afterwards no longer targets the list.
results = pd.DataFrame(results)
print(results)

In [None]:


# Reshape to long form (one row per model / metric pair) so seaborn can group bars by metric.
score_columns = ['Test Accuracy', 'Test Precision', 'Test Recall', 'Test F1']
metrics_df = results.melt(
    id_vars='Model',
    value_vars=score_columns,
    var_name='Metric',
    value_name='Score',
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=metrics_df, x='Model', y='Score', hue='Metric', ax=ax)
ax.set_title('Model Performance Comparison')
ax.set_ylim(0, 1)  # all four metrics live in [0, 1]
ax.tick_params(axis='x', labelrotation=15)
# Park the legend outside the axes so it never covers the bars.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
 

# Raw (un-augmented) email table for BERT fine-tuning.
# NOTE(review): this rebinds `df`, which earlier cells populated with a `clean_email`
# column; cells further down that read df['clean_email'] depend on which load ran last.
df = pd.read_csv('data/emails.csv')
print(df.head())

# Fixed-seed 80/20 train / validation split.
train_df, val_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

# WordPiece tokenizer belonging to the bert-base-uncased checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


class EmailDataset(Dataset):
    """Map-style dataset that tokenizes one email body per item on demand.

    Args:
        df: Frame with a ``body`` column (text) and a ``label`` column
            (anything ``int()`` accepts).
        tokenizer: Callable accepting ``(text, truncation=, padding=,
            max_length=, return_tensors=)`` and returning a mapping with
            ``input_ids`` and ``attention_mask`` tensors that carry a leading
            batch axis of size one.
        max_len: Length every sequence is padded / truncated to.
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` tensors for row ``idx``."""
        row = self.df.iloc[idx]  # positional lookup, done once for both fields
        text = str(row['body'])
        label = int(row['label'])
        encodings = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        # Drop only the leading batch axis added for the single text. A bare squeeze()
        # would also remove the sequence axis when max_len == 1.
        input_ids = encodings['input_ids'].squeeze(0)
        attention_mask = encodings['attention_mask'].squeeze(0)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long)
        }


# Training configuration. NUM_EPOCHS drives both the scheduler length and the loop,
# so the two can no longer drift apart.
NUM_EPOCHS = 3
OUTPUT_DIR = './phishing-bert-model'

train_ds = EmailDataset(train_df, tokenizer)
val_ds = EmailDataset(val_df, tokenizer)

train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=8)


# One output unit per distinct label value found in the data.
num_labels = df['label'].nunique()
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels
)

# NOTE(review): only CUDA is probed; on machines without it training runs on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)


optimizer = AdamW(model.parameters(), lr=2e-5)
# The scheduler is stepped once per batch, so its horizon is batches-per-epoch x epochs.
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)


for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch+1}")

    # ---- training pass ----
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the classification loss itself.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Train Loss: {avg_train_loss:.4f}")

    # ---- validation pass (no gradients, eval-mode layers) ----
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    acc = correct / total
    print(f"Validation Accuracy: {acc:.2%}")

    # Checkpoint after every epoch: an epoch takes hours on CPU, and saving only after
    # the final one loses all progress if the run is interrupted. The directory is
    # overwritten each time, so the end state matches a single save at the end.
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

print("Model saved to ./phishing-bert-model")


                                             subject  \
0                          Never agree to be a loser   
1                             Befriend Jenna Jameson   
2                               CNN.com Daily Top 10   
3  Re: svn commit: r619753 - in /spamassassin/tru...   
4                         SpecialPricesPharmMoreinfo   

                                                body  label  
0  Buck up, your troubles caused by small dimensi...      1  
1  \nUpgrade your sex and pleasures with these te...      1  
2  >+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...      1  
3  Would anyone object to removing .so from this ...      0  
4  \nWelcomeFastShippingCustomerSupport\nhttp://7...      1  


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1


 31%|███       | 2520/8214 [1:24:17<3:10:28,  2.01s/it]


KeyboardInterrupt: 

In [None]:

from transformers import BertTokenizer, BertForSequenceClassification
import shap
import torch


# Reload the fine-tuned tokenizer / classifier from the directory written by the
# training cell and switch the model to inference mode. This rebinds the notebook-level
# `tokenizer` and `model` names; the model is left on its default device.
tokenizer = BertTokenizer.from_pretrained('./phishing-bert-model')
model = BertForSequenceClassification.from_pretrained('./phishing-bert-model')
model.eval()


# Hand-written sample message used as the single input for the prediction and SHAP
# explanation below.
email_text = """
California State Department of Vehicles(DMV) Final Notice: Enforcement Penalties Begin on July 25.
Our records show that as of today, you still have an outstanding traffic ticket. In accordance with  California State Administrative Code 15C-16.003, If you do not complete payment by July 24, 2025, we will take the following actions:

1. Report to the DMV violation database
2. Suspend your vehicle registration starting July 25
3. Suspend driving privileges for 30 days
4. Transfer to a toll booth and charge a 35% service fee
5. You may be prosecuted and your credit score will be affected
Pay Now:

https://cadmv.com-tieq.run/pay

Please pay immediately before enforcement to avoid license suspension and further legal disputes.
(Reply Y and re-open this message to click the link, or copy it to your browser.)
"""


def predict_proba(texts):
    """Return class probabilities for one or more email texts.

    Args:
        texts: A single string, or any iterable of strings (list, tuple or
            NumPy array). SHAP calls this with a NumPy array of masked text
            variants, which the tokenizer does not accept directly, so the
            input is normalised to a plain ``list[str]`` first.

    Returns:
        ``numpy.ndarray`` of shape ``(len(texts), num_labels)`` holding the
        softmax probabilities, one row per input text.
    """
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = [str(t) for t in texts]

    encodings = tokenizer(
        texts,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128  # same sequence cap as EmailDataset's default max_len
    )
    # Run on whatever device the model lives on, and bring the result back to the CPU
    # before converting to NumPy.
    encodings = encodings.to(model.device)
    with torch.no_grad():
        logits = model(**encodings).logits
        probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy()
    return probs


# Point prediction for the sample email: most probable class and its probability.
probs = predict_proba([email_text])[0]
pred_label = probs.argmax()
print(f"Prediction: {pred_label} | Prob: {probs[pred_label]:.2%}")

# Token-level attribution: SHAP masks pieces of the text (using the model's own
# tokenizer to decide the pieces) and re-queries predict_proba on the masked variants.
text_masker = shap.maskers.Text(tokenizer)
explainer = shap.Explainer(predict_proba, masker=text_masker)

shap_values = explainer([email_text])
shap.plots.text(shap_values[0])


In [None]:
# Print both ends of the coefficient ranking with one shared loop.
# Relies on feature_names / coefs / top_*_idx computed by the coefficient cell.
for heading, term_idx in (
    ("\nTop indicative words for phishing:", top_phishing_idx),
    ("\nTop indicative words for legitimate:", top_legit_idx),
):
    print(heading)
    for word, coef in zip(feature_names[term_idx], coefs[term_idx]):
        print(f"{word}: {coef:.4f}")


Top indicative words for phishing:
remove: 3.8784
our: 3.9179
investment: 3.9707
men: 4.1906
money: 4.2702
http: 4.6352
you: 5.0106
love: 5.3526
josemonkeyorg: 6.0801
your: 6.5893

Top indicative words for legitimate:
enron: -11.0770
wrote: -10.2384
thanks: -8.1074
url: -6.6347
vince: -6.1347
pm: -6.1271
louise: -5.7562
date: -5.6297
feb: -4.7627
university: -4.7434


In [None]:
# Chain the already-fitted TF-IDF vectorizer and classifier so LIME can feed raw text
# and read class probabilities via pipeline.predict_proba (used in the next cell).
# NOTE(review): `model` is rebound by several earlier cells (grid-search estimators,
# the BERT classifier); it must be the fitted scikit-learn text classifier here.
pipeline = make_pipeline(vectorizer, model)
# class_names are listed in label order, i.e. presumably 0 = Legitimate, 1 = Phishing; confirm.
explainer = lime.lime_text.LimeTextExplainer(class_names=['Legitimate', 'Phishing'])

In [None]:
import random

# Use a private, seeded generator so the same email is explained on every run
# (42 matches the random_state used throughout this notebook) without touching the
# global `random` state.
idx = random.Random(42).randint(0, len(df) - 1)
sample_text = df['clean_email'].iloc[idx]

print("\nExplaining instance:", sample_text)
# LIME perturbs the text and queries the pipeline's probabilities; keep the 10
# highest-weighted terms in the explanation.
exp = explainer.explain_instance(sample_text, pipeline.predict_proba, num_features=10)


Explaining instance: cnn alerts my custom alert






 



alert name my custom alert
girl surives  storey fall

fri  aug   

full story



you have agreed to receive this email from cnncom as a result of your cnncom preference settings
to manage your settings click here
to alter your alert criteria or frequency or to unsubscribe from receiving custom email alerts click here


cable news network one cnn center atlanta georgia 
©  cable news network
a time warner company
all rights reserved
view our privacy policy and terms


In [None]:
import os

# Write the LIME explanation as a standalone HTML page under ./generated,
# creating the folder on first use.
output_dir = './generated'
os.makedirs(output_dir, exist_ok=True)

lime_html_path = os.path.join(output_dir, 'lime_explanation.html')
exp.save_to_file(lime_html_path)

In [None]:
import os

import joblib

# Persist the classifier and its vectorizer together: they are only usable as a pair.
# Create the target folder here as well, so this cell does not depend on another cell
# having made it first.
# NOTE(review): `model` is rebound by several earlier cells; make sure it is the fitted
# TF-IDF classifier (not a grid-search template or the BERT model) before saving.
os.makedirs('./generated', exist_ok=True)
joblib.dump(model, './generated/phishing_model.pkl')
joblib.dump(vectorizer, './generated/tfidf_vectorizer.pkl')

print("\nModel and vectorizer saved.")


Model and vectorizer saved.
