# This file is only for playing around!

In [1]:
import argparse
import os
import re
import sys
import json
from typing import List, Tuple, Optional

import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
accuracy_score, precision_recall_fscore_support,
roc_auc_score, classification_report, confusion_matrix
)
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


from imblearn.under_sampling import RandomUnderSampler

In [3]:
# Optional dependency: if importing shap fails for any reason, the name is
# bound to None instead of raising, so the rest of the notebook still runs.
try:
    import shap # type: ignore
except Exception:
    shap = None


# Optional dependency: same fallback for LIME's text explainer; the name is
# bound to None when the import fails.
try:
    from lime.lime_text import LimeTextExplainer # type: ignore
except Exception:
    LimeTextExplainer = None


import matplotlib.pyplot as plt
from email import policy
from email.parser import BytesParser


# Seed passed as `random_state` to the train/test split and to the
# logistic-regression classifier in later cells.
RANDOM_STATE = 42

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Directory holding the CSV datasets. Set the THESIS_DATA_DIR environment
# variable to point somewhere else; the default is the original location, so
# existing runs behave exactly as before.
DATA_DIR = os.environ.get("THESIS_DATA_DIR", "/Users/merterol/Desktop/Thesis Code/data")

# Load the three corpora used in this notebook from DATA_DIR.
base = pd.read_csv(os.path.join(DATA_DIR, "baseline_spam-ham.csv"))
assassin = pd.read_csv(os.path.join(DATA_DIR, "spam_assassin.csv"))
zen = pd.read_csv(os.path.join(DATA_DIR, "zenodo.csv"))

In [5]:
# Show the first five rows, then the number of missing values per column.
print(zen.head(n=5))
print(zen.isnull().sum())

# Keep only the rows that have no missing value in any column.
zen = zen.dropna(axis=0, how="any")

                                              sender  \
0                   Young Esposito <Young@iworld.de>   
1                       Mok <ipline's1983@icable.ph>   
2  Daily Top 10 <Karmandeep-opengevl@universalnet...   
3                 Michael Parker <ivqrnai@pobox.com>   
4  Gretchen Suggs <externalsep1@loanofficertool.com>   

                                         receiver  \
0                     user4@gvc.ceas-challenge.cc   
1                   user2.2@gvc.ceas-challenge.cc   
2                   user2.9@gvc.ceas-challenge.cc   
3  SpamAssassin Dev <xrh@spamassassin.apache.org>   
4                   user2.2@gvc.ceas-challenge.cc   

                              date  \
0  Tue, 05 Aug 2008 16:31:02 -0700   
1  Tue, 05 Aug 2008 18:31:03 -0500   
2  Tue, 05 Aug 2008 20:28:00 -1200   
3  Tue, 05 Aug 2008 17:31:20 -0600   
4  Tue, 05 Aug 2008 19:31:21 -0400   

                                             subject  \
0                          Never agree to be a loser   
1  

# Naive Bayes

In [6]:
# Train a Naive Bayes baseline on one combined text feature built from the
# sender, receiver, date, subject and body columns of `zen`.

# Every column used in the concatenation must exist. "date" is part of the
# feature, so it is guarded here together with the other four columns;
# a missing column is filled with empty strings.
text_columns = ["sender", "receiver", "date", "subject", "body"]
for col in text_columns:
    if col not in zen.columns:
        zen[col] = ""

# Cast each part to str before joining so a non-string column cannot break
# the concatenation. Column order and the single-space separator are unchanged.
combo = zen[text_columns[0]].astype(str)
for col in text_columns[1:]:
    combo = combo + " " + zen[col].astype(str)

target = zen.label.astype(int)
predictors = combo

# Hold out 20% of the rows for testing, stratified on the label and seeded
# with RANDOM_STATE.
X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.2, random_state=RANDOM_STATE, stratify=target
)

# TF-IDF over lowercased, accent-stripped unigrams and bigrams (terms seen in
# fewer than two documents are ignored), followed by multinomial Naive Bayes.
nb_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        lowercase=True,
        strip_accents='unicode',
        ngram_range=(1,2),
        min_df=2
    )),
    ('clf', MultinomialNB())
])

nb_pipeline.fit(X_train, y_train)
y_pred_nb = nb_pipeline.predict(X_test)
# Second column of predict_proba, used as the score for ROC AUC.
y_proba_nb = nb_pipeline.predict_proba(X_test)[:, 1]

print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_nb))
# Compute the AUC once; it is reported rounded and at full precision.
roc_auc_nb = roc_auc_score(y_test, y_proba_nb)
print(f"ROC AUC Score: {roc_auc_nb:.4f}")
print(f"ROC AUC Score: {roc_auc_nb}")


Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3368
           1       1.00      0.99      0.99      4366

    accuracy                           0.99      7734
   macro avg       0.99      0.99      0.99      7734
weighted avg       0.99      0.99      0.99      7734

Confusion Matrix:
[[3361    7]
 [  37 4329]]
ROC AUC Score: 0.9997
ROC AUC Score: 0.9996975114330886


In [7]:
# Second evaluation set: inspect the assassin frame, then drop incomplete rows.
print(assassin.head(n=5))
print(assassin.isnull().sum())
assassin = assassin.dropna(axis=0, how="any")

# Score the already-fitted Naive Bayes pipeline on the assassin texts and
# report per-class metrics against the assassin labels.
assassin_pred_nb = nb_pipeline.predict(assassin['text'])
print(classification_report(assassin['target'], assassin_pred_nb))

                                                text  target
0  From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...       0
1  From gort44@excite.com Mon Jun 24 17:54:21 200...       1
2  From fork-admin@xent.com Mon Jul 29 11:39:57 2...       1
3  From dcm123@btamail.net.cn Mon Jun 24 17:49:23...       1
4  From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...       0
text      0
target    0
dtype: int64
              precision    recall  f1-score   support

           0       0.69      1.00      0.82      3900
           1       1.00      0.08      0.14      1896

    accuracy                           0.70      5796
   macro avg       0.85      0.54      0.48      5796
weighted avg       0.79      0.70      0.60      5796



# Logistic Regression

In [8]:
# TF-IDF features: lowercased, accents stripped, unigrams and bigrams,
# ignoring terms that appear in fewer than two documents.
lr_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    strip_accents='unicode',
    lowercase=True,
)
# Logistic regression with a raised iteration cap and the notebook-wide seed.
lr_clf = LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)
lr_pipeline = Pipeline(steps=[('tfidf', lr_tfidf), ('clf', lr_clf)])

# Fit on the training split, then predict labels and scores for the test split.
lr_pipeline.fit(X_train, y_train)
y_pred_lr = lr_pipeline.predict(X_test)
# Second column of predict_proba, used as the score for ROC AUC.
y_proba_lr = lr_pipeline.predict_proba(X_test)[:, 1]

print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))
# The AUC is reported twice: rounded to four decimals and at full precision.
roc_auc_lr = roc_auc_score(y_test, y_proba_lr)
print(f"ROC AUC Score: {roc_auc_lr:.4f}")
print(f"ROC AUC Score: {roc_auc_lr}")

  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3368
           1       1.00      1.00      1.00      4366

    accuracy                           1.00      7734
   macro avg       1.00      1.00      1.00      7734
weighted avg       1.00      1.00      1.00      7734

Confusion Matrix:
[[3350   18]
 [  19 4347]]
ROC AUC Score: 0.9996
ROC AUC Score: 0.9996148847224776


In [9]:
# Score the fitted logistic-regression pipeline on the assassin texts and
# report per-class metrics against the assassin labels.
assassin_pred_lr = lr_pipeline.predict(assassin['text'])
print(classification_report(assassin['target'], assassin_pred_lr))

              precision    recall  f1-score   support

           0       0.68      1.00      0.81      3900
           1       1.00      0.01      0.02      1896

    accuracy                           0.68      5796
   macro avg       0.84      0.51      0.41      5796
weighted avg       0.78      0.68      0.55      5796



### Summary LR & NB:

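In-domain performance is excellent for both models, but out-of-domain they struggle a lot: on the SpamAssassin set, class-1 recall drops to 0.08 (NB) and 0.01 (LR) --> trying feature engineering next.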

### Adding top level domain screening

In [11]:
# Resolve the data directory inside this cell so it runs without relying on
# state from other cells. THESIS_DATA_DIR overrides the default, which is the
# original hard-coded location.
DATA_DIR = os.environ.get("THESIS_DATA_DIR", "/Users/merterol/Desktop/Thesis Code/data")
tld_info = pd.read_csv(os.path.join(DATA_DIR, "Suspicious_TLDs_List.csv"))

# Lookup tables keyed by the `metadata_tld` column: one for the severity
# column and one for the category column. On duplicate keys the last row wins.
tld_risk = dict(zip(tld_info['metadata_tld'], tld_info['metadata_severity']))
tld_categories = dict(zip(tld_info['metadata_tld'], tld_info['metadata_category']))