In [20]:
import os
import re
import nltk
import email
import string
import joblib
import pandas as pd
from tqdm import tqdm


import numpy as np
from bs4 import BeautifulSoup
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make output deterministic
RANDOM_STATE = 42

# Ensure NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download("wordnet", quiet=True)

punctuations = list(string.punctuation)
stemmer = nltk.PorterStemmer()

processed_emails_csv_path = os.path.join(".", "processed_emails_cleaned.csv")
suspicious_features_csv_path = os.path.join(".", "suspicious_features_dataset.csv")

In [21]:
stopwords_set = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

def flatten_to_string(parts, prefer_html=True):
    """Flatten nested email parts into clean plain text, optionally preferring HTML."""
    texts = []

    if isinstance(parts, str):
        texts.append(parts)
    elif isinstance(parts, list):
        for part in parts:
            texts += flatten_to_string(part, prefer_html)
    elif hasattr(parts, "get_payload"):
        try:
            ctype = parts.get_content_type()
            payload = parts.get_payload(decode=True)
            charset = parts.get_content_charset() or "utf-8"

            if payload:
                if isinstance(payload, (bytes, bytearray)):
                    payload = payload.decode(charset, errors="ignore")

                if ctype == "text/html" and prefer_html:
                    soup = BeautifulSoup(payload, "lxml")
                    for s in soup(["script", "style"]):
                        s.decompose()
                    text = soup.get_text(separator=" ", strip=True)
                    if text:
                        texts.append(text)
                elif ctype == "text/plain" and not prefer_html:
                    texts.append(payload)
        except Exception:
            pass

    return texts

def clean_text_for_spam(text):
    """Normalize whitespace, remove excessive line breaks, keep URLs and spam phrases."""
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def extract_email_text(path):
    """Read email file and return subject + cleaned body text."""
    with open(path, errors="ignore") as f:
        msg = email.message_from_file(f)

    subject = msg.get("Subject") or ""
    payload = msg.get_payload()
    body = " ".join(flatten_to_string(payload, prefer_html=True))
    body = clean_text_for_spam(body)

    return (subject + " " + body).strip()

def tokenize_for_spam(text, use_stem=True):
    """Tokenize, keep URLs and spam words, stem/lemmatize if desired."""
    if not text:
        return []

    text = text.lower()
    tokens = nltk.word_tokenize(text)
    cleaned = []

    for token in tokens:
        token = token.strip(string.punctuation)
        # Keep most tokens, including URLs, spam words, numbers
        if len(token) <= 1:
            continue
        if use_stem:
            token = stemmer.stem(token)
        else:
            token = lemmatizer.lemmatize(token)
        cleaned.append(token)

    return cleaned

In [22]:
# ---------- Step 1: Load or build cleaned dataset ----------

if not os.path.exists(processed_emails_csv_path):
    index_file = "index"  # file with lines: <label> <filepath>
    labels = {}
    with open(index_file, "r") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) == 2:
                label, filepath = parts
                filename = os.path.basename(filepath)
                labels[filename] = label

    rows = []
    for fname, label in tqdm(labels.items(), desc="Parsing emails"):
        file_path = os.path.join("data", fname)  # adjust folder if needed
        text = extract_email_text(file_path) or ""
        tokens = tokenize_for_spam(text)
        clean_text = " ".join(tokens).strip()
        if clean_text:
            rows.append((clean_text, label))

    df = pd.DataFrame(rows, columns=["clean_text", "label"])

    # Map labels to numeric targets
    df['target'] = df['label'].map({'ham': 0, 'spam': 1})
    df = df.dropna(subset=['clean_text', 'target'])
    df['target'] = df['target'].astype(int)

    print(df.head())
    print(df['label'].value_counts())

    df.to_csv(processed_emails_csv_path, index=False)
    print(f"Saved cleaned dataset -> {processed_emails_csv_path}")

else:
    df = pd.read_csv(processed_emails_csv_path)

print("\nEmail counts (numeric target):")
print(df['target'].value_counts())


Email counts (numeric target):
1    49783
0    25219
Name: target, dtype: int64


In [23]:
df.head()

Unnamed: 0,clean_text,label,target
0,generic ciali brand qualiti do you feel the pr...,spam,1
1,typo in debian/readm hi ve just updat from the...,ham,0
2,authent viagra authent viagra mega authent dis...,spam,1
3,nice talk with ya hey billi it was realli fun ...,spam,1
4,or trembl stomach cramp troubl in sleep weak loos,spam,1


In [24]:
# ---------- Step 2: Suspicious word features ----------

bundle_path = "dataset_bundle.pkl"
vectorizer_path = "vectorizer.pkl"

def vectorize_emails(train_texts, test_texts):
    # Fit only on training data
    vectorizer = TfidfVectorizer(
        ngram_range=(1,3),
        analyzer='word',
        min_df=2,
        max_df=0.9,
        sublinear_tf=True
    )
    X_train = vectorizer.fit_transform(train_texts.dropna().str.lower())
    joblib.dump(vectorizer, vectorizer_path, compress=("zlib", 3))
    X_test = vectorizer.transform(test_texts.dropna().str.lower())

    return X_train, X_test, vectorizer

if not os.path.exists(bundle_path) or not os.path.exists(vectorizer_path):
    df = df.dropna()
    train_texts, test_texts, y_train , y_test = train_test_split(
        df["clean_text"], df["target"],
        test_size=0.2,
        stratify=df["target"],
        random_state=42
    )

    X_train, X_test, vectorizer = vectorize_emails(train_texts, test_texts)
    y_train = y_train.to_numpy()
    y_test = y_test.to_numpy()

    joblib.dump(
        {
            "X_train": X_train,
            "y_train": y_train,
            "X_test": X_test,
            "y_test": y_test,
        },
        bundle_path,
        compress=("zlib", 3)
    )
else:
    bundle = joblib.load(bundle_path)
    X_train = bundle["X_train"]
    y_train = bundle["y_train"]
    X_test = bundle["X_test"]
    y_test = bundle["y_test"]
    vectorizer = joblib.load(vectorizer_path)

In [25]:
# ---------- Step 3: Balance only training data ----------

ham_idx = np.where(y_train == 0)[0]
spam_idx = np.where(y_train == 1)[0]

np.random.seed(RANDOM_STATE)
spam_downsampled_idx = np.random.choice(spam_idx, size=len(ham_idx), replace=False)

balanced_idx = np.concatenate([ham_idx, spam_downsampled_idx])

# Subset X_train and y_train
X_train_balanced = X_train[balanced_idx, :]
y_train_balanced = y_train[balanced_idx]

# Shuffle
shuffled_idx = np.random.permutation(len(y_train_balanced))
X_train_balanced = X_train_balanced[shuffled_idx, :]
y_train_balanced = y_train_balanced[shuffled_idx]

# Display counts
unique, counts = np.unique(y_train_balanced, return_counts=True)
print("Balanced training dataset counts:")
for u, c in zip(unique, counts):
    print(f"{u}: {c}")

Balanced training dataset counts:
0: 20175
1: 20175


In [26]:
svc_clf = LinearSVC(random_state=RANDOM_STATE, class_weight="balanced")
svc_clf.fit(X_train_balanced, y_train_balanced)
y_pred = svc_clf.predict(X_test)

print("SVC Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred, target_names=['ham','spam']))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))

SVC Accuracy: 0.9918005466302247

Classification report:
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99      5044
        spam       0.99      0.99      0.99      9957

    accuracy                           0.99     15001
   macro avg       0.99      0.99      0.99     15001
weighted avg       0.99      0.99      0.99     15001


Confusion matrix:
 [[4975   69]
 [  54 9903]]


In [28]:
nb_clf = MultinomialNB()
nb_clf.fit(X_train_balanced, y_train_balanced)
y_pred = nb_clf.predict(X_test)

print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred, target_names=['ham','spam']))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))

Naive Bayes Accuracy: 0.9626691553896407

Classification report:
               precision    recall  f1-score   support

         ham       0.90      1.00      0.95      5044
        spam       1.00      0.95      0.97      9957

    accuracy                           0.96     15001
   macro avg       0.95      0.97      0.96     15001
weighted avg       0.97      0.96      0.96     15001


Confusion matrix:
 [[5025   19]
 [ 541 9416]]


In [29]:
lr_clf = LogisticRegression(C=0.1, penalty="l2", max_iter=500, class_weight="balanced")
lr_clf.fit(X_train_balanced, y_train_balanced)
y_pred = lr_clf.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred, target_names=['ham','spam']))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))

Logistic Regression Accuracy: 0.9806679554696354

Classification report:
               precision    recall  f1-score   support

         ham       0.98      0.96      0.97      5044
        spam       0.98      0.99      0.99      9957

    accuracy                           0.98     15001
   macro avg       0.98      0.98      0.98     15001
weighted avg       0.98      0.98      0.98     15001


Confusion matrix:
 [[4836  208]
 [  82 9875]]


In [30]:
class SpamPipeline:
    def __init__(self, vectorizer, model):
        self.vectorizer = vectorizer
        self.model = model

    def preprocess(self, texts):
        # Apply your existing cleaning + tokenization
        return [" ".join(tokenize_for_spam(clean_text_for_spam(text))) for text in texts]

    def predict(self, texts):
        processed = self.preprocess(texts)
        X = self.vectorizer.transform(processed)
        return self.model.predict(X)

pipeline_nb = SpamPipeline(vectorizer, nb_clf)
joblib.dump(pipeline_nb, "spam_pipeline.pkl", compress=("zlib", 3))

['spam_pipeline.pkl']

In [31]:
# ---------- Custom examples ----------

spam_emails = [
    "Congratulations! You've won a $1000 gift card from Amazon! Click here to claim your reward before it expires.",
    "Earn money from home in just 24 hours with our guaranteed system. No skills required, sign up now and start earning today!",
    "Exclusive deal: Buy one, get two free on all luxury watches. Hurry, this offer is only available for the next 12 hours.",
    "Your bank account has been compromised! Unauthorized login detected. Verify your identity immediately by clicking this secure link.",
    "Get cheap prescription drugs without a prescription! Order online today and receive free overnight shipping.",
    "You have been selected to receive a free iPhone 15 Pro Max! Claim yours now before stocks run out.",
    "Work from home and make $5000 per week easily. Start your journey to financial freedom today!",
    "Act now! Your credit card reward points are about to expire. Redeem them today for cash or exciting gifts.",
    "Congratulations! You’ve been chosen as our lucky winner of a luxury Caribbean vacation. Click here to confirm your spot.",
    "Increase your Bitcoin balance instantly with our automated trading bot. Start with just $100 and watch your money grow.",
    "Dear user, your email storage is 98% full. Click this link to upgrade your account for free unlimited storage.",
    "Get rich quick! Our proven online course has helped thousands of people quit their jobs and earn six figures. Enroll today.",
    "Attention! Your PayPal account has been locked due to suspicious activity. Confirm your details immediately to restore access.",
    "FREE membership upgrade! Unlock premium features today and connect with thousands of singles in your area.",
    "Limited-time offer: Buy diet pills now and lose up to 20 pounds in one month without exercise!"
]

ham_emails = [
    "Meeting rescheduled to 3 PM tomorrow. Please confirm if the new time works for you.",
    "Can you review the attached report and provide your feedback by the end of the day?",
    "Happy birthday! Wishing you a wonderful day filled with joy, laughter, and lots of cake.",
    "Lunch plans for Friday? Thinking about trying the new Italian restaurant downtown.",
    "Project deadline extended to next Monday, giving us more time to finalize the presentation.",
    "Family reunion details attached. Let me know if you’ll be able to make it this year.",
    "Reminder: Doctor's appointment at 10 AM. Don’t forget to bring your previous medical records.",
    "The client has requested a few changes to the proposal. Let’s discuss them in tomorrow’s meeting.",
    "Hope you’re doing well! Just wanted to check if you’re free this weekend for a quick catch-up.",
    "Don’t forget to bring the printed copies of the slides for the team review meeting tomorrow.",
    "We need to finalize the venue for the upcoming workshop. Please share your suggestions.",
    "Attached are the photos from last weekend’s hiking trip. It was a great experience!",
    "Please find the travel itinerary attached for our upcoming conference in Mumbai.",
    "Reminder: The library books are due for return on Monday. Kindly renew them if needed.",
    "Thank you for your help with the project last week. I really appreciate your support!"
]

examples = spam_emails + ham_emails
true_labels = [1] * len(spam_emails) + [0] * len(ham_emails)

processed_examples = [" ".join(tokenize_for_spam(clean_text_for_spam(text))) for text in examples]

X_examples = vectorizer.transform(processed_examples)

# ---------- Define models ----------
models = {
    "SVC": svc_clf,
    # "Random Forest": rf_clf,
    "Naive Bayes": nb_clf,
    "Logistic Regression": lr_clf,
}

label_map = {0: "HAM", 1: "SPAM"}

# ---------- Predict & evaluate ----------
for name, model in models.items():
    preds = model.predict(X_examples)
    results = [label_map[p] for p in preds]

    print(f"\n--- Predictions using {name} ---")
    
    acc = accuracy_score(true_labels, preds)
    print(f"Example Accuracy {name}: {acc:.4f}")
    
    # 🔹 More detailed metrics
    print("Classification report:")
    print(classification_report(true_labels, preds, target_names=["HAM", "SPAM"]))
    
    # 🔹 Confusion matrix
    cm = confusion_matrix(true_labels, preds)
    print("Confusion matrix:")
    print(cm)


--- Predictions using SVC ---
Example Accuracy SVC: 0.5333
Classification report:
              precision    recall  f1-score   support

         HAM       1.00      0.07      0.12        15
        SPAM       0.52      1.00      0.68        15

    accuracy                           0.53        30
   macro avg       0.76      0.53      0.40        30
weighted avg       0.76      0.53      0.40        30

Confusion matrix:
[[ 1 14]
 [ 0 15]]

--- Predictions using Naive Bayes ---
Example Accuracy Naive Bayes: 0.9333
Classification report:
              precision    recall  f1-score   support

         HAM       1.00      0.87      0.93        15
        SPAM       0.88      1.00      0.94        15

    accuracy                           0.93        30
   macro avg       0.94      0.93      0.93        30
weighted avg       0.94      0.93      0.93        30

Confusion matrix:
[[13  2]
 [ 0 15]]

--- Predictions using Logistic Regression ---
Example Accuracy Logistic Regression: 0.500

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
'''
Naive Bayes seems to perform the best on these custom examples, 
achieving the highest accuracy and balanced precision/recall 
for both HAM and SPAM classes.
'''

'\nNaive Bayes seems to perform the best on these custom examples, \nachieving the highest accuracy and balanced precision/recall \nfor both HAM and SPAM classes.\n'