In [1]:
import os
import re
import nltk
import string
import joblib

from PyPDF2 import PdfReader
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Seed value intended to make stochastic steps reproducible.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
RANDOM_STATE = 42

# Ensure the NLTK resources used by this notebook are available locally.
# `punkt` serves older NLTK releases; `punkt_tab` is what nltk.word_tokenize
# loads on NLTK >= 3.9, where a missing `punkt_tab` raises LookupError.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download("wordnet", quiet=True)

# CSV locations, relative to the current working directory.
# NOTE(review): neither path is read by the cells visible here.
processed_emails_csv_path = os.path.join(".", "processed_emails_cleaned.csv")
suspicious_features_csv_path = os.path.join(".", "suspicious_features_dataset.csv")

In [2]:
# English stop words from the NLTK corpus, stored as a set.
# NOTE(review): no function visible in this notebook reads stopwords_set — confirm it is still needed.
stopwords_set = set(stopwords.words("english"))
# Shared English Snowball stemmer; tokenize_for_spam uses it when use_stem is true.
stemmer = SnowballStemmer("english")
# Shared WordNet lemmatizer; tokenize_for_spam uses it when use_stem is false.
lemmatizer = WordNetLemmatizer()

def clean_text_for_spam(text):
    """Collapse whitespace in ``text`` and trim both ends.

    Every run of whitespace (spaces, tabs, line breaks) becomes a single
    space. Nothing else is removed, so URLs, digits and punctuation are
    passed through unchanged.

    Args:
        text (str | None): Raw text. ``None`` is treated as empty text so
            that a missing body does not crash the pipeline before the
            tokenizer's own empty-input guard is reached.

    Returns:
        str: The whitespace-normalized text, or ``""`` for ``None``.
    """
    if text is None:
        return ""
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def tokenize_for_spam(text, use_stem=True):
    """Turn ``text`` into a list of normalized tokens.

    Steps: lowercase the text, split it with ``nltk.word_tokenize``, strip
    leading/trailing ASCII punctuation from every token, drop tokens that
    are at most one character long afterwards, then reduce each remaining
    token with the Snowball stemmer (``use_stem=True``) or the WordNet
    lemmatizer (``use_stem=False``). No stop-word filtering is applied.

    Args:
        text (str): Text to tokenize. Any falsy value yields ``[]``.
        use_stem (bool): Stem when true, lemmatize when false.

    Returns:
        list[str]: Normalized tokens in their original order.
    """
    if not text:
        return []

    # Trim punctuation from the edges only; characters inside a token stay.
    trimmed = (
        raw.strip(string.punctuation)
        for raw in nltk.word_tokenize(text.lower())
    )

    # Tokens of length 0 or 1 (bare punctuation, single letters/digits) are dropped.
    return [
        stemmer.stem(word) if use_stem else lemmatizer.lemmatize(word)
        for word in trimmed
        if len(word) > 1
    ]

In [3]:
# File name of the persisted spam pipeline; resolved against the current working directory.
spam_pipeline_path = "spam_pipeline.pkl"

class SpamPipeline:
    """Pair a vectorizer and a classifier with this notebook's text preprocessing.

    The vectorizer must expose ``transform`` and the model must expose
    ``predict``; both are stored as given and used unchanged.
    """

    def __init__(self, vectorizer, model):
        """Store the vectorizer and the model on the instance."""
        self.vectorizer, self.model = vectorizer, model

    def preprocess(self, texts):
        """Clean and tokenize each text, returning one space-joined token string per input."""
        joined = []
        for raw in texts:
            tokens = tokenize_for_spam(clean_text_for_spam(raw))
            joined.append(" ".join(tokens))
        return joined

    def predict(self, texts):
        """Preprocess ``texts``, vectorize them, and return the model's predictions."""
        return self.model.predict(self.vectorizer.transform(self.preprocess(texts)))

# Load the persisted pipeline object from spam_pipeline_path.
# NOTE(review): joblib.load is pickle-based and can run arbitrary code while loading,
# so only point it at a file from a trusted source.
# Presumably the file holds a SpamPipeline instance, which would require the class
# to be defined before this line runs — verify against the notebook that saved it.
pipeline_nb = joblib.load(spam_pipeline_path)

In [4]:
# ---------- Custom examples ----------

# Hand-written spam examples used as a small sanity check for the loaded pipeline.
# The apostrophe in the "Caribbean vacation" entry was stored as the mojibake
# sequence for U+2019 (UTF-8 bytes decoded as cp1252); it is restored here so the
# classifier is not fed encoding debris.
spam_emails = [
    "Congratulations! You've won a $1000 gift card from Amazon! Click here to claim your reward before it expires.",
    "Earn money from home in just 24 hours with our guaranteed system. No skills required, sign up now and start earning today!",
    "Exclusive deal: Buy one, get two free on all luxury watches. Hurry, this offer is only available for the next 12 hours.",
    "Your bank account has been compromised! Unauthorized login detected. Verify your identity immediately by clicking this secure link.",
    "Get cheap prescription drugs without a prescription! Order online today and receive free overnight shipping.",
    "You have been selected to receive a free iPhone 15 Pro Max! Claim yours now before stocks run out.",
    "Work from home and make $5000 per week easily. Start your journey to financial freedom today!",
    "Act now! Your credit card reward points are about to expire. Redeem them today for cash or exciting gifts.",
    "Congratulations! You’ve been chosen as our lucky winner of a luxury Caribbean vacation. Click here to confirm your spot.",
    "Increase your Bitcoin balance instantly with our automated trading bot. Start with just $100 and watch your money grow.",
    "Dear user, your email storage is 98% full. Click this link to upgrade your account for free unlimited storage.",
    "Get rich quick! Our proven online course has helped thousands of people quit their jobs and earn six figures. Enroll today.",
    "Attention! Your PayPal account has been locked due to suspicious activity. Confirm your details immediately to restore access.",
    "FREE membership upgrade! Unlock premium features today and connect with thousands of singles in your area.",
    "Limited-time offer: Buy diet pills now and lose up to 20 pounds in one month without exercise!"
]

# Hand-written legitimate ("ham") examples used as a small sanity check.
# Several entries had their typographic apostrophe (U+2019) stored as mojibake
# (UTF-8 bytes decoded as cp1252); the character is restored here so the
# classifier sees the intended words instead of encoding debris.
ham_emails = [
    "Meeting rescheduled to 3 PM tomorrow. Please confirm if the new time works for you.",
    "Can you review the attached report and provide your feedback by the end of the day?",
    "Happy birthday! Wishing you a wonderful day filled with joy, laughter, and lots of cake.",
    "Lunch plans for Friday? Thinking about trying the new Italian restaurant downtown.",
    "Project deadline extended to next Monday, giving us more time to finalize the presentation.",
    "Family reunion details attached. Let me know if you’ll be able to make it this year.",
    "Reminder: Doctor's appointment at 10 AM. Don’t forget to bring your previous medical records.",
    "The client has requested a few changes to the proposal. Let’s discuss them in tomorrow’s meeting.",
    "Hope you’re doing well! Just wanted to check if you’re free this weekend for a quick catch-up.",
    "Don’t forget to bring the printed copies of the slides for the team review meeting tomorrow.",
    "We need to finalize the venue for the upcoming workshop. Please share your suggestions.",
    "Attached are the photos from last weekend’s hiking trip. It was a great experience!",
    "Please find the travel itinerary attached for our upcoming conference in Mumbai.",
    "Reminder: The library books are due for return on Monday. Kindly renew them if needed.",
    "Thank you for your help with the project last week. I really appreciate your support!"
]

# Spam examples first, then ham, so the label list below lines up index-for-index.
examples = spam_emails + ham_emails
# Ground truth: 1 for every spam example, 0 for every ham example.
true_labels = [1] * len(spam_emails) + [0] * len(ham_emails)

# The pipeline receives the raw strings; it applies its own preprocessing inside predict().
preds = pipeline_nb.predict(examples)
label_map = {0: "HAM", 1: "SPAM"}

# Human-readable label for each prediction (not used again in this cell).
results = [label_map[p] for p in preds]

acc = accuracy_score(true_labels, preds)
print(f"Example Accuracy Naive Bayes {acc:.4f}")
    
# More detailed metrics: per-class precision, recall and F1.
# target_names is given in label order, i.e. 0 -> "HAM", 1 -> "SPAM".
print("Classification report:")
print(classification_report(true_labels, preds, target_names=["HAM", "SPAM"]))

# Confusion matrix of true labels versus predicted labels.
cm = confusion_matrix(true_labels, preds)
print("Confusion matrix:")
print(cm)

Example Accuracy Naive Bayes 0.9333
Classification report:
              precision    recall  f1-score   support

         HAM       1.00      0.87      0.93        15
        SPAM       0.88      1.00      0.94        15

    accuracy                           0.93        30
   macro avg       0.94      0.93      0.93        30
weighted avg       0.94      0.93      0.93        30

Confusion matrix:
[[13  2]
 [ 0 15]]


In [5]:
def predict_pdf(path, label_map={0:"HAM", 1:"SPAM"}, max_pages=None):
    """
    Extract text from a PDF and classify it as spam or ham with ``pipeline_nb``.

    Text is collected from every page (or the first ``max_pages`` pages),
    and the raw text is handed to ``pipeline_nb.predict``, which performs
    its own preprocessing. The preprocessed form is computed separately
    only for the emptiness check, the length and the preview.

    Args:
        path (str): Path to PDF file.
        label_map (dict): Mapping from numeric labels to strings. It is
            only read here, never modified.
        max_pages (int | None): Maximum number of leading pages to read.
            ``None`` (default) reads the whole document; pass ``1`` to
            classify on the first page only.

    Returns:
        dict: ``{"error": "No valid text extracted from PDF"}`` when
        preprocessing leaves no tokens; otherwise a dict with
        ``prediction`` (int), ``label`` (looked up in ``label_map``),
        ``text_length`` (length of the preprocessed text) and ``preview``
        (first 200 characters of the preprocessed text).

    Raises:
        ValueError: If the PDF cannot be opened or its text extracted.
    """
    try:
        reader = PdfReader(path)
        page_texts = []
        for page_index, page in enumerate(reader.pages):
            if max_pages is not None and page_index >= max_pages:
                break
            # extract_text() can come back empty/None (e.g. image-only pages).
            page_texts.append(page.extract_text() or "")
    except Exception as e:
        raise ValueError(f"Failed to read PDF: {e}") from e

    # Newline separator keeps the last word of one page from fusing with
    # the first word of the next.
    text = "\n".join(page_texts)

    # Preprocessed view, used for validation and reporting only.
    processed = " ".join(pipeline_nb.preprocess([text])[0].split())

    if not processed.strip():
        return {"error": "No valid text extracted from PDF"}

    # Predict from the RAW text: predict() preprocesses internally, so
    # passing `processed` would tokenize and stem the text a second time.
    pred = int(pipeline_nb.predict([text])[0])  # wrap text in a list for single sample

    return {
        "prediction": pred,
        "label": label_map[pred],
        "text_length": len(processed),
        "preview": processed[:200] + ("..." if len(processed) > 200 else "")
    }

In [6]:
# Classify a sample PDF; the relative path is resolved against the current working directory.
predict_pdf("test.pdf")

{'prediction': 1,
 'label': 'SPAM',
 'text_length': 1507,
 'preview': 'congratul you ve won 1000 gift card from amazon click here to claim your reward befor it expir earn money from home in just 24 hour with our guarante system no skill requir sign up now and start earn ...'}