In [1]:
# Install any missing libs (only needed if not already available)
!pip install beautifulsoup4

# Imports
import re, math, time, imaplib, email
import pandas as pd
from email.header import decode_header
from bs4 import BeautifulSoup




In [2]:
# --- Preprocess function ---
def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    return text.split()

# --- Train Naive Bayes ---
def train_naive_bayes(dataset):
    spam_words, ham_words = {}, {}
    spam_count, ham_count = 0, 0

    for label, words in dataset:
        if label == "spam":
            spam_count += 1
            for w in words:
                spam_words[w] = spam_words.get(w, 0) + 1
        else:
            ham_count += 1
            for w in words:
                ham_words[w] = ham_words.get(w, 0) + 1

    p_spam = spam_count / len(dataset)
    p_ham = ham_count / len(dataset)

    vocab = set(list(spam_words.keys()) + list(ham_words.keys()))
    word_probs = {}
    for w in vocab:
        word_probs[w] = {
            "spam": (spam_words.get(w, 0) + 1) / (sum(spam_words.values()) + len(vocab)),
            "ham":  (ham_words.get(w, 0) + 1) / (sum(ham_words.values()) + len(vocab))
        }
    return p_spam, p_ham, word_probs

# --- Prediction ---
def predict(text, p_spam, p_ham, word_probs):
    words = preprocess(text)
    log_spam, log_ham = math.log(p_spam), math.log(p_ham)
    for w in words:
        if w in word_probs:
            log_spam += math.log(word_probs[w]["spam"])
            log_ham  += math.log(word_probs[w]["ham"])
    return "SPAM" if log_spam > log_ham else "HAM"

# --- Load dataset & train ---
df = pd.read_csv("/content/spam.csv", encoding='latin-1')
df = df[['v1', 'v2']]
df.columns = ['label', 'message']
dataset = [(row['label'], preprocess(row['message'])) for _, row in df.iterrows()]

p_spam, p_ham, word_probs = train_naive_bayes(dataset)
print("✅ Model trained successfully")


✅ Model trained successfully


In [3]:
# --- Connect to Gmail ---
def connect_gmail(user, password, inbox="inbox"):
    mail = imaplib.IMAP4_SSL("imap.gmail.com")
    mail.login(user, password)
    mail.select(inbox)
    return mail

# --- Decode subject safely ---
def decode_subject(msg):
    subject, encoding = decode_header(msg["Subject"])[0]
    if isinstance(subject, bytes):
        subject = subject.decode(encoding if encoding else "utf-8")
    return subject

# --- Extract plain text body ---
def extract_body(msg):
    body = ""
    if msg.is_multipart():
        for part in msg.walk():
            if part.get_content_type() == "text/plain":
                body = part.get_payload(decode=True).decode(errors="ignore")
                break
        else:  # fallback to HTML if no plain text found
            for part in msg.walk():
                if part.get_content_type() == "text/html":
                    html = part.get_payload(decode=True).decode(errors="ignore")
                    body = BeautifulSoup(html, "html.parser").get_text(" ")
                    break
    else:
        if msg.get_content_type() == "text/html":
            html = msg.get_payload(decode=True).decode(errors="ignore")
            body = BeautifulSoup(html, "html.parser").get_text(" ")
        else:
            body = msg.get_payload(decode=True).decode(errors="ignore")
    return body


In [4]:
# ⚠️ SECURITY NOTE: In Colab, better to use input() so your password isn't visible
# EMAIL_USER = input("Enter your Gmail: ")
# EMAIL_PASS = input("Enter your Gmail App Password: ")
EMAIL_USER = "Enter your Gmail ID Here"
EMAIL_PASS = "Enter Your App Password Here"

mail = connect_gmail(EMAIL_USER, EMAIL_PASS)
print("✅ Gmail connected")


✅ Gmail connected


**this will run for Infinite so I will change this code**

In [5]:
print("📩 Spam filter started. Checking for new mails...\n")

stop = 1

while stop != 0: # true - for infinite run
    status, messages = mail.search(None, "UNSEEN")
    ids = messages[0].split()

    if not ids:
        print("✅ No new mails. Waiting...")
    else:
        for msg_id in ids:
            res, msg_data = mail.fetch(msg_id, "(RFC822)")
            for response_part in msg_data:
                if isinstance(response_part, tuple):
                    msg = email.message_from_bytes(response_part[1])

                    subject = decode_subject(msg)
                    body = extract_body(msg)

                    mail_text = subject + " " + body
                    result = predict(mail_text, p_spam, p_ham, word_probs)

                    print(f"📌 Subject: {subject}")
                    print(f"📌 Classified as: {result}")
                    print("-----")

    # time.sleep(60)  # check every 60 seconds
    stop -= 1

📩 Spam filter started. Checking for new mails...

📌 Subject: 🏁 Finish signing up
📌 Classified as: SPAM
-----
📌 Subject: Now shipped: Cursor CLI
📌 Classified as: SPAM
-----
📌 Subject: ✍️ Write better everywhere!
📌 Classified as: SPAM
-----
📌 Subject: Refer friends to Warp, get rewards 🎁
📌 Classified as: SPAM
-----
📌 Subject: 🚀 Take control of your writing — on the go
📌 Classified as: SPAM
-----
📌 Subject: 🥇 Achieve more with Premium
📌 Classified as: SPAM
-----


In [6]:
# --- Hard-coded accuracy function ---
def calculate_accuracy_hardcoded(dataset, p_spam, p_ham, word_probs):
    """
    dataset: list of tuples (label, preprocessed_words)
    p_spam, p_ham, word_probs: trained Naive Bayes parameters
    """
    correct = 0
    total = 0

    for data in dataset:
        label = data[0].lower()  # 'spam' or 'ham'
        words = data[1]          # list of preprocessed words
        total += 1

        # Join words to form a string for prediction
        text = " ".join(words)
        prediction = predict(text, p_spam, p_ham, word_probs)
        pred_label = "spam" if prediction == "SPAM" else "ham"

        if pred_label == label:
            correct += 1

    # Calculate accuracy
    accuracy = correct / total
    return accuracy

# --- Example usage ---
accuracy = calculate_accuracy_hardcoded(dataset, p_spam, p_ham, word_probs)
print("✅ Model Accuracy: {:.2f}%".format(accuracy * 100))


✅ Model Accuracy: 99.14%


In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(y_true, y_pred):
    """
    Prints accuracy, precision, recall, and F1-score for the model.

    Parameters:
    y_true (array-like): True labels
    y_pred (array-like): Predicted labels
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='binary', pos_label="spam")
    recall = recall_score(y_true, y_pred, average='binary', pos_label="spam")
    f1 = f1_score(y_true, y_pred, average='binary', pos_label="spam")

    print("📊 Model Performance Metrics")
    print(f"Accuracy : {accuracy:.2%}")
    print(f"Precision: {precision:.2%}")
    print(f"Recall   : {recall:.2%}")
    print(f"F1-score : {f1:.2%}")

    return accuracy, precision, recall, f1




In [10]:
# Suppose y_test = true labels, y_pred = model predictions
acc, prec, rec, f1 = evaluate_model(y_test, y_pred)


NameError: name 'y_test' is not defined

In [11]:
# Prepare data for evaluation
y_true = [label.lower() for label, _ in dataset]  # Extract true labels
y_pred = [predict(" ".join(words), p_spam, p_ham, word_probs).lower() for _, words in dataset] # Generate predictions

# Evaluate the model using the evaluate_model function
acc, prec, rec, f1 = evaluate_model(y_true, y_pred)

📊 Model Performance Metrics
Accuracy : 99.14%
Precision: 96.79%
Recall   : 96.79%
F1-score : 96.79%
