<a href="https://colab.research.google.com/github/ubuntumel/AI_Colab_Projects/blob/main/Spam_Email_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv
import math
import re
import shutil
from collections import Counter
from pathlib import Path

# Persistence setup.
# Inside Google Colab the dataset lives on the mounted Google Drive so it
# survives runtime resets; anywhere else a "colab_data" folder under the
# current working directory is used instead.
try:
    from google.colab import drive
except ImportError:
    # The Colab helper package is not importable: treat this as a local run.
    IN_COLAB = False
else:
    IN_COLAB = True

if not IN_COLAB:
    PERSIST_DIR = Path.cwd() / "colab_data"
else:
    # force_remount=False keeps an already-mounted Drive as it is.
    drive.mount('/content/drive', force_remount=False)
    PERSIST_DIR = Path('/content/drive/MyDrive/colab_data')

# Create the storage folder (and any missing parents); no error if present.
PERSIST_DIR.mkdir(parents=True, exist_ok=True)

CSV_FILENAME = "SpamDetection.csv"
# Canonical location of the dataset inside the persistent directory.
CSV_Path = PERSIST_DIR / CSV_FILENAME

# If the CSV is not in the persistent directory yet but a copy sits in the
# current working directory, seed the persistent directory from it.
local_candidate = Path.cwd() / CSV_FILENAME
if not CSV_Path.exists() and local_candidate.exists():
    try:
        # copy2 also carries over file metadata such as timestamps.
        shutil.copy2(local_candidate, CSV_Path)
        print(f"Copied local '{local_candidate}' -> '{CSV_Path}' for persistence.")
    except OSError as e:
        # Best effort only: every filesystem failure copy2 can report
        # (permissions, missing mount, shutil.SameFileError, ...) derives from
        # OSError. Anything else is a programming error and should surface
        # instead of being swallowed here.
        print(f"Could not copy local file to Drive: {e}")

# Load dataset
# Each usable CSV record becomes a (label, text) tuple: column 0 is the class
# label (lower-cased, stripped) and column 1 is the message text (stripped).
csv_path = CSV_Path  # Always use Drive

if not csv_path.exists():
    print(f"CSV file '{csv_path}' not found.")
    # exit() is a helper the `site` module injects for interactive sessions
    # and is not meant for programs; raising SystemExit is the portable way
    # to stop. A non-zero status marks the missing dataset as a failure.
    raise SystemExit(1)

rows = []
skipped_rows = 0       # records that had a label but no message column
first_record = True    # True until the first non-empty record has been seen

# encoding="utf-8-sig" reads plain UTF-8 and also strips a leading BOM (as
# written by some spreadsheet exports); with plain "utf-8" the BOM would be
# glued to the first label and that row would be mistaken for a header.
# newline="" is what the csv module requires so that it can handle line
# endings (including newlines inside quoted fields) itself.
with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
    for row in csv.reader(f):
        if not row:
            # Blank line in the file.
            continue
        label = row[0].strip().lower()
        if first_record:
            first_record = False
            if label not in {"ham", "spam"}:
                # The first real record does not start with a class label:
                # treat it as a header line and drop it.
                continue
        if len(row) < 2:
            # No message column; skip instead of failing with IndexError.
            skipped_rows += 1
            continue
        rows.append((label, row[1].strip()))

if skipped_rows:
    print(f"Skipped {skipped_rows} malformed row(s) without a message column.")

print(f"Total rows loaded: {len(rows)}")
if len(rows) < 30:
    print("Expected ~30 rows.")

# Train/test split
# The first 20 loaded messages are the training set; the 10 that follow
# (positions 21-30) are held out for testing. Slicing never raises, so a
# shorter dataset simply yields smaller (possibly empty) splits.
_TRAIN_END, _TEST_END = 20, 30
train = rows[:_TRAIN_END]
test = rows[_TRAIN_END:_TEST_END]

print(f"Train size: {len(train)}, Test size: {len(test)}")

# Tokenizer
# A token is a maximal run of ASCII letters, digits and apostrophes.
token_pattern = re.compile(r"[a-zA-Z0-9']+")


def tokenize(text):
    """Return the lower-cased tokens of *text* in order of appearance.

    The text is lower-cased first and then scanned with ``token_pattern``;
    every other character (spaces, punctuation, non-ASCII letters) acts as a
    separator and is dropped.
    """
    lowered = text.lower()
    return [match.group(0) for match in token_pattern.finditer(lowered)]

# Training statistics
#   word_count[c]   -> Counter of token occurrences over all messages of class c
#   doc_count[c]    -> number of training messages labelled c
#   total_tokens[c] -> total number of tokens seen in class c
word_count = {cls: Counter() for cls in ("ham", "spam")}
doc_count = Counter()
total_tokens = dict.fromkeys(("ham", "spam"), 0)

for label, text in train:
    doc_count.update([label])
    words = tokenize(text)
    word_count[label].update(words)
    total_tokens[label] += len(words)

# Vocabulary: every distinct token seen in either class. Its size V is the
# number of outcomes used by the smoothing term of the likelihood.
vocab = set().union(*word_count.values())
V = len(vocab)

# Class priors: the share of training messages carrying each label.
# With an empty training set both priors fall back to 0.0 instead of
# dividing by zero.
n_train = len(train)
if n_train:
    prior = {cls: doc_count[cls] / n_train for cls in ("ham", "spam")}
else:
    prior = {"ham": 0.0, "spam": 0.0}

print("\nPriors calculation")
print({cls: round(p, 4) for cls, p in prior.items()})

# Likelihood and Posterior
alpha = 1.0  # additive (Laplace) smoothing constant


def log_likelihood(tokens, label):
    """Return log P(tokens | label) under the smoothed word model.

    Each distinct token ``w`` that belongs to the training vocabulary
    contributes ``count(w) * log((word_count[label][w] + alpha) /
    (total_tokens[label] + alpha * V))``. Tokens that never appeared in
    training are ignored, so a message made only of unseen words (or an
    empty message) scores 0.0.
    """
    class_counts = word_count[label]
    smoothed_total = total_tokens[label] + alpha * V

    # Accumulated term by term, in Counter iteration order.
    score = 0.0
    known = ((w, n) for w, n in Counter(tokens).items() if w in vocab)
    for w, n in known:
        score += n * math.log((class_counts[w] + alpha) / smoothed_total)
    return score

# Posterior probabilities from log scores (Bayes' rule, normalised)
def log_posteriors(logp_ham, logp_spam):
    """Turn two unnormalised log scores into probabilities.

    Takes the log joint scores of the ham and spam classes and returns
    ``(p_ham, p_spam)``, each score's exponential divided by their sum.
    The larger score is subtracted from both before exponentiating, so the
    bigger term becomes exp(0) == 1 and very negative inputs cannot
    underflow both terms to zero.
    """
    shift = max(logp_ham, logp_spam)
    weight_ham, weight_spam = (
        math.exp(score - shift) for score in (logp_ham, logp_spam)
    )
    norm = weight_ham + weight_spam
    return weight_ham / norm, weight_spam / norm

# Evaluate on the test set.
# For every held-out message:
#   1. score it under each class with log_likelihood,
#   2. add the log prior of that class,
#   3. normalise the two scores into posterior probabilities,
#   4. predict the class whose posterior is larger (ties go to ham).
print("\nTest set:")
y_true, y_pred = [], []

for i, (label, text) in enumerate(test, start=1):
    toks = tokenize(text)
    ll_ham, ll_spam = (log_likelihood(toks, cls) for cls in ("ham", "spam"))

    # The tiny epsilon keeps math.log defined when a class has prior 0.
    lp_ham = ll_ham + math.log(prior["ham"] + 1e-15)
    lp_spam = ll_spam + math.log(prior["spam"] + 1e-15)

    p_ham, p_spam = log_posteriors(lp_ham, lp_spam)

    if p_spam > p_ham:
        pred = "spam"
    else:
        pred = "ham"

    y_true.append(label)
    y_pred.append(pred)

    # Per-message report, emitted as one block of lines.
    report = [
        f"\nTest #{i}",
        f"Sentence: {text}",
        f"log P(sentence|ham)  = {ll_ham:.3f}",
        f"log P(sentence|spam) = {ll_spam:.3f}",
        f"P(ham|sentence)  = {p_ham:.3f}",
        f"P(spam|sentence) = {p_spam:.3f}",
        f"Predicted class: {pred} | True: {label}",
    ]
    print("\n".join(report))

# Accuracy = correct predictions / number of test messages
# (reported as 0.0 when there are no test messages).
correct = sum(1 for truth, guess in zip(y_true, y_pred) if truth == guess)
acc = correct / len(y_true) if y_true else 0.0
print("\nAccuracy on Test Set")
print(f"{correct} / {len(y_true)} = {acc:.3f}")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Total rows loaded: 30
Train size: 20, Test size: 10

Priors calculation
{'ham': 0.55, 'spam': 0.45}

Test set:

Test #1
Sentence: Tell where you reached
log P(sentence|ham)  = -8.197
log P(sentence|spam) = -9.770
P(ham|sentence)  = 0.855
P(spam|sentence) = 0.145
Predicted class: ham | True: ham

Test #2
Sentence: Your gonna have to pick up a burger for yourself on your way home
log P(sentence|ham)  = -48.204
log P(sentence|spam) = -47.537
P(ham|sentence)  = 0.385
P(spam|sentence) = 0.615
Predicted class: spam | True: ham

Test #3
Sentence: As a valued customer I am pleased to advise you that for your recent review you are awarded a Bonus Prize
log P(sentence|ham)  = -69.965
log P(sentence|spam) = -70.084
P(ham|sentence)  = 0.579
P(spam|sentence) = 0.421
Predicted class: ham | True: spam

Test #4
Sentence: Urgent you are awarded a complimentary trip to EuroDis