In [None]:
# 📓 Spam Detection with Naive Bayes (Bag-of-Words)
# --- Full Jupyter Notebook Script ---

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# --- 1. Load the SMS Spam Collection Dataset ---
# Tab-separated file without a header row: the first column is read as the
# class label and the second as the raw message text.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(url, sep='\t', header=None, names=['label', 'text'])

# --- 2. Encode Labels ---
# Map labels to 0/1 (ham = 0, spam = 1)
label_codes = {'ham': 0, 'spam': 1}

# Series.map() silently turns every value missing from the dict into NaN, so
# check first and fail with a clear message instead of letting NaN labels
# reach the train/test split and the classifier.
unexpected_labels = set(df['label'].unique()) - set(label_codes)
if unexpected_labels:
    raise ValueError(
        f"Unexpected label values in dataset: {sorted(unexpected_labels, key=str)}"
    )

df['label'] = df['label'].map(label_codes)

In [None]:
# --- 3. Explore the Data ---
# Show the heading and the first rows of the frame in a single print call.
print("First few records:", df.head(), sep="\n")

# Number of messages per class label
label_counts = df['label'].value_counts()
print("\nLabel distribution:", label_counts, sep="\n")

In [None]:
# --- 4. Train/Test Split ---
# Hold out 30% of the messages for evaluation. random_state pins the shuffle so
# reruns produce the same split. stratify keeps the class proportions of the
# label column (approximately) equal in the train and test parts, which matters
# when one class is much rarer than the other.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.3,
    random_state=42,
    stratify=df['label'],
)

In [None]:
# --- 5. Bag of Words ---
vectorizer = CountVectorizer()
# Exercise (UNDERFITTING): uncomment the weak vectorizer below, which limits the
# vocabulary to the top 5 most frequent words.
# vectorizer = CountVectorizer(max_features=5)

# The tuple is evaluated left to right: the vocabulary is fitted on the training
# messages first, then the test messages are encoded with that same vocabulary.
X_train_counts, X_test_counts = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [None]:
# --- 6. Train Classifier (Multinomial Naive Bayes) ---
# Fit the model on the training count matrix and the matching 0/1 labels.
clf = MultinomialNB()
clf.fit(X=X_train_counts, y=y_train)

In [None]:
# --- 7. Predictions and Evaluation ---
y_pred = clf.predict(X_test_counts)

# Pin the class order (0 = ham, 1 = spam) so the matrix layout and the report
# rows do not depend on which labels happen to occur in y_test / y_pred, and
# give the report rows readable names instead of bare 0/1.
class_ids = [0, 1]
class_names = ['ham', 'spam']

print("=== Confusion Matrix ===")
# Rows are the actual classes, columns the predicted ones, both in class_ids order.
print(confusion_matrix(y_test, y_pred, labels=class_ids))
print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

In [None]:
# --- 8. Confusion Matrix Visualization ---
cm = confusion_matrix(y_test, y_pred)

# Same tick labels on both axes: rows are actual classes, columns predicted.
class_ticks = ['Ham', 'Spam']
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_ticks,
    yticklabels=class_ticks,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
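# Try the following experiments:
# 1) Train on only 10% of the data (OVERFIT), e.g. set test_size=0.9 in the train/test split
# 2) Use a weak vectorizer (UNDERFIT), e.g. CountVectorizer(max_features=5)

# Question: why is the HAM example from the custom test cases below always correctly classified?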

In [None]:
# --- 9. Custom Test Cases ---
# Keep exactly one test_msg assignment active; the trailing comment on each
# line gives the expected label.
#test_msg = ["Congratulations! You've won a ticket to Bahamas. Call now!"] # 1 = SPAM
#test_msg = ["Congratulations. You won a ticket to Bahamas. Answer now"] # 1 = SPAM
test_msg = ["Hey, I'm picking you up at 6. Don't forget your notebook"] # 0 = HAM

# Encode the message with the already-fitted vectorizer, then classify it.
X_test_msg = vectorizer.transform(test_msg)
predicted_label = clf.predict(X_test_msg)[0]

print(f"Predicted label: {predicted_label}")