In [1]:
# Spam Email Detection using scikit-learn

# Step 1: Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 2: Sample Dataset
# Toy corpus written as (email text, label) records so that each message and
# its class stay side by side when the list is edited.
labeled_emails = [
    ("Congratulations! You won a free ticket", "spam"),
    ("Hey, are we meeting today?", "ham"),
    ("Claim your free prize now", "spam"),
    ("Please find attached the report", "ham"),
    ("Earn money fast with this simple trick", "spam"),
    ("Let's have lunch tomorrow", "ham"),
    ("Exclusive offer just for you", "spam"),
    ("Reminder: your appointment is tomorrow", "ham"),
    ("You have been selected for a reward", "spam"),
    ("Can we discuss the project update?", "ham"),
]

# Column-oriented view of the same records; the keys become the DataFrame
# column names.
data = {
    "email": [text for text, tag in labeled_emails],
    "label": [tag for text, tag in labeled_emails],
}

df = pd.DataFrame(data)

# Step 3: Split Dataset into Train/Test
# Features are the raw email texts; targets are the spam/ham labels.
X, y = df['email'], df['label']

# Hold out 30% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
# NOTE(review): no `stratify` argument is passed, so the class mix of the test
# set is left to the shuffle; confirm that is intended for so few rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Step 4: Text Vectorization
# Two-stage featurization: raw token counts, then TF-IDF re-weighting.
vectorizer = CountVectorizer()
tfidf_transformer = TfidfTransformer()

# Training path: both stages are fitted here, and only on the training texts.
X_train_counts = vectorizer.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Test path: reuse the already-fitted stages with transform() only, so nothing
# is re-fitted on the held-out emails.
X_test_counts = vectorizer.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Step 5: Train Naive Bayes Classifier
# fit() returns the estimator itself, so construction and training can be
# written as one expression.
clf = MultinomialNB().fit(X_train_tfidf, y_train)

# Step 6: Make Predictions
# Predict a label for every held-out email from its TF-IDF features.
y_pred = clf.predict(X_test_tfidf)

# Step 7: Evaluate Model
# Score the held-out predictions three ways: overall accuracy, a confusion
# matrix, and a per-class precision/recall/F1 report.
accuracy = accuracy_score(y_test, y_pred)

# Pin the confusion-matrix axes to the classifier's own class order. Without
# `labels`, scikit-learn derives the axes from whichever labels happen to occur
# in y_test/y_pred, so a small test split that is missing one class would
# silently yield a smaller matrix whose rows/columns no longer mean ham/spam.
conf_matrix = confusion_matrix(y_test, y_pred, labels=clf.classes_)

# zero_division=0 reports 0.0 for an undefined precision/recall (a class with
# no predicted or no true samples) instead of emitting UndefinedMetricWarning,
# which is easy to trigger on a split this small.
report = classification_report(y_test, y_pred, zero_division=0)

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)

# Step 8: Test Custom Emails
# Two unseen messages used as a quick sanity check of the trained model.
test_emails = ["Win a free vacation now", "Can you send me the meeting agenda?"]

# Push the new texts through the same fitted stages used for the test split:
# token counts -> TF-IDF weights -> Naive Bayes prediction.
test_counts = vectorizer.transform(test_emails)
test_tfidf = tfidf_transformer.transform(test_counts)
predictions = clf.predict(test_tfidf)

# Report each message next to the label the classifier assigned to it.
for email, pred in zip(test_emails, predictions):
    print(f"Email: '{email}' --> Prediction: {pred}")


Accuracy: 1.0

Confusion Matrix:
 [[2 0]
 [0 1]]

Classification Report:
               precision    recall  f1-score   support

         ham       1.00      1.00      1.00         2
        spam       1.00      1.00      1.00         1

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

Email: 'Win a free vacation now' --> Prediction: spam
Email: 'Can you send me the meeting agenda?' --> Prediction: ham
