In [1]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Sample email dataset (For demo purposes, a small sample is created here)
# In a real project, you would load a dataset like 'spam.csv' or 'spamham.csv'

data = {
    'email_text': [
        "Free money, win a lottery now!",
        "Hi, I wanted to know more about your project.",
        "Congratulations, you have won a free gift card!",
        "Hey, just checking in on our meeting tomorrow.",
        "Special promotion, get a free smartphone now!",
        "Reminder: Your meeting is scheduled for 2 PM today.",
        "Limited time offer, win a car by entering this contest!",
        "Please find the attached report for review."
    ],
    'label': ['spam', 'not_spam', 'spam', 'not_spam', 'spam', 'not_spam', 'spam', 'not_spam']
}

# Convert the dictionary to a DataFrame
df = pd.DataFrame(data)

# Data Cleaning: Remove punctuation, numbers, and convert to lowercase
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove numbers
    return text

df['cleaned_email'] = df['email_text'].apply(clean_text)

# Feature Extraction: Convert text to TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english')  # Removing common stopwords
X = vectorizer.fit_transform(df['cleaned_email'])

# Label Encoding: Convert labels to numerical values (spam = 1, not_spam = 0)
y = np.array([1 if label == 'spam' else 0 for label in df['label']])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")

# Function to predict if a new email is spam or not
def predict_spam(text):
    cleaned_text = clean_text(text)
    text_vectorized = vectorizer.transform([cleaned_text])
    prediction = model.predict(text_vectorized)
    return "Spam" if prediction == 1 else "Not Spam"

# Test with a new email
new_email = "Claim your free gift card now!"
print(f"The email is: {predict_spam(new_email)}")

new_email = "Let's meet at 2 PM to discuss the project."
print(f"The email is: {predict_spam(new_email)}")


Accuracy: 0.0000
Confusion Matrix:
[[0 2]
 [0 0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       2.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0

The email is: Spam
The email is: Spam


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
