In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix


In [2]:
# Create synthetic email data
emails = [
    ("Free lottery winner! Claim now!", "spam"),
    ("Meeting at 3pm tomorrow", "ham"),
    ("Get rich quick! Buy now!", "spam"),
    ("Project deadline reminder", "ham"),
    ("You've won $1000000!!!", "spam"),
    ("Can we reschedule to Tuesday?", "ham"),
    ("Viagra for sale cheap", "spam"),
    ("Budget report for Q3", "ham"),
    ("Double your money fast!", "spam"),
    ("Team lunch next week", "ham")
]

In [3]:
# Convert to DataFrame
df = pd.DataFrame(emails, columns=['text', 'label'])
print("Dataset Preview:")
print(df.head())

Dataset Preview:
                              text label
0  Free lottery winner! Claim now!  spam
1          Meeting at 3pm tomorrow   ham
2         Get rich quick! Buy now!  spam
3        Project deadline reminder   ham
4           You've won $1000000!!!  spam


In [4]:
# Convert text to numerical features using CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])  # Convert text to word count matrix

# Split into features (X) and target (y)
y = df['label']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("\nFeature names (words):", vectorizer.get_feature_names_out())


Feature names (words): ['1000000' '3pm' 'budget' 'buy' 'cheap' 'claim' 'deadline' 'double' 'fast'
 'free' 'lottery' 'lunch' 'meeting' 'money' 'project' 'q3' 'quick'
 'reminder' 'report' 'reschedule' 'rich' 'sale' 'team' 'tomorrow'
 'tuesday' 've' 'viagra' 'week' 'winner' 'won']


In [5]:
# Initialize and train the Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Make predictions
y_pred = nb_classifier.predict(X_test)

# Print performance metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

         ham       0.50      1.00      0.67         1
        spam       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
def classify_email(text):
    # Convert text to the same format used during training
    text_vectorized = vectorizer.transform([text])
    
    # Get prediction
    prediction = nb_classifier.predict(text_vectorized)[0]
    
    # Get probability scores
    proba = nb_classifier.predict_proba(text_vectorized)[0]
    
    print(f"\nEmail Text: {text}")
    print(f"Classification: {prediction}")
    print(f"Probability of spam: {proba[1]:.2%}")
    print(f"Probability of ham: {proba[0]:.2%}")

# Test with new emails
test_emails = [
    "Congratulations! You've won an iPhone!",
    "Please review the attached project proposal",
    "There is a new scheme in market",
    "win $1000000 cash"
]

for email in test_emails:
    classify_email(email)


Email Text: Congratulations! You've won an iPhone!
Classification: spam
Probability of spam: 78.43%
Probability of ham: 21.57%

Email Text: Please review the attached project proposal
Classification: ham
Probability of spam: 32.28%
Probability of ham: 67.72%

Email Text: There is a new scheme in market
Classification: ham
Probability of spam: 50.00%
Probability of ham: 50.00%

Email Text: win $1000000 cash
Classification: spam
Probability of spam: 65.60%
Probability of ham: 34.40%
