# CLASSIFY EMAIL USING NAVIE BAYES ALGORITHM

In [5]:
#import the needed libraries

In [7]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
import joblib
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [9]:
# Download stopwords if not already done
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Samsung\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Sample email data
emails = [
    "Congratulations, you won a free ticket to Bahamas!",  # Spam
    "Hello, can we reschedule the meeting for tomorrow?",  # Not Spam
    "Exclusive deal just for you, claim your free iPhone now!",  # Spam
    "Dear team, please find the report attached.",  # Not Spam
    "Win a $1,000 gift card by clicking this link!",  # Spam
    "Reminder: Your appointment is scheduled for Monday at 3 PM.",  # Not Spam
    "Limited time offer! Get 50% off on all products.",  # Spam
    "Hi Mom, I’ll call you later tonight.",  # Not Spam
    "Claim your lottery prize now before it expires!",  # Spam
    "Your account balance is low. Please deposit funds.",  # Not Spam
    "Special promotion just for you: Buy one, get one free!",  # Spam
    "Don’t forget to submit the assignment before Friday.",  # Not Spam
    "Urgent! Update your payment information to avoid suspension.",  # Spam
    "Hi John, thanks for sending over the documents.",  # Not Spam
    "You’ve been selected for a cash reward. Claim here!",  # Spam
]


In [13]:
# Labels (1 = Spam, 0 = Not Spam)
labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]

In [15]:
# Preprocess function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Stemming
    stemmer = PorterStemmer()
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    return text

In [17]:
# Preprocess all emails
emails = [preprocess_text(email) for email in emails]

In [19]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(emails, labels, test_size=0.3, random_state=42)


In [23]:
# Create a pipeline with TF-IDF vectorizer and Naive Bayes model
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=500,ngram_range=(1,2))),
    ('model', MultinomialNB(alpha=0.1))
])

In [25]:
# Train the model
pipeline.fit(X_train, y_train)

In [27]:
# Cross-validation
cv_scores = cross_val_score(pipeline, emails, labels, cv=5)
print(f"Cross-Validation Accuracy: {cv_scores.mean():.2f}")


Cross-Validation Accuracy: 0.87


In [29]:
# Test the model
y_pred = pipeline.predict(X_test)


In [31]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Test Accuracy: 0.60

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         4
           1       0.33      1.00      0.50         1

    accuracy                           0.60         5
   macro avg       0.67      0.75      0.58         5
weighted avg       0.87      0.60      0.63         5



In [33]:
# Save the model
joblib.dump(pipeline, 'spam_classifier_model.pkl')
print("\nModel saved as 'spam_classifier_model.pkl'")


Model saved as 'spam_classifier_model.pkl'


In [37]:
# Test on new data
test_email = ["Hurry up! Limited time offer on your favorite products. Click here to grab the deal."]
test_email_preprocessed = [preprocess_text(email) for email in test_email]
prediction = pipeline.predict(test_email_preprocessed)
print("\nPrediction for test email (0 = Not Spam, 1 = Spam):", prediction[0])



Prediction for test email (0 = Not Spam, 1 = Spam): 1
