In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import re

In [2]:
# Fetch the NLTK resources this notebook downloads up front.
NLTK_PACKAGES = ('punkt_tab', 'stopwords')
download_status = [nltk.download(package) for package in NLTK_PACKAGES]
# Echo the status of the last download as the cell's displayed result.
download_status[-1]

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\tekut\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tekut\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
# Preprocessing function
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, loading the corpus only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Normalise a raw message into a space-separated string of content tokens.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is deleted, the remainder is tokenized with ``word_tokenize``,
    English stopwords are dropped, and the surviving tokens are re-joined
    with single spaces.

    Parameters
    ----------
    text : str
        Raw message. ``None`` and float NaN (e.g. an empty CSV cell) are
        treated as an empty message; any other non-string value is converted
        with ``str()`` before processing.

    Returns
    -------
    str
        The cleaned message; an empty string when nothing survives.
    """
    # Missing values would otherwise fail on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits (deleted outright, not replaced by a space)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords; the set is cached so it is not rebuilt for every message
    stop_words = _english_stop_words()
    tokens = [token for token in tokens if token not in stop_words]
    # Join tokens back to string
    return ' '.join(tokens)


In [6]:
# Read the labelled messages and derive the two modelling columns in one chain:
#   Processed_Message - the cleaned text produced by preprocess_text
#   Spam              - 1 when Category is exactly 'spam', otherwise 0
data = pd.read_csv('spam.csv').assign(
    Processed_Message=lambda frame: frame['Message'].apply(preprocess_text),
    Spam=lambda frame: (frame['Category'] == 'spam').astype('int64'),
)


In [7]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
features = data['Processed_Message']
labels = data['Spam']
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)

In [8]:
# Two-stage text classifier: TF-IDF features (capped at 5000 terms) feeding
# a multinomial Naive Bayes model.
tfidf_step = ('vectorizer', TfidfVectorizer(max_features=5000))
nb_step = ('nb', MultinomialNB())
clf = Pipeline(steps=[tfidf_step, nb_step])


In [9]:
# Fit the vectorizer and the classifier on the training portion only.
clf.fit(X=X_train, y=y_train)


0,1,2
,steps,"[('vectorizer', ...), ('nb', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [10]:
# Predict the held-out messages, then score them with each metric in turn.
y_pred = clf.predict(X_test)
accuracy, precision, recall = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score)
)


In [11]:
# Assemble the whole report first, then emit it with a single print call.
report_lines = [
    "Model Evaluation Metrics:",
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    "\nDetailed Classification Report:",
    classification_report(y_test, y_pred, target_names=['Ham', 'Spam']),
]
print("\n".join(report_lines))


Model Evaluation Metrics:
Accuracy: 0.9734
Precision: 1.0000
Recall: 0.8011

Detailed Classification Report:
              precision    recall  f1-score   support

         Ham       0.97      1.00      0.98      1207
        Spam       1.00      0.80      0.89       186

    accuracy                           0.97      1393
   macro avg       0.99      0.90      0.94      1393
weighted avg       0.97      0.97      0.97      1393



In [12]:
# Test on example emails
emails = [
    'Win a free lottery...',
    'Will u meet ur dream partner soon? Is ur career off 2 a flyng start? 2 find out free, txt HORO followed by ur star sign, e. g. HORO ARIES'
]
# The pipeline was fitted on preprocess_text() output, so unseen messages must
# receive the same cleaning before they reach the vectorizer; otherwise the
# tokens seen at prediction time can differ from those seen during training.
processed_emails = [preprocess_text(email) for email in emails]
predictions = clf.predict(processed_emails)
print("\nExample Email Predictions:")
# Show the original (uncleaned) text next to its predicted label.
for email, pred in zip(emails, predictions):
    print(f"Email: {email}")
    print(f"Prediction: {'Spam' if pred == 1 else 'Ham'}\n")




Example Email Predictions:
Email: Win a free lottery...
Prediction: Spam

Email: Will u meet ur dream partner soon? Is ur career off 2 a flyng start? 2 find out free, txt HORO followed by ur star sign, e. g. HORO ARIES
Prediction: Ham

