# Spam Detection Project

This notebook demonstrates the spam detection pipeline using modular Python scripts.

## Steps:
1. Load Dataset
2. Data Visualization
3. Text Preprocessing & Vectorization
4. Train-Test Split
5. Model Training (Naive Bayes)
6. Evaluation
7. Custom Prediction

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sys
import os


# os.path.abspath('') resolves to the current working directory; appending it
# to sys.path is what lets the `src.*` imports below be found
# (assumes the kernel is started from the project root — TODO confirm).
sys.path.append(os.path.abspath(''))

from src.utils import load_data
from src.preprocessing import clean_text, create_vectorizer
from src.model import train_model, evaluate_model, predict_email
from src.visualization import plot_spam_ham_count, plot_confusion_matrix, plot_text_length_distribution

ModuleNotFoundError: No module named 'pandas'

## 1. Load Dataset

In [3]:

# Location of the CSV, relative to the notebook's working directory.
DATA_PATH = "spam_ham_dataset.csv/spam_ham_dataset.csv"

try:
    df = load_data(DATA_PATH)
except FileNotFoundError as e:
    # Show the loader's message, then stop the run here: every later cell
    # needs `df`, so swallowing the error would only surface as a confusing
    # NameError further down the notebook.
    print(e)
    raise
else:
    # Only reached when the load succeeded, so `df` is guaranteed to exist.
    print("Dataset loaded successfully.")
    print(df.head())

NameError: name 'load_data' is not defined

## 2. Data Visualization

In [4]:

# Helper imported from src.visualization; judging by its name it charts how many
# messages fall in each class — TODO confirm against the helper's implementation.
plot_spam_ham_count(df)

NameError: name 'plot_spam_ham_count' is not defined

In [None]:

# Length of each message body, stored in a new 'length' column.
# The vectorised .str.len() accessor replaces the per-row .apply(len) call and
# yields NaN for a missing body instead of raising TypeError.
df['length'] = df['text'].str.len()

# Hand the augmented frame to the project's plotting helper.
plot_text_length_distribution(df)

## 3. Preprocessing & Vectorization

In [None]:
# --- Text normalisation -------------------------------------------------
# Run every raw message through the project's clean_text helper and keep the
# result alongside the original in a 'clean_text' column.
df['clean_text'] = df['text'].apply(clean_text)

# --- Feature extraction -------------------------------------------------
# Targets are taken directly from the 'label_num' column.
y = df['label_num']

# NOTE(review): the vectorizer is fitted on the whole dataset here, before the
# train/test split performed later in the notebook, so held-out rows influence
# the fitted vocabulary. Consider splitting first and fitting on the training
# portion only.
vectorizer = create_vectorizer(max_features=5000)
X = vectorizer.fit_transform(df['clean_text'])

print("Data vectorization complete.", f"X shape: {X.shape}", sep="\n")

## 4. Train-Test Split

In [None]:
# Reserve 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Row counts of each partition as a quick sanity check.
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

## 5. Model Training

In [None]:
# Fit the classifier on the training partition via the project's train_model
# helper (the notebook outline describes it as Naive Bayes — TODO confirm in
# src.model). The returned object is reused by the evaluation and prediction cells.
model = train_model(X_train, y_train)
print("Model training complete.")

## 6. Evaluation

In [None]:
# Score the fitted model on the held-out partition. evaluate_model hands back
# four values; y_pred is unpacked but not used further in this cell.
accuracy, report, cm, y_pred = evaluate_model(model, X_test, y_test)

# Headline metric followed by the textual report, one item per line.
print(
    f"Accuracy: {accuracy}",
    "classification Report:",
    report,
    sep="\n",
)

# Render the confusion matrix with the project's plotting helper.
plot_confusion_matrix(cm)

## 7. Custom Prediction

In [None]:
# Two hand-written messages used to spot-check the fitted model.
sample_email = "Congratulations! You won 1 million dollars"
sample_email_2 = "Hey, let's meet for lunch tomorrow."

# Each message goes through predict_email together with the fitted vectorizer
# and the clean_text helper, then is echoed next to the returned prediction.
prediction = predict_email(model, vectorizer, sample_email, clean_text)
print(f"Email: '{sample_email}'", f"Prediction: {prediction}", sep="\n")

prediction_2 = predict_email(model, vectorizer, sample_email_2, clean_text)
print(f"Email: '{sample_email_2}'", f"Prediction: {prediction_2}", sep="\n")