In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import pickle
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

print("Libraries imported successfully!")

## Step 1: Load the Dataset

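Download the dataset attached to this Kaggle notebook (the link opens the notebook page; the CSV is listed under its input data): https://www.kaggle.com/code/mfaisalqureshi/email-spam-detection-98-accuracy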

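The dataset should be named `spam.csv` and placed in a `data/` folder next to this notebook. The loading code expects the label (`ham`/`spam`) in column `v1` and the message text in column `v2`.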

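Alternative: download a dataset with the Kaggle API (check that the file you end up using has the `v1`/`v2` columns this notebook expects, or rename its columns accordingly):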
```bash
kaggle datasets download -d balaka18/email-spam-classification-dataset-csv
```

In [None]:
# Read the spam corpus from disk, falling back to a tiny synthetic corpus.
# Expected layout of data/spam.csv: column 'v1' holds the label and column
# 'v2' holds the message text.
# The fallback only exists so the rest of the notebook can run end to end;
# download the real file (see the links above) for meaningful results.

try:
    df = pd.read_csv('data/spam.csv', encoding='latin-1')
except FileNotFoundError:
    print("Dataset not found. Creating sample data for demonstration...")
    # Five hand-written messages (3 ham, 2 spam) repeated 200 times -> 1000 rows
    sample_data = {
        'v1': 200 * ['ham', 'spam', 'ham', 'spam', 'ham'],
        'v2': 200 * [
            'Hey, how are you doing today?',
            'WINNER!! You have won $1000000! Click here now!!!',
            'Let\'s meet for coffee tomorrow',
            'FREE FREE FREE! Buy now and get 90% discount!!!',
            'Can you send me the report please?',
        ],
    }
    df = pd.DataFrame(sample_data)
    print("Sample dataset created for demonstration")
else:
    print("Dataset loaded successfully!")

# Quick look at what was loaded: dimensions, then the first rows
print(
    f"\nDataset shape: {df.shape}",
    f"\nFirst few rows:",
    df.head(),
    sep="\n",
)

In [None]:
# Keep only the label/message columns under stable names.
# rename + select (instead of overwriting df.columns) keeps this cell safe to
# re-run: once the columns are already called 'label'/'message' the rename is
# a no-op and the selection still succeeds. .copy() makes the result an
# independent frame before new columns are assigned to it below.
df = df.rename(columns={'v1': 'label', 'v2': 'message'})[['label', 'message']].copy()

# Check for missing values
print(f"Missing values:\n{df.isnull().sum()}")

# Drop rows where the label or the message is missing
df = df.dropna().copy()

# Check class distribution
print(f"\nClass distribution:")
print(df['label'].value_counts())

# Encode labels: spam=1, ham=0.
# The lookup ignores letter case and surrounding whitespace, so 'Spam' or
# ' ham' are still recognised; the original 'label' column is left untouched.
label_map = {'spam': 1, 'ham': 0}
df['label_encoded'] = df['label'].astype(str).str.strip().str.lower().map(label_map)

# Anything that is neither spam nor ham would otherwise turn into NaN and
# surface much later as a confusing error during the split or training.
unknown_labels = df.loc[df['label_encoded'].isna(), 'label'].unique()
if len(unknown_labels) > 0:
    raise ValueError(
        f"Unexpected label values (expected 'spam' or 'ham'): {list(unknown_labels)}"
    )
df['label_encoded'] = df['label_encoded'].astype('int64')

print(f"\nDataset after preprocessing:")
print(df.head())

## Step 2: Text Preprocessing

Clean and preprocess the email text data

In [None]:
def clean_text(text):
    """
    Normalise a raw message into lowercase, letters-only text.

    Steps, in order:
    - Convert to lowercase
    - Remove URLs (any run of non-space characters beginning with
      ``http`` or ``www``)
    - Remove email addresses (any run of non-space characters around ``@``)
    - Delete every character that is not an ASCII letter or whitespace.
      Digits and punctuation are removed, not replaced by a space, so
      "don't" becomes "dont".
    - Collapse runs of whitespace into one space and trim both ends

    Args:
        text: The message to clean. ``None`` and float NaN are treated as an
            empty message; any other non-string value is converted with
            ``str()`` before cleaning.

    Returns:
        str: The cleaned text (may be empty).
    """
    # Missing values (None, or the float NaN pandas uses for empty cells)
    # have no text to clean. NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase (coercing non-string input instead of crashing)
    text = str(text).lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove special characters and digits; the text is already lowercase,
    # so only a-z needs to be kept
    text = re.sub(r'[^a-z\s]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Run every message through clean_text. The raw text stays in 'message' and
# the normalised version is stored alongside it in 'cleaned_message'.
df['cleaned_message'] = df['message'].apply(clean_text)

# Show the first message before and after cleaning as a sanity check
print(
    "Text cleaning complete!",
    f"\nExample of cleaned text:",
    f"Original: {df['message'].iloc[0]}",
    f"Cleaned: {df['cleaned_message'].iloc[0]}",
    sep="\n",
)

## Step 3: Feature Extraction using TF-IDF

In [None]:
# Hold out 20% of the messages for evaluation.
# Stratifying on the label keeps the spam/ham ratio the same in both parts,
# and the fixed random_state makes the split reproducible.
X, y = df['cleaned_message'], df['label_encoded']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each part and the class balance of the training part
print(
    f"Training set size: {len(X_train)}",
    f"Testing set size: {len(X_test)}",
    f"\nClass distribution in training set:",
    y_train.value_counts(),
    sep="\n",
)

In [None]:
# Turn the cleaned text into a numeric TF-IDF matrix.
#   ngram_range=(1, 2) -> single words and two-word phrases
#   max_df=0.8         -> drop terms found in more than 80% of documents
#   min_df=2           -> drop terms found in fewer than 2 documents
#   max_features=3000  -> keep at most the top 3000 terms
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_df=0.8,
    min_df=2,
    max_features=3000,
)

# The vocabulary and IDF weights are fitted on the training messages only;
# the test messages are merely projected onto that fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(
    f"TF-IDF feature matrix shape: {X_train_tfidf.shape}",
    f"Number of features: {len(vectorizer.get_feature_names_out())}",
    sep="\n",
)

## Step 4: Train Multiple Models and Compare

In [None]:
# Candidate classifiers, all trained on the same TF-IDF features
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# name -> fitted model, its test accuracy and its test-set predictions
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # fit() returns the estimator itself, so training and prediction chain
    y_pred = model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)

    results[name] = dict(model=model, accuracy=accuracy, predictions=y_pred)

    print(f"{name} Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Pick the entry with the highest test accuracy (first one wins on a tie).
# NOTE: the winner is chosen on the same test set that is reported below, so
# the reported accuracy of the selected model is slightly optimistic.
best_model_name = max(results.items(), key=lambda item: item[1]['accuracy'])[0]
best_model = results[best_model_name]['model']
best_accuracy = results[best_model_name]['accuracy']

print(
    f"\n{'='*50}",
    f"Best Model: {best_model_name}",
    f"Best Accuracy: {best_accuracy:.4f} ({best_accuracy*100:.2f}%)",
    f"{'='*50}",
    sep="\n",
)

## Step 5: Evaluate Best Model

In [None]:
# Reuse the test-set predictions stored during training for the winning model
y_pred_best = results[best_model_name]['predictions']

# Per-class precision / recall / F1; label 0 is shown as Ham, label 1 as Spam
print(
    "Classification Report:",
    "="*50,
    classification_report(y_test, y_pred_best, target_names=['Ham', 'Spam']),
    sep="\n",
)

# Confusion matrix of true labels against predicted labels
# (kept in `cm` because the plotting cell below draws it)
cm = confusion_matrix(y_test, y_pred_best)
print("\nConfusion Matrix:", cm, sep="\n")

In [None]:
# Draw the confusion matrix as an annotated heatmap and save it to disk.
# An explicit figure/axes pair is used so every call targets this plot.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Ham', 'Spam'],
    yticklabels=['Ham', 'Spam'],
    ax=ax,
)
ax.set_title(f'Confusion Matrix - {best_model_name}')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
fig.tight_layout()
fig.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("Confusion matrix saved as 'confusion_matrix.png'")

In [None]:
# Visualize model comparison: one bar per model, labelled with its accuracy
plt.figure(figsize=(10, 6))
model_names = list(results.keys())
accuracies = [results[name]['accuracy'] * 100 for name in model_names]

bars = plt.bar(model_names, accuracies, color=['#3498db', '#e74c3c', '#2ecc71'])
plt.xlabel('Model', fontsize=12)
plt.ylabel('Accuracy (%)', fontsize=12)
plt.title('Model Comparison - Accuracy', fontsize=14, fontweight='bold')

# Zoom in on the 90-100% band when every model is inside it, but lower the
# axis floor when a model scores 90% or less so that no bar is cut off.
lowest_accuracy = min(accuracies, default=100)
y_floor = 90 if lowest_accuracy > 90 else max(0, int(lowest_accuracy) - 5)
# Leave headroom above 100% so the value label of a (near-)perfect model
# stays inside the axes instead of running into the title.
y_span = 100 - y_floor
plt.ylim([y_floor, 100 + 0.12 * y_span])

# Add accuracy values on top of bars; the gap scales with the visible range
label_gap = 0.02 * y_span
for bar, acc in zip(bars, accuracies):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + label_gap,
             f'{acc:.2f}%', ha='center', va='bottom', fontsize=10)

plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("Model comparison saved as 'model_comparison.png'")

## Step 6: Save the Model and Vectorizer

In [None]:
# Bundle everything needed at prediction time into one object: the fitted
# classifier, the fitted vectorizer it was trained with, and its metadata.
model_data = dict(
    model=best_model,
    vectorizer=vectorizer,
    model_name=best_model_name,
    accuracy=best_accuracy,
)

# Persist the bundle as a single pickle file in the working directory
with open('spam_model.pkl', 'wb') as f:
    pickle.dump(model_data, f)

print(
    f"Model saved successfully as 'spam_model.pkl'",
    f"Model: {best_model_name}",
    f"Accuracy: {best_accuracy*100:.2f}%",
    sep="\n",
)

## Step 7: Test the Saved Model

Load the model and test it with new email examples

In [None]:
# Function to load and use the model
def load_spam_classifier(model_path='spam_model.pkl'):
    """
    Load a saved spam classification bundle from a pickle file.

    Args:
        model_path: Path of the pickle file to read. Defaults to
            'spam_model.pkl' in the current working directory, which is
            where the saving cell of this notebook writes it.

    Returns:
        The unpickled object. For files written by this notebook that is a
        dictionary with the keys 'model', 'vectorizer', 'model_name' and
        'accuracy'.

    Raises:
        FileNotFoundError: If no file exists at ``model_path``.
    """
    # SECURITY: unpickling executes code embedded in the file. Only load
    # model files you created yourself or obtained from a trusted source.
    with open(model_path, 'rb') as f:
        model_data = pickle.load(f)
    return model_data

def classify_email(email_text, model_data):
    """
    Classify a single email as spam or not spam.

    The text is cleaned with ``clean_text``, vectorized with the stored
    vectorizer and passed to the stored model.

    Args:
        email_text: The email message to classify
        model_data: Dictionary containing at least the keys 'model' (a fitted
            classifier) and 'vectorizer' (the fitted vectorizer it was
            trained with)

    Returns:
        tuple: (label, prob_spam)
            label is 'spam' when the model predicts class 1 and 'inbox'
            otherwise. prob_spam is the predicted probability of class 1 as
            a float, or None when the model cannot provide one.
    """
    model = model_data['model']

    # Clean the text
    cleaned = clean_text(email_text)

    # Vectorize
    vectorized = model_data['vectorizer'].transform([cleaned])

    # Predict
    prediction = model.predict(vectorized)[0]

    # Get probability if available. Only a missing predict_proba is treated
    # as "not available"; genuine errors raised while computing
    # probabilities are no longer silently discarded.
    prob_spam = None
    predict_proba = getattr(model, 'predict_proba', None)
    if callable(predict_proba):
        probability = predict_proba(vectorized)[0]
        # Probability columns follow the order of model.classes_, so the
        # spam column (class 1) is looked up instead of assumed to be
        # column 1. Models without classes_ fall back to the [0, 1] layout.
        classes = list(getattr(model, 'classes_', (0, 1)))
        if 1 in classes:
            prob_spam = float(probability[classes.index(1)])

    # Convert to label
    label = 'spam' if prediction == 1 else 'inbox'

    return label, prob_spam

# Read the pickle back from disk and echo the metadata stored with it,
# confirming that the save/load round trip works
loaded_model_data = load_spam_classifier()
print(
    f"Model loaded successfully!",
    f"Model type: {loaded_model_data['model_name']}",
    f"Model accuracy: {loaded_model_data['accuracy']*100:.2f}%",
    sep="\n",
)

In [None]:
# A mix of everyday messages and obvious spam to spot-check the reloaded model
test_emails = [
    "Hey! How are you? Want to grab lunch tomorrow?",
    "CONGRATULATIONS!!! You've WON $1,000,000! Click here NOW to claim your prize!!!",
    "Meeting scheduled for 3pm in conference room B",
    "FREE VIAGRA!!! Buy now and get 90% discount! Limited time offer!!!",
    "Can you please send me the quarterly report by end of day?",
    "URGENT: Your account will be closed. Click here to verify your information NOW!!!"
]

print("Testing with sample emails:", "="*70, sep="\n")

for i, email in enumerate(test_emails, 1):
    label, prob = classify_email(email, loaded_model_data)

    # Messages longer than 60 characters are shown truncated with '...'
    print(
        f"\nEmail {i}:",
        f"Text: {email[:60]}{'...' if len(email) > 60 else ''}",
        f"Classification: {label.upper()}",
        sep="\n",
    )
    # The probability line is skipped when classify_email reports None
    if prob is not None:
        print(f"Spam Probability: {prob*100:.2f}%")
    print("-"*70)

## Summary

### Model Training Complete!

**Outputs:**
1. `spam_model.pkl` - Trained model with vectorizer
2. `confusion_matrix.png` - Confusion matrix visualization
3. `model_comparison.png` - Model accuracy comparison

**Key Functions:**
- `load_spam_classifier()` - Load the saved model
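- `classify_email(text, model_data)` - Classify a new email; returns `(label, spam_probability)` where `label` is `'spam'` or `'inbox'` and `spam_probability` may be `None`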

**Usage Example:**
```python
model_data = load_spam_classifier()
label, probability = classify_email("Your email text here", model_data)
print(f"Classification: {label}")
```

The model is ready to be integrated into the backend API!