# Spam Email Classification - Interactive Exploration

This notebook provides an interactive environment for exploring spam email classification. It trains and evaluates traditional machine learning models; the deep learning models are imported but not trained here and are listed as a next step in the summary.

## Features
- Data exploration and visualization
- Model training and evaluation
- Interactive predictions
- Performance analysis

In [None]:
# Import necessary libraries (standard library and third-party)
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation and numerical warnings from the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Add src to path so the project modules below can be imported.
# 'src' is a relative entry, so it is resolved against the kernel's working
# directory: start the notebook from the directory that contains src/.
sys.path.append('src')

# Import our custom modules (must come after the sys.path change above).
# NOTE(review): DeepLearningModels and EnsemblePredictor are not referenced by
# any cell visible in this notebook.
from data_preprocessing import DataLoader, EmailPreprocessor
from traditional_ml import TraditionalMLModels
from deep_learning import DeepLearningModels, EnsemblePredictor

# Set plotting style and default colour palette for all figures.
# NOTE(review): the 'seaborn-v0_8' style name presumably needs a recent
# Matplotlib release — TODO confirm the minimum supported version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✓ All libraries imported successfully!")

## 1. Data Loading and Exploration

In [None]:
# Build the working DataFrame: raw load first, then the loader's preparation step
loader = DataLoader()
df = loader.load_spam_dataset()
prepared_df = loader.prepare_dataset(df)

# Compact overview: shape, column names, then the class balance
print(
    f"Dataset shape: {prepared_df.shape}",
    f"\nColumn names: {list(prepared_df.columns)}",
    "\nLabel distribution:",
    sep="\n",
)
print(prepared_df['label'].value_counts())

# Leave the preview as the cell's last expression so it renders as a rich table
prepared_df.head()

In [None]:
# Class balance shown two ways: proportions (pie) and absolute counts (bar)
label_counts = prepared_df['label'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
pie_ax, bar_ax = axes

# Left panel: share of each label
pie_ax.pie(
    label_counts.values,
    labels=label_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
pie_ax.set_title('Email Label Distribution')

# Right panel: raw count per label
label_counts.plot.bar(ax=bar_ax, color=['skyblue', 'lightcoral'])
bar_ax.set_title('Email Label Counts')
bar_ax.set_xlabel('Label')
bar_ax.set_ylabel('Count')
bar_ax.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

In [None]:
# Text length analysis: a 2x2 grid of distributions over the prepared dataset
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Text length distribution (characters, all emails)
axes[0, 0].hist(prepared_df['length'], bins=30, alpha=0.7, color='skyblue')
axes[0, 0].set_title('Text Length Distribution')
axes[0, 0].set_xlabel('Character Count')
axes[0, 0].set_ylabel('Frequency')

# Word count distribution (all emails)
axes[0, 1].hist(prepared_df['num_words'], bins=30, alpha=0.7, color='lightgreen')
axes[0, 1].set_title('Word Count Distribution')
axes[0, 1].set_xlabel('Word Count')
axes[0, 1].set_ylabel('Frequency')

# Text length by label: ham and spam drawn side by side on shared bins
spam_length = prepared_df[prepared_df['label'] == 'spam']['length']
ham_length = prepared_df[prepared_df['label'] == 'ham']['length']

axes[1, 0].hist([ham_length, spam_length], bins=20, alpha=0.7,
                label=['Ham', 'Spam'], color=['skyblue', 'lightcoral'])
axes[1, 0].set_title('Text Length by Label')
axes[1, 0].set_xlabel('Character Count')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].legend()

# Box plot of features by label.
# The column list gets a cell-specific name on purpose: the Feature Analysis
# section defines its own `feature_cols` list with different contents, and
# reusing that name here would change what the later cells select if this cell
# were re-run after them.
boxplot_feature_cols = ['num_words', 'num_exclamation', 'num_uppercase', 'capital_ratio']
# Long format (one row per email/feature pair) is what seaborn needs to group
# by feature on the x axis and by label via hue.
melted_df = prepared_df[boxplot_feature_cols + ['label']].melt(
    id_vars=['label'], var_name='feature', value_name='value'
)
sns.boxplot(data=melted_df, x='feature', y='value', hue='label', ax=axes[1, 1])
axes[1, 1].set_title('Feature Distribution by Label')
axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Word clouds for spam vs ham.
# dropna()/astype(str) guard the join: ' '.join raises TypeError as soon as one
# cleaned_text entry is NaN or otherwise not a string.
spam_text = ' '.join(
    prepared_df.loc[prepared_df['label'] == 'spam', 'cleaned_text'].dropna().astype(str)
)
ham_text = ' '.join(
    prepared_df.loc[prepared_df['label'] == 'ham', 'cleaned_text'].dropna().astype(str)
)

fig, axes = plt.subplots(1, 2, figsize=(20, 8))

# One (axes, corpus, title) entry per class so both panels share the same code path
wordcloud_panels = [
    (axes[0], spam_text, 'Spam Email Word Cloud'),
    (axes[1], ham_text, 'Ham Email Word Cloud'),
]

for panel_ax, corpus, panel_title in wordcloud_panels:
    # Only build a cloud when the class actually has text to draw
    if corpus.strip():
        cloud = WordCloud(width=400, height=400, background_color='white').generate(corpus)
        panel_ax.imshow(cloud, interpolation='bilinear')
    # Title the panel and hide its frame in every case, so a class without text
    # shows up as a labelled blank panel instead of an empty set of ticks
    panel_ax.set_title(panel_title, fontsize=16)
    panel_ax.axis('off')

plt.tight_layout()
plt.show()

## 2. Feature Analysis

In [None]:
# Pairwise correlations between the numeric features and the target column
feature_cols = [
    'length', 'num_words', 'num_sentences', 'num_exclamation',
    'num_question', 'num_uppercase', 'num_digits', 'has_money_words',
    'has_urgent_words', 'capital_ratio', 'target',
]
correlation_matrix = prepared_df[feature_cols].corr()

# Draw on an explicit axes instead of relying on pyplot's current-axes state
corr_fig, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    ax=corr_ax,
)
corr_ax.set_title('Feature Correlation Matrix')
corr_fig.tight_layout()
plt.show()

In [None]:
# Feature importance for spam detection
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Prepare features.
# Predictors are picked by name rather than with feature_cols[:-1]: the
# positional slice is only correct while 'target' happens to be the last entry
# of the list, and silently drops a real feature otherwise.
importance_feature_cols = [col for col in feature_cols if col != 'target']
X_features = prepared_df[importance_feature_cols]
y = prepared_df['target']

# Scale features to zero mean / unit variance before fitting
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_features)

# Train Random Forest for feature importance (fixed seed for repeatable rankings)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_scaled, y)

# Pair each feature with its importance; ascending sort puts the most
# important feature at the top of the horizontal bar chart
feature_importance = pd.DataFrame({
    'feature': importance_feature_cols,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=True)

plt.figure(figsize=(10, 8))
plt.barh(feature_importance['feature'], feature_importance['importance'])
plt.title('Feature Importance for Spam Detection')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()

## 3. Model Training and Evaluation

In [None]:
# Split the data
from sklearn.model_selection import train_test_split

# Raw text, cleaned text and labels are split in one call so rows stay aligned
y = prepared_df['target']
X_text, X_cleaned = prepared_df['text'], prepared_df['cleaned_text']

(
    X_train_text, X_test_text,
    X_train_cleaned, X_test_cleaned,
    y_train, y_test,
) = train_test_split(
    X_text, X_cleaned, y,
    test_size=0.2,
    random_state=42,
    stratify=y,  # keep the spam/ham ratio the same in both splits
)

for split_name, split_texts in (("Training", X_train_text), ("Test", X_test_text)):
    print(f"{split_name} set size: {len(split_texts)}")
print("Class distribution in training set:")
print(pd.Series(y_train).value_counts())

In [None]:
# Fit every traditional model on the cleaned training text (grid search disabled)
print("Training Traditional ML Models...")
ml_models = TraditionalMLModels()
ml_models.train_all_models(X_train_cleaned, y_train, use_grid_search=False)

# Score the fitted models on the held-out split
ml_results = ml_models.evaluate_all_models(X_test_cleaned, y_test)

# One line per model with its two headline metrics
print("\nML Model Results:")
for name, scores in ml_results.items():
    f1_value, auc_value = scores['f1_score'], scores['auc_score']
    print(f"{name}: F1={f1_value:.4f}, AUC={auc_value:.4f}")

In [None]:
# Tabulate the evaluation metrics: one row per model, one column per metric
metric_keys = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1_score',
    'AUC': 'auc_score',
}
comparison_rows = []
for model_name, result in ml_results.items():
    row = {'Model': model_name.replace('_', ' ').title()}
    row.update({column: result[key] for column, key in metric_keys.items()})
    comparison_rows.append(row)
ml_comparison = pd.DataFrame(comparison_rows)

# One bar chart per metric on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Traditional ML Model Performance', fontsize=16)

metrics = list(metric_keys)

# axes.flat walks the grid row by row, so the i-th metric lands at (i // 3, i % 3)
for ax, metric in zip(axes.flat, metrics):
    bars = ax.bar(ml_comparison['Model'], ml_comparison[metric])
    ax.set_title(metric)
    ax.set_ylabel(metric)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3)

    # Write each value just above the top edge of its bar
    for bar, value in zip(bars, ml_comparison[metric]):
        label_x = bar.get_x() + bar.get_width()/2.
        label_y = bar.get_height() + 0.01
        ax.text(label_x, label_y, f'{value:.3f}', ha='center', va='bottom')

# Five metrics on six panels: drop the unused bottom-right axes
fig.delaxes(axes[1, 2])

plt.tight_layout()
plt.show()

# Comparison table, best F1 first
print("\nML Model Comparison:")
display(ml_comparison.sort_values('F1-Score', ascending=False))

## 4. Interactive Predictions

In [None]:
# Interactive prediction function
def predict_email(text, show_individual=True):
    """Classify one email with every trained model and print a report.

    Args:
        text: Email text to classify.
        show_individual: When True, print each model's own prediction before
            the averaged result.

    Returns:
        Tuple ``(label, confidence)`` where ``label`` is 'spam' or 'ham' and
        ``confidence`` is the averaged probability of that label.
    """
    banner = '=' * 60
    preview = text[:100] + ('...' if len(text) > 100 else '')
    print(f"\n{banner}")
    print(f"EMAIL TEXT: {preview}")
    print(banner)

    # One prediction dict per model, keyed by model name
    ml_predictions = ml_models.predict(text)

    if show_individual:
        print("\nINDIVIDUAL MODEL PREDICTIONS:")
        print("-" * 40)

        for model_name, pred in ml_predictions.items():
            pretty_name = model_name.replace('_', ' ').title()
            print(f"{pretty_name}:")
            print(f"  Prediction: {pred['prediction'].upper()}")
            print(f"  Confidence: {pred['confidence']:.3f}")
            print(f"  Spam Probability: {pred['spam_probability']:.3f}")
            print()

    # Ensemble = unweighted mean of the per-model spam probabilities
    avg_spam_prob = np.mean([pred['spam_probability'] for pred in ml_predictions.values()])
    if avg_spam_prob > 0.5:
        ensemble_pred = 'spam'
    else:
        ensemble_pred = 'ham'
    # Confidence is the probability assigned to whichever class won
    ensemble_conf = max(avg_spam_prob, 1 - avg_spam_prob)

    print("ENSEMBLE PREDICTION:")
    print("-" * 20)
    print(f"Prediction: {ensemble_pred.upper()}")
    print(f"Confidence: {ensemble_conf:.3f}")
    print(f"Spam Probability: {avg_spam_prob:.3f}")

    return ensemble_pred, ensemble_conf

In [None]:
# Sanity check: an obviously promotional message
spam_sample = (
    "WINNER! You've won $1000! "
    "Click here to claim your prize NOW! "
    "Limited time offer!"
)
predict_email(spam_sample)

In [None]:
# Sanity check: an ordinary personal message
ham_sample = (
    "Hey John, thanks for your help with the project yesterday. "
    "The presentation went really well!"
)
predict_email(ham_sample)

In [None]:
# Interactive prediction widget.
# The notebook widgets live in the standalone `ipywidgets` package; the
# `IPython.widgets` import path is not available in current IPython releases.
from ipywidgets import interact, widgets

# Sample texts for quick testing. "Custom" is a placeholder entry: choosing it
# makes the widget classify the textarea content instead.
sample_texts = {
    "Spam 1": "URGENT! Your account will be suspended unless you verify immediately!",
    "Spam 2": "FREE MONEY! No strings attached! Call now!!!",
    "Ham 1": "Don't forget about mom's birthday next week.",
    "Ham 2": "The weather is really nice today, perfect for a walk.",
    "Custom": ""
}

def interactive_predict(sample_choice, custom_text):
    """Classify the selected sample, or the custom text when "Custom" is chosen.

    Args:
        sample_choice: Key of ``sample_texts`` picked in the dropdown.
        custom_text: Free-form text from the textarea; only used when
            ``sample_choice`` is "Custom".
    """
    if sample_choice == "Custom":
        text = custom_text
    else:
        text = sample_texts[sample_choice]

    # Skip blank / whitespace-only input instead of sending it to the models
    if text.strip():
        predict_email(text, show_individual=False)
    else:
        print("Please enter some text to classify.")

# Create interactive widget. The trailing semicolon stops the cell from echoing
# the function object that interact() returns.
interact(
    interactive_predict,
    sample_choice=widgets.Dropdown(
        options=list(sample_texts.keys()),
        value="Spam 1",
        description="Sample:"
    ),
    custom_text=widgets.Textarea(
        value="",
        placeholder="Enter your custom email text here...",
        description="Custom Text:",
        layout=widgets.Layout(width='100%', height='100px')
    )
);

## 5. Error Analysis

In [None]:
# Pick the model with the highest test F1 and inspect where it goes wrong
best_model_name = max(ml_results, key=lambda name: ml_results[name]['f1_score'])
best_model = ml_models.trained_models[best_model_name]

print(f"Analyzing errors for best model: {best_model_name}")

# Hard labels plus the probability of the positive (spam) class
y_pred = best_model.predict(X_test_cleaned)
y_pred_proba = best_model.predict_proba(X_test_cleaned)[:, 1]

# Boolean masks over the test set, one per error type
misclassified_mask = y_pred != y_test
false_positives = (y_pred == 1) & (y_test == 0)
false_negatives = (y_pred == 0) & (y_test == 1)

print(f"Total misclassifications: {misclassified_mask.sum()}")
print(f"False positives (ham → spam): {false_positives.sum()}")
print(f"False negatives (spam → ham): {false_negatives.sum()}")

# Show up to three examples per error type. "Confidence" is the probability
# the model gave to the (wrong) class it predicted.
error_reports = [
    ("\nFalse Positive Examples (Ham classified as Spam):", false_positives, y_pred_proba),
    ("\nFalse Negative Examples (Spam classified as Ham):", false_negatives, 1 - y_pred_proba),
]
for heading, error_mask, confidences in error_reports:
    if error_mask.sum() == 0:
        continue
    print(heading)
    # np.where yields positions within the test split, hence .iloc below
    for rank, position in enumerate(np.where(error_mask)[0][:3], 1):
        print(f"\n{rank}. Confidence: {confidences[position]:.3f}")
        print(f"   Text: {X_test_text.iloc[position][:200]}...")

## 6. Model Interpretation

In [None]:
# Inspect which vocabulary terms drive the best traditional model.
# Relies on the pipeline exposing 'tfidf' and 'classifier' steps.
classifier = best_model.named_steps['classifier']

if hasattr(classifier, 'feature_importances_'):
    # Models that expose feature_importances_ (e.g. tree ensembles)
    feature_names = best_model.named_steps['tfidf'].get_feature_names_out()
    importances = classifier.feature_importances_

    # argsort is ascending, so the last 20 positions hold the 20 largest values
    top_indices = np.argsort(importances)[-20:]
    top_features = [(feature_names[i], importances[i]) for i in top_indices]

    print("Top 20 Most Important Features:")
    for feature, importance in top_features[::-1]:
        print(f"{feature:<20}: {importance:.4f}")

    # Bars are drawn in ascending order, which puts the top term at the top
    features, values = zip(*top_features)
    bar_positions = range(len(features))

    plt.figure(figsize=(12, 8))
    plt.barh(bar_positions, values)
    plt.yticks(bar_positions, features)
    plt.xlabel('Feature Importance')
    plt.title(f'Top Features - {best_model_name.replace("_", " ").title()}')
    plt.tight_layout()
    plt.show()

elif hasattr(classifier, 'coef_'):
    # Linear models: the sign of a coefficient tells which class a term favours
    feature_names = best_model.named_steps['tfidf'].get_feature_names_out()
    coefficients = classifier.coef_[0]

    # Sort once, then take both ends of the ranking
    ascending_order = np.argsort(coefficients)
    top_positive_indices = ascending_order[-10:]
    top_negative_indices = ascending_order[:10]

    print("Top 10 Spam Indicators (Positive Coefficients):")
    for i in top_positive_indices[::-1]:
        print(f"{feature_names[i]:<20}: {coefficients[i]:.4f}")

    print("\nTop 10 Ham Indicators (Negative Coefficients):")
    for i in top_negative_indices:
        print(f"{feature_names[i]:<20}: {coefficients[i]:.4f}")

## 7. Summary and Conclusions

In [None]:
# Final text summary: dataset sizes, best model metrics, takeaways
print("SPAM CLASSIFICATION PROJECT SUMMARY")
print("=" * 50)

print("\nDataset Statistics:")
print(f"  Total samples: {len(prepared_df)}")
print(f"  Training samples: {len(X_train_text)}")
print(f"  Test samples: {len(X_test_text)}")
# 'target' is summed and divided by the row count to get the spam share
spam_percentage = (prepared_df['target'].sum() / len(prepared_df)) * 100
print(f"  Spam percentage: {spam_percentage:.1f}%")

print("\nModel Performance:")
best_result = ml_results[best_model_name]
print(f"  Best Model: {best_model_name.replace('_', ' ').title()}")
# (display label, key in the result dict) in the order they should be printed
reported_metrics = (
    ("Accuracy", 'accuracy'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1-Score", 'f1_score'),
    ("AUC Score", 'auc_score'),
)
for metric_label, metric_key in reported_metrics:
    print(f"  {metric_label}: {best_result[metric_key]:.4f}")

print("\nKey Insights:")
for insight in (
    "Traditional ML models are effective for spam classification",
    "Text preprocessing significantly improves performance",
    "Feature engineering (length, caps, punctuation) adds value",
    "Ensemble methods can provide more robust predictions",
):
    print(f"  • {insight}")

print("\nNext Steps:")
for next_step in (
    "Try deep learning models (BERT, DistilBERT) for comparison",
    "Collect more diverse training data",
    "Implement real-time classification system",
    "Add more sophisticated feature engineering",
):
    print(f"  • {next_step}")

In [None]:
# Persist the artefacts needed outside the notebook
import joblib
import os

# Create the output directory on first run; a no-op afterwards
os.makedirs('models', exist_ok=True)

# Best-scoring pipeline, named after the model it contains
best_model_path = f'models/{best_model_name}_model.joblib'
joblib.dump(best_model, best_model_path)
print(f"✓ Best model saved: {best_model_path}")

# NOTE(review): this serialises a newly constructed EmailPreprocessor, not an
# instance used earlier in the notebook — confirm it carries no fitted state.
preprocessor = EmailPreprocessor()
preprocessor_path = 'models/email_preprocessor.joblib'
joblib.dump(preprocessor, preprocessor_path)
print(f"✓ Preprocessor saved: {preprocessor_path}")

print("\n🎉 Spam classification project completed successfully!")
print("Models are ready for deployment in the web application.")