# Email Spam Classifier - UCI Spambase
This notebook walks through loading the dataset, preprocessing, training multiple classical ML classifiers, evaluating them, and plotting confusion matrices and ROC curves.

In [None]:
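# Install dependencies (run once, then restart the kernel).
# Use the %pip magic rather than !pip so packages are installed into the
# environment of the running kernel:
# %pip install -q -r requirements.txt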


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default styling to the matplotlib figures drawn below.
sns.set()

# Location of the raw UCI Spambase data file.
DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data'

# header=None: the file is read without a header row, so pandas labels the
# columns with integers starting at 0.
df = pd.read_csv(
    filepath_or_buffer=DATA_URL,
    header=None,
)

# Leave the (rows, columns) tuple as the cell's displayed output.
df.shape


In [None]:
# Quick EDA: column count, first rows, and how often each value occurs in
# column 57 (the last column, which the later cells use as the target).
print('Columns:', len(df.columns))
print(df.head())
print(df.loc[:, 57].value_counts())


In [None]:
# Prepare data
# Every column except the last is a feature; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 20% of the rows for testing. stratify=y keeps the class ratio the
# same in both splits, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test-set information leaks in.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)


In [None]:
# Train models
# Maps a display name to an unfitted estimator; the loop below fits each one
# in place on the standardized training split, so `models` afterwards holds
# the trained classifiers used by the evaluation cell.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    # probability=True makes SVC run an internal cross-validated calibration
    # that shuffles the data randomly. Pinning random_state (same seed as the
    # train/test split) keeps predict_proba, and therefore the ROC/AUC,
    # identical across Restart & Run All.
    'SVM': SVC(probability=True, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'GaussianNB': GaussianNB()
}
for name, model in models.items():
    print('Training', name)
    model.fit(X_train_s, y_train)


In [None]:
# Evaluate models
# For every fitted classifier: print accuracy and the per-class report, show
# its confusion matrix in its own figure, and record its ROC curve. The ROC
# curves are drawn together on a single figure after the loop; plotting them
# inside the loop would scatter them across figures, because each
# confusion-matrix figure becomes the "current" pyplot figure and plt.show()
# finishes the open figures.
roc_results = {}  # model name -> (fpr, tpr, auc)

for name, model in models.items():
    y_pred = model.predict(X_test_s)
    acc = accuracy_score(y_test, y_pred)
    print('==', name, 'Accuracy:', acc)
    print(classification_report(y_test, y_pred))

    # confusion_matrix puts true labels on rows and predictions on columns.
    cm = confusion_matrix(y_test, y_pred)
    cm_fig, cm_ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', ax=cm_ax)
    cm_ax.set_xlabel('Predicted label')
    cm_ax.set_ylabel('True label')
    cm_ax.set_title('Confusion matrix - ' + name)
    plt.show()

    # ROC is best-effort: a model that exposes neither predict_proba nor
    # decision_function is reported and skipped rather than aborting the cell.
    try:
        if hasattr(model, 'predict_proba'):
            # Column 1 holds the score for the second class in model.classes_.
            probs = model.predict_proba(X_test_s)[:, 1]
        else:
            probs = model.decision_function(X_test_s)
        fpr, tpr, _ = roc_curve(y_test, probs)
        roc_results[name] = (fpr, tpr, auc(fpr, tpr))
    except Exception as e:
        print('ROC failed for', name, e)

# One shared axes for all ROC curves so the models can be compared directly.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
for name, (fpr, tpr, roc_auc) in roc_results.items():
    # Single braces so the AUC value is interpolated into the legend label.
    roc_ax.plot(fpr, tpr, label=f"{name} (AUC={roc_auc:.3f})")
roc_ax.plot([0, 1], [0, 1], '--')  # diagonal reference line
roc_ax.set_xlabel('False positive rate')
roc_ax.set_ylabel('True positive rate')
roc_ax.set_title('ROC Curves')
roc_ax.legend()
plt.show()


## Next steps
- Persist best model with `joblib`
- Hyperparameter tuning (GridSearchCV)
- Build a simple Flask app to serve predictions
