
# Email Spam Classifier (Spambase)
This notebook walks through loading the UCI Spambase dataset, preprocessing it, and training four classifiers (Logistic Regression, SVM, k-NN and Gaussian Naive Bayes), then compares their performance using accuracy, ROC AUC, confusion matrices and ROC curves.


In [None]:

# Ensure a local copy of the dataset exists.
# DATA_PATH is the location checked on disk; download_spambase is only
# invoked (with dest=DATA_PATH) when nothing is found there.
import os
from src.utils import download_spambase, load_spambase

DATA_PATH = "data/spambase.data"

if os.path.exists(DATA_PATH):
    # A file is already at DATA_PATH, so skip the download and just report it.
    print("Dataset already present at", DATA_PATH)
else:
    download_spambase(dest=DATA_PATH)


In [None]:

# Load and inspect data
# Column names are generated here: N_FEATURES generic feature columns
# followed by the "is_spam" label column.
N_FEATURES = 57
names = [f"feature_{i}" for i in range(N_FEATURES)] + ["is_spam"]

# Read from DATA_PATH (set in the download cell) instead of repeating the
# path literal, so the file that was checked/downloaded is the file loaded.
df = load_spambase(path=DATA_PATH, names=names)

# Print the shape separately and leave df.head() as the final expression so
# the notebook renders the preview as a table rather than a tuple repr.
print("Shape:", df.shape)
df.head()


In [None]:

# Preprocessing and split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Labels come from the "is_spam" column; every other column is a feature.
y = df["is_spam"].values
X = df.drop(columns=["is_spam"]).values

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# The scaler is fitted on the training rows only and then applied to both
# splits, so no statistics from the test rows enter the transformation.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

print(
    'Train shape', X_train.shape,
    'Test shape', X_test.shape,
)


In [None]:

# Train models
# Fits each classifier, evaluates it on the held-out split, records accuracy
# and ROC AUC in `results`, and writes a confusion-matrix plot (plus a ROC
# plot when ROC AUC could be computed) under outputs/.
import os

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from src.utils import plot_confusion_matrix, plot_roc

# Make sure the target directory exists before any file is written into it.
# NOTE(review): the plot helpers in src.utils may already create it — confirm;
# this call is harmless either way.
os.makedirs("outputs", exist_ok=True)

models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    # random_state pins the randomness used for SVC's probability estimates,
    # so predict_proba (and therefore ROC AUC) is reproducible across runs.
    "SVM": SVC(probability=True, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "GaussianNB": GaussianNB(),
}

# Models listed here are trained/evaluated on the standardized features;
# every other model uses the raw features.
SCALED_MODELS = ("KNN", "SVM", "LogisticRegression")

results = {}
for name, model in models.items():
    print('Training', name)

    # Pick the matching train/test feature matrices once, then share the
    # fit/predict/score code between both groups of models.
    if name in SCALED_MODELS:
        X_fit, X_eval = X_train_s, X_test_s
    else:
        X_fit, X_eval = X_train, X_test

    model.fit(X_fit, y_train)
    y_pred = model.predict(X_eval)

    # Prefer the positive-class probability (column 1) as the ranking score;
    # fall back to decision_function for estimators without predict_proba.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_eval)[:, 1]
    else:
        y_score = model.decision_function(X_eval)

    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # Only the metric's own ValueError (e.g. undefined ROC AUC) is treated as
    # "no score"; any other exception is a real problem and should surface.
    try:
        roc_auc = roc_auc_score(y_test, y_score)
    except ValueError as exc:
        print(f"{name}: ROC AUC could not be computed ({exc})")
        roc_auc = None

    results[name] = {"accuracy": acc, "roc_auc": roc_auc}
    print(f"{name} -> accuracy: {acc:.4f}, roc_auc: {roc_auc}")
    plot_confusion_matrix(cm, classes=['ham','spam'], out_path=f"outputs/confusion_{name}.png")
    if roc_auc is not None:
        plot_roc(y_test, y_score, out_path=f"outputs/roc_{name}.png", model_name=name)


In [None]:

# Results summary
# Writes the in-memory `results` dict to a JSON file and prints it.
import json, pprint
import os

p = "outputs/results_summary.json"

# open(p, "w") fails if the parent directory is missing, so create it first.
os.makedirs(os.path.dirname(p), exist_ok=True)

# Always rewrite the summary from the current `results`. Skipping the write
# when the file already exists would leave a stale summary on disk that no
# longer matches the metrics printed below after a re-run.
with open(p, "w") as f:
    json.dump(results, f, indent=2)

print("Outputs saved in ./outputs")
print("Summary written to", p)
pprint.pprint(results)
