## Problem 2 - Naive Bayes

In [39]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')


In [40]:
data = np.loadtxt("spambase/spambase.data", delimiter=",")
X = data[:, :-1]
y = data[:, -1].astype(int)

scaler = StandardScaler()
X_norm = scaler.fit_transform(X)

n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

Gaussian Naive Bayes Classifier:

In [41]:
class GaussianNaiveBayes:
    def __init__(self):
        self.priors = None
        self.means = None
        self.variances = None
        self.classes = None

    def fit(self, X, y):
        self.classes = np.unique(y)
        n_samples, n_features = X.shape

        self.means = np.zeros((len(self.classes), n_features))
        self.variances = np.zeros((len(self.classes), n_features))
        self.priors = np.zeros(len(self.classes))

        # each class calculation for prior, mean and variance
        for idx, c in enumerate(self.classes):
            X_c = X[y==c]

            self.priors[idx] = len(X_c) / n_samples
            self.means[idx, :] = np.mean(X_c, axis=0)
            self.variances[idx, :] = np.var(X_c, axis=0) + 1e-9

    def gaussian(self, x, mean, variance):
        return (1.0 / np.sqrt(2*np.pi*variance)) * np.exp(-((x-mean)**2) / (2 * variance))

    def predict_single(self, x):
        """Single instance"""
        posteriors = []

        # Calculate P(y=c|x) for each class
        for idx, c in enumerate(self.classes):
            # Start with prior P(y=c)
            posterior = self.priors[idx]

            # Multiply by P(xi|y=c) for each feature
            for i in range(len(x)):
                likelihood = self.gaussian(
                    x[i],
                    self.means[idx, i],
                    self.variances[idx, i]
                )
                posterior *= likelihood

            posteriors.append(posterior)

        return self.classes[np.argmax(posteriors)]

    def predict(self, X):
        """Multiple instances"""
        return np.array([self.predict_single(x) for x in X])

Bernoulli Classifier:

In [42]:
class BernoulliNaiveBayes:
    def __init__(self):
        self.class_priors = None
        self.means = None  # 2x57 matrix of feature means per class
        self.probs = None  # 2x57 matrix Z[k,i] = P(feature i < mean | class k)
        self.classes = None

    def fit(self, X, y):
        """
        Train BNB by computing class-specific means and Bernoulli probabilities
        """
        self.classes = np.unique(y)
        n_samples, n_features = X.shape

        # Initialize matrices
        self.means = np.zeros((len(self.classes), n_features))
        self.probs = np.zeros((len(self.classes), n_features))
        self.class_priors = np.zeros(len(self.classes))

        # Compute statistics for each class
        for idx, c in enumerate(self.classes):
            X_c = X[y == c]

            # Class prior P(y=c)
            self.class_priors[idx] = len(X_c) / n_samples

            # Mean for each feature
            self.means[idx, :] = np.mean(X_c, axis=0)

            # Z[k,i] = probability of being below mean for feature i in class k
            for i in range(n_features):
                # Count how many samples are below the mean
                below_mean = X_c[:, i] <= self.means[idx, i]
                self.probs[idx, i] = np.mean(below_mean)

    def predict_single(self, x):
        """
        Predict single instance using Bernoulli distribution
        """
        posteriors = []

        for idx, c in enumerate(self.classes):
            # Start with prior P(y=c)
            posterior = self.class_priors[idx]

            # Multiply by P(xi|y=c) for each feature
            for i in range(len(x)):
                # Bernoulli probability based on whether xi > mean[k,i]
                if x[i] > self.means[idx, i]:
                    # P(xi > mean | y=c) = 1 - Z[k,i]
                    prob = 1 - self.probs[idx, i]
                else:
                    # P(xi <= mean | y=c) = Z[k,i]
                    prob = self.probs[idx, i]

                # Avoid zero probability
                prob = max(prob, 1e-10)
                posterior *= prob

            posteriors.append(posterior)

        return self.classes[np.argmax(posteriors)]

    def predict(self, X):
        """
        Predict multiple instances
        """
        return np.array([self.predict_single(x) for x in X])

Non-Parametric Classifier: Histogram 4 bin

In [43]:
class HistogramNaiveBayes:
    def __init__(self, n_bins=4, alpha=0.01, use_log=True):
        """
        Final implementation with log1p preprocessing
        - Strictly 4 bins as required by TA
        - Log1p transformation handles zeros and skewed distributions
        """
        self.n_bins = n_bins
        self.alpha = alpha
        self.use_log = use_log
        self.class_priors = None
        self.bin_edges = {}
        self.bin_probs = {}
        self.classes = None

    def fit(self, X, y):
        """
        Train 4-bin Histogram NB with log1p preprocessing
        """
        # Apply log1p transformation if enabled
        if self.use_log:
            X_processed = np.log1p(X)
        else:
            X_processed = X

        self.classes = np.unique(y)
        n_samples, n_features = X_processed.shape
        self.class_priors = np.zeros(len(self.classes))

        for idx, c in enumerate(self.classes):
            X_c = X_processed[y == c]
            self.class_priors[idx] = len(X_c) / n_samples

            self.bin_edges[c] = {}
            self.bin_probs[c] = {}

            for feature_idx in range(n_features):
                feature_values = X_c[:, feature_idx]

                # Create 4-bin histogram with TA-specified edges:
                # min, Q1, mean, Q3, max
                min_val = np.min(feature_values)
                q1 = np.percentile(feature_values, 25)
                mean_val = np.mean(feature_values)
                q3 = np.percentile(feature_values, 75)
                max_val = np.max(feature_values)

                edges = np.array([min_val, q1, mean_val, q3, max_val])
                edges = np.sort(edges)
                edges = np.unique(edges)

                # Ensure we have 5 edges for 4 bins
                if len(edges) < 5:
                    edges = np.linspace(min_val, max_val + 1e-10, 5)

                edges[-1] += 1e-10  # Ensure max is included

                # Calculate histogram
                hist, _ = np.histogram(feature_values, bins=edges)

                # Apply Laplace smoothing
                hist_smooth = hist + self.alpha

                # Normalize to probabilities
                hist_prob = hist_smooth / np.sum(hist_smooth)

                self.bin_edges[c][feature_idx] = edges
                self.bin_probs[c][feature_idx] = hist_prob

    def predict_single(self, x):
        """
        Predict single instance
        """
        log_posteriors = []

        for idx, c in enumerate(self.classes):
            log_posterior = np.log(self.class_priors[idx] + 1e-10)

            for i in range(len(x)):
                edges = self.bin_edges[c][i]
                probs = self.bin_probs[c][i]

                bin_idx = np.searchsorted(edges[:-1], x[i], side='right') - 1
                bin_idx = np.clip(bin_idx, 0, len(probs) - 1)

                log_posterior += np.log(probs[bin_idx] + 1e-10)

            log_posteriors.append(log_posterior)

        return self.classes[np.argmax(log_posteriors)]

    def predict(self, X):
        """
        Predict with same preprocessing
        """
        # Apply same preprocessing
        if self.use_log:
            X_processed = np.log1p(X)
        else:
            X_processed = X

        return np.array([self.predict_single(x) for x in X_processed])

K-FOLD Cross Val:

In [44]:
def evaluate_model_kfold(model_class, X, y, n_folds=5):
    """
    Evaluate model using k-fold cross validation
    """
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    train_accuracies = []
    test_accuracies = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
        # Split data
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Normalize data
        scaler = StandardScaler()
        X_train_norm = scaler.fit_transform(X_train)
        X_test_norm = scaler.transform(X_test)

        # Train model
        model = model_class()
        model.fit(X_train_norm, y_train)

        # Evaluate
        y_pred_train = model.predict(X_train_norm)
        y_pred_test = model.predict(X_test_norm)

        train_acc = accuracy_score(y_train, y_pred_train)
        test_acc = accuracy_score(y_test, y_pred_test)

        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)

        print(f"Fold {fold}: Train Acc = {train_acc:.4f}, Test Acc = {test_acc:.4f}")

    avg_train = np.mean(train_accuracies)
    avg_test = np.mean(test_accuracies)
    std_train = np.std(train_accuracies)
    std_test = np.std(test_accuracies)

    return {
        'train_accs': train_accuracies,
        'test_accs': test_accuracies,
        'avg_train': avg_train,
        'avg_test': avg_test,
        'std_train': std_train,
        'std_test': std_test
    }

# Evaluate all three models
print("="*50)
print("Gaussian Naive Bayes Results:")
print("="*50)
gnb_results = evaluate_model_kfold(GaussianNaiveBayes, X, y, n_folds=5)
print(f"\nAverage Train Accuracy: {gnb_results['avg_train']:.4f} ± {gnb_results['std_train']:.4f}")
print(f"Average Test Accuracy: {gnb_results['avg_test']:.4f} ± {gnb_results['std_test']:.4f}")

print("\n" + "="*50)
print("Bernoulli Naive Bayes Results:")
print("="*50)
bnb_results = evaluate_model_kfold(BernoulliNaiveBayes, X, y, n_folds=5)
print(f"\nAverage Train Accuracy: {bnb_results['avg_train']:.4f} ± {bnb_results['std_train']:.4f}")
print(f"Average Test Accuracy: {bnb_results['avg_test']:.4f} ± {bnb_results['std_test']:.4f}")

print("\n" + "="*50)
print("Histogram Naive Bayes Results:")
print("="*50)
hnb_results = evaluate_model_kfold(HistogramNaiveBayes, X, y, n_folds=5)
print(f"\nAverage Train Accuracy: {hnb_results['avg_train']:.4f} ± {hnb_results['std_train']:.4f}")
print(f"Average Test Accuracy: {hnb_results['avg_test']:.4f} ± {hnb_results['std_test']:.4f}")

Gaussian Naive Bayes Results:
Fold 1: Train Acc = 0.8204, Test Acc = 0.8208
Fold 2: Train Acc = 0.8047, Test Acc = 0.8033
Fold 3: Train Acc = 0.8245, Test Acc = 0.7946
Fold 4: Train Acc = 0.8250, Test Acc = 0.8228
Fold 5: Train Acc = 0.8215, Test Acc = 0.8337

Average Train Accuracy: 0.8192 ± 0.0075
Average Test Accuracy: 0.8150 ± 0.0141

Bernoulli Naive Bayes Results:
Fold 1: Train Acc = 0.8938, Test Acc = 0.9001
Fold 2: Train Acc = 0.8894, Test Acc = 0.8848
Fold 3: Train Acc = 0.8970, Test Acc = 0.8880
Fold 4: Train Acc = 0.8998, Test Acc = 0.8783
Fold 5: Train Acc = 0.8946, Test Acc = 0.9065

Average Train Accuracy: 0.8949 ± 0.0034
Average Test Accuracy: 0.8915 ± 0.0103

Histogram Naive Bayes Results:
Fold 1: Train Acc = 0.8821, Test Acc = 0.8599
Fold 2: Train Acc = 0.8824, Test Acc = 0.8967
Fold 3: Train Acc = 0.8911, Test Acc = 0.8935
Fold 4: Train Acc = 0.8911, Test Acc = 0.8913
Fold 5: Train Acc = 0.8794, Test Acc = 0.8815

Average Train Accuracy: 0.8852 ± 0.0049
Average Test Ac

In [45]:
# Test the fixed Histogram NB
print("="*50)
print("Fixed Histogram Naive Bayes Results:")
print("="*50)

# Use the alternative implementation
hnb_fixed_results = evaluate_model_kfold(HistogramNaiveBayes, X, y, n_folds=5)

print(f"\nAverage Train Accuracy: {hnb_fixed_results['avg_train']:.4f} ± {hnb_fixed_results['std_train']:.4f}")
print(f"Average Test Accuracy: {hnb_fixed_results['avg_test']:.4f} ± {hnb_fixed_results['std_test']:.4f}")

# Create updated summary table
print("\n" + "="*60)
print("FINAL SUMMARY TABLE - All Models (5-Fold CV)")
print("="*60)
print(f"{'Model':<20} {'Train Acc':<15} {'Test Acc':<15}")
print("-"*60)
print(f"{'Gaussian NB':<20} {gnb_results['avg_train']:.4f} ± {gnb_results['std_train']:.4f}  "
      f"{gnb_results['avg_test']:.4f} ± {gnb_results['std_test']:.4f}")
print(f"{'Bernoulli NB':<20} {bnb_results['avg_train']:.4f} ± {bnb_results['std_train']:.4f}  "
      f"{bnb_results['avg_test']:.4f} ± {bnb_results['std_test']:.4f}")
print(f"{'Histogram NB (Fixed)':<20} {hnb_fixed_results['avg_train']:.4f} ± {hnb_fixed_results['std_train']:.4f}  "
      f"{hnb_fixed_results['avg_test']:.4f} ± {hnb_fixed_results['std_test']:.4f}")

Fixed Histogram Naive Bayes Results:
Fold 1: Train Acc = 0.8821, Test Acc = 0.8599
Fold 2: Train Acc = 0.8824, Test Acc = 0.8967
Fold 3: Train Acc = 0.8911, Test Acc = 0.8935
Fold 4: Train Acc = 0.8911, Test Acc = 0.8913
Fold 5: Train Acc = 0.8794, Test Acc = 0.8815

Average Train Accuracy: 0.8852 ± 0.0049
Average Test Accuracy: 0.8846 ± 0.0133

FINAL SUMMARY TABLE - All Models (5-Fold CV)
Model                Train Acc       Test Acc       
------------------------------------------------------------
Gaussian NB          0.8192 ± 0.0075  0.8150 ± 0.0141
Bernoulli NB         0.8949 ± 0.0034  0.8915 ± 0.0103
Histogram NB (Fixed) 0.8852 ± 0.0049  0.8846 ± 0.0133
