#### Model

In [1]:
import numpy as np
import pandas as pd

class NaiveBayes(object):
    """ Gaussian Naive Bayes model

    Every feature is modelled, independently for each class, as a univariate
    Gaussian whose mean and variance are estimated from the training data.

    @attrs:
        n_classes:     the number of classes (labels are used as row indices,
                       i.e. the integers 0 .. n_classes - 1)
        var_smoothing: absolute constant added to every estimated variance so
                       that a zero-variance feature never causes a division
                       by zero
        label_priors:  a 1D NumPy array of the priors distribution
        means:         a 2D (n_classes x n_attributes) NumPy array,
                       mean per class and feature
        vars:          a 2D (n_classes x n_attributes) NumPy array,
                       variance per class and feature
    """

    def __init__(self, n_classes, var_smoothing=1e-9):
        """ Initializes a NaiveBayes model with n_classes.

        @params:
            n_classes:     the number of classes
            var_smoothing: value added to each variance estimate in train()
        """
        self.n_classes = n_classes
        self.var_smoothing = var_smoothing
        # Parameters stay None until train() has been called.
        self.means = None
        self.vars = None
        self.label_priors = None

    def train(self, X_train, y_train):
        """ Trains the model, using maximum likelihood estimation.
        For Gaussian NB: estimate class priors, means, and variances.

        @params:
            X_train: a 2D (n_examples x n_attributes) numpy array
            y_train: a 1D (n_examples) numpy array of non-negative integer labels
        @return:
            a tuple consisting of:
                1) a 2D (n_classes x n_attributes) numpy array of means
                2) a 2D (n_classes x n_attributes) numpy array of variances
                3) a 1D numpy array of the priors distribution
        """
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train)
        n_samples, n_features = X_train.shape

        # ---- 1. Estimate class priors ----
        counts = np.bincount(y_train, minlength=self.n_classes)
        self.label_priors = counts / n_samples

        # ---- 2. Estimate means and variances for each class and feature ----
        # Defaults (mean 0, variance 1) are finite placeholders that remain in
        # place for a class without training rows; such a class has prior 0
        # and therefore can never win in predict().
        self.means = np.zeros((self.n_classes, n_features))
        self.vars = np.ones((self.n_classes, n_features))

        for c in range(self.n_classes):
            X_c = X_train[y_train == c]
            if X_c.shape[0] == 0:
                # np.mean / np.var of an empty slice would produce NaN.
                continue
            self.means[c, :] = np.mean(X_c, axis=0)
            # Population variance (N in the denominator) plus the smoothing term.
            self.vars[c, :] = np.var(X_c, axis=0) + self.var_smoothing

        return self.means, self.vars, self.label_priors

    def predict(self, inputs):
        """ Outputs a predicted label for each input in inputs.

        The class scores are kept in log space from start to finish. The
        evidence term is identical for every class, so the arg max of the
        log joint probability equals the arg max of the posterior; going back
        through exp() is unnecessary and underflows to 0 for all classes as
        soon as a point lies far from every class mean relative to the
        (possibly tiny) variances.

        @params:
            inputs: a 2D (n_examples x n_attributes) NumPy array containing inputs
        @return:
            a 1D numpy array of predictions
        """
        inputs = np.asarray(inputs, dtype=float)
        n_samples = inputs.shape[0]

        # A class with prior 0 gets log prior -inf on purpose; silence the
        # divide-by-zero warning that np.log(0) would otherwise emit.
        with np.errstate(divide="ignore"):
            log_priors = np.log(self.label_priors)

        log_probs = np.empty((n_samples, self.n_classes))
        for c in range(self.n_classes):
            mean_c = self.means[c]
            var_c = self.vars[c]
            # Sum of per-feature Gaussian log densities, for all samples at once.
            log_likelihood = -0.5 * np.sum(
                np.log(2 * np.pi * var_c) + ((inputs - mean_c) ** 2) / var_c,
                axis=1,
            )
            log_probs[:, c] = log_priors[c] + log_likelihood

        return np.argmax(log_probs, axis=1)

    def accuracy(self, X_test, y_test):
        """ Outputs the accuracy of the trained model on a given dataset (data).

        @params:
            X_test: a 2D numpy array of examples
            y_test: a 1D numpy array of labels
        @return:
            a float number indicating accuracy (between 0 and 1)
        """
        predictions = self.predict(X_test)
        # ravel() keeps the comparison element-wise even for a column vector.
        y_true = np.asarray(y_test).ravel()
        return float(np.mean(predictions == y_true))
        


#### Check Model

In [2]:
import numpy as np
import pytest

"""Two-class simple sanity check with held-out test set."""
y_train = np.array([0, 0, 1, 1])
X_train = np.array([
    [0.0, 1.0],  # label 0
    [0.0, 3.0],  # label 0
    [2.0, 1.0],  # label 1
    [2.0, 3.0],  # label 1
])

nb = NaiveBayes(n_classes=2)
means, vars_, priors = nb.train(X_train, y_train)

# Each class owns half of the training rows, so both priors are 0.5.
assert np.allclose(priors, [0.5, 0.5])

# Per-class feature means:
#   label 0 rows [[0, 1], [0, 3]] average to [0, 2]
#   label 1 rows [[2, 1], [2, 3]] average to [2, 2]
expected_means = np.array([[0.0, 2.0], [2.0, 2.0]])
assert np.allclose(means, expected_means)

# Per-class population variances (denominator N):
#   feature 0 is constant inside each class ([0, 0] and [2, 2]) -> 0
#   feature 1 is [1, 3] in both classes -> ((1-2)**2 + (3-2)**2) / 2 = 1
# The implementation adds 1e-9 to every variance for numerical stability.
expected_vars = 1e-9 + np.array([[0.0, 1.0], [0.0, 1.0]])
assert np.allclose(vars_, expected_vars)

# Points that were not part of the training data.
y_test = np.array([0, 1])
X_test = np.array([
    [0.0, 1.5],  # label 0
    [2.0, 2.5],  # label 1
])

y_pred = nb.predict(X_test)
assert np.array_equal(y_pred, y_test)

acc = nb.accuracy(X_test, y_test)
assert acc == 1.0
print("Success!")


Success!


In [8]:
"""Edge case: zero-variance feature and imbalanced classes, with held-out test set."""
# Feature 0 is always 1.0 (zero variance), feature 1 separates classes.
X_train = np.array([
    [1.0, 0.0],  # class 0
    [1.0, 0.2],  # class 0
    [1.0, 0.1],  # class 0
    [1.0, 1.0],  # class 1
])
y_train = np.array([0, 0, 0, 1])

nb = NaiveBayes(n_classes=2)
means, vars_, priors = nb.train(X_train, y_train)

# Class priors should reflect the 3:1 imbalance.
assert np.allclose(priors, np.array([0.75, 0.25]))

# Zero-variance feature should have a small positive variance due to 1e-9.
assert np.all(vars_[:, 0] > 0)

# Held-out test set (use points consistent with the zero-variance feature).
# Both points are expected to be labelled 0. Class 1 was fitted on a single
# sample, so its variance on feature 1 is only the 1e-9 floor and its Gaussian
# is a spike at 1.0: a point at 0.9 is far more likely under the wider class-0
# Gaussian, even though it sits closer to the class-1 sample.
X_test = np.array([
    [1.0, 0.05],  # class 0
    [1.0, 0.9],   # nearest to the lone class-1 sample, but still class 0 under the fitted model
])
y_test = np.array([0, 0])

y_pred = nb.predict(X_test)

assert np.array_equal(y_pred, y_test)

acc = nb.accuracy(X_test, y_test)
assert acc == 1.0
print("Success!")


Success!


In [9]:
"""Multi-class (>2) sanity check with held-out test set."""
y_train = np.array([0, 0, 1, 1, 2, 2])
X_train = np.array([
    [0.0, 0.0],  # label 0
    [0.0, 1.0],  # label 0
    [1.0, 0.0],  # label 1
    [1.0, 1.0],  # label 1
    [2.0, 0.0],  # label 2
    [2.0, 1.0],  # label 2
])

nb = NaiveBayes(n_classes=3)
means, vars_, priors = nb.train(X_train, y_train)

# Three classes with two rows each -> every prior is 1/3.
expected_priors = np.full(3, 1 / 3)
assert np.allclose(priors, expected_priors)

# Per-class feature means:
#   label 0 rows [[0, 0], [0, 1]] average to [0, 0.5]
#   label 1 rows [[1, 0], [1, 1]] average to [1, 0.5]
#   label 2 rows [[2, 0], [2, 1]] average to [2, 0.5]
expected_means = np.array([[0.0, 0.5], [1.0, 0.5], [2.0, 0.5]])
assert np.allclose(means, expected_means)

# Per-class population variances:
#   feature 0 never changes inside a class -> 0 (1e-9 after the stability term)
#   feature 1 is [0, 1] in every class -> mean 0.5, variance 0.25
expected_vars = 1e-9 + np.array([[0.0, 0.25], [0.0, 0.25], [0.0, 0.25]])
assert np.allclose(vars_, expected_vars)

# Points that were not part of the training data.
y_test = np.array([0, 1, 2])
X_test = np.array([
    [0.0, 0.2],  # label 0
    [1.0, 0.8],  # label 1
    [2.0, 0.2],  # label 2
])

y_pred = nb.predict(X_test)
assert np.array_equal(y_pred, y_test)

acc = nb.accuracy(X_test, y_test)
assert acc == 1.0
print("Success!")


Success!
