In [1]:
import numpy as np

class MixedNaiveBayes:
    """
    Naive Bayes classifier for rows that mix three feature types:

    * ``"gaussian"``    - continuous values modelled by a normal distribution
    * ``"bernoulli"``   - binary 0/1 values with Laplace smoothing
    * ``"categorical"`` - discrete string or integer values with Laplace
      smoothing

    All likelihoods are accumulated in log space to avoid numerical underflow.
    ``predict_proba`` converts the log scores back into probabilities and
    normalizes them so that each row sums to one.

    Feature likelihoods
    -------------------
    For gaussian features:
        log_pdf = -0.5 * ( log(2 * pi * variance) + ((x - mean)^2) / variance )

    For bernoulli features:
        p1 = (count_of_1_in_class + alpha) / (n_class + 2 * alpha)
        log_likelihood = x * log(p1) + (1 - x) * log(1 - p1)

    For categorical features:
        p(value) = (count_of_value_in_class + alpha) / (n_class + K * alpha)
        log_likelihood = log( p(value) )
        Here K is the number of distinct values of that feature in the
        training data.

    Parameters
    ----------
    eps : float
        Small constant added to Gaussian variances for stability.
    laplace_alpha : float
        Laplace smoothing parameter used for Bernoulli and categorical features.

    Notes
    -----
    * NumPy stores an array that mixes numbers and strings as an array of
      strings. Gaussian and Bernoulli columns are therefore cast to float
      when they are read, so such arrays can be passed directly.
    * Categorical test values must have appeared in the training data;
      otherwise prediction raises ``KeyError``. Any preprocessing or encoding
      needed to align categories must be done before calling ``fit``.
    * Entries of ``feature_types`` other than the three names above are
      skipped: the corresponding column contributes nothing to the score.

    Example
    -------
    >>> X = np.array([[5.1, 1, "red"],
    ...               [4.9, 0, "blue"],
    ...               [5.0, 1, "red"]])
    >>> y = np.array(["A", "B", "A"])
    >>> feature_types = ["gaussian", "bernoulli", "categorical"]
    >>> model = MixedNaiveBayes().fit(X, y, feature_types)
    >>> model.predict(X)
    array(['A', 'B', 'A'], dtype='<U1')
    """

    def __init__(self, eps=1e-8, laplace_alpha=1.0):
        self.eps = eps
        self.alpha = laplace_alpha

        self.feature_types = None
        self.classes_ = None

        self._reset_parameters()

    def _reset_parameters(self):
        """Drop every learned parameter so that a fit starts from scratch."""
        # class label -> log prior
        self.log_priors_ = {}

        # class label -> {feature index -> parameter}
        self.gaussian_means_ = {}
        self.gaussian_vars_ = {}
        self.bernoulli_probs_ = {}
        # class label -> {feature index -> {value -> probability}}
        self.categorical_probs_ = {}
        # feature index -> array of distinct training values
        self.categorical_values_ = {}

    # Likelihood helper functions

    def _gaussian_log_pdf(self, x, mean, var):
        """
        Computes the log density of a Gaussian distribution for a single feature.
        This function is for continuous values.
        """
        return -0.5 * (np.log(2.0 * np.pi * var) + ((x - mean) ** 2) / var)

    def _bernoulli_log_likelihood(self, x, p):
        """
        Computes the log likelihood of a Bernoulli feature.
        x must be 0 or 1 and p is the probability of seeing a 1.
        """
        return x * np.log(p) + (1.0 - x) * np.log(1.0 - p)

    def _categorical_log_likelihood(self, value, probs_dict):
        """
        Computes the log likelihood of a categorical feature.

        Raises
        ------
        KeyError
            If ``value`` is not a key of ``probs_dict``, i.e. it was not seen
            for this feature during training.
        """
        try:
            prob = probs_dict[value]
        except KeyError:
            # Same exception type as a plain lookup, but with a message that
            # tells the caller what went wrong.
            raise KeyError(
                f"categorical value {value!r} was not seen during fit"
            ) from None
        return np.log(prob)

    # Aggregator
    def _compute_log_likelihood(self, x_row, class_label):
        """
        Computes the total log likelihood of a sample under a specific class.
        This is the sum of the log likelihoods of each feature.
        """
        log_l = 0.0

        for j, ftype in enumerate(self.feature_types):
            value = x_row[j]

            if ftype == "gaussian":
                mean = self.gaussian_means_[class_label][j]
                var = self.gaussian_vars_[class_label][j]
                # float(): the value is text when the row comes from a
                # mixed-type (string) array.
                log_l += self._gaussian_log_pdf(float(value), mean, var)

            elif ftype == "bernoulli":
                p = self.bernoulli_probs_[class_label][j]
                log_l += self._bernoulli_log_likelihood(float(value), p)

            elif ftype == "categorical":
                probs_dict = self.categorical_probs_[class_label][j]
                log_l += self._categorical_log_likelihood(value, probs_dict)

        return log_l

    # Fit

    def fit(self, X, y, feature_types):
        """
        Learns all model parameters. This includes class priors as well as the
        parameters needed for each feature type. Parameters learned by an
        earlier call are discarded first.

        Inputs
        ------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,)
            Class labels.
        feature_types : list of strings
            One entry per feature indicating whether it is gaussian,
            bernoulli or categorical.

        Output
        ------
        Returns the trained model instance.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, _ = X.shape

        # Start clean so a refit cannot keep classes or features that are
        # absent from the new data.
        self._reset_parameters()
        self.feature_types = feature_types
        self.classes_ = np.unique(y)

        # The category set is shared by all classes so that every class
        # assigns a smoothed probability to every training value.
        for j, ftype in enumerate(feature_types):
            if ftype == "categorical":
                self.categorical_values_[j] = np.unique(X[:, j])

        for c in self.classes_:
            Xc = X[y == c]
            n_c = len(Xc)

            self.log_priors_[c] = np.log(n_c / n_samples)
            self.gaussian_means_[c] = {}
            self.gaussian_vars_[c] = {}
            self.bernoulli_probs_[c] = {}
            self.categorical_probs_[c] = {}

            for j, ftype in enumerate(feature_types):

                if ftype == "gaussian":
                    # Cast because a mixed-type X holds numbers as text.
                    column = Xc[:, j].astype(float)
                    self.gaussian_means_[c][j] = column.mean()
                    self.gaussian_vars_[c][j] = column.var() + self.eps

                elif ftype == "bernoulli":
                    column = Xc[:, j].astype(float)
                    count1 = np.sum(column == 1)
                    # Two possible outcomes, hence 2 * alpha in the denominator.
                    self.bernoulli_probs_[c][j] = (
                        (count1 + self.alpha) / (n_c + 2 * self.alpha)
                    )

                elif ftype == "categorical":
                    values = self.categorical_values_[j]
                    K = len(values)
                    column = Xc[:, j]
                    self.categorical_probs_[c][j] = {
                        v: (np.sum(column == v) + self.alpha)
                        / (n_c + K * self.alpha)
                        for v in values
                    }

        return self

    # Prediction

    def predict_log_proba(self, X):
        """
        Computes unnormalized log posterior scores for each class.
        These are the raw log scores (log prior + log likelihood) before
        normalization; columns follow the order of ``classes_``.
        """
        X = np.asarray(X)
        n_samples = X.shape[0]
        log_probs = np.zeros((n_samples, len(self.classes_)))

        for i in range(n_samples):
            x_row = X[i]
            for k, c in enumerate(self.classes_):
                log_l = self._compute_log_likelihood(x_row, c)
                log_probs[i, k] = self.log_priors_[c] + log_l

        return log_probs

    def predict_proba(self, X):
        """
        Converts log posterior scores into normalized probabilities.
        Uses the log-sum-exp trick for numerical stability.

        Each row of the output represents the probability distribution
        over all classes for that sample.
        """
        log_probs = self.predict_log_proba(X)

        # Subtracting the row maximum keeps exp() from underflowing to zero
        # for every class at once.
        max_log = np.max(log_probs, axis=1, keepdims=True)
        shifted = log_probs - max_log

        probs = np.exp(shifted)
        probs = probs / probs.sum(axis=1, keepdims=True)

        return probs

    def predict(self, X):
        """
        Returns the predicted class label for each sample.
        """
        probs = self.predict_proba(X)
        class_indices = np.argmax(probs, axis=1)
        return self.classes_[class_indices]
