#Additional exercises: Build a Multinomial naive Bayes model and binarized naive Bayes from scratch.

In [6]:
from collections import defaultdict
import math

corpus = [["good", "good", "good", "great", "great", "great"],
          ["poor", "great", "great"],
          ["good", "poor", "poor", "poor"],
          ["good", "poor", "poor", "poor", "poor", "poor", "great", "great"],
          ["poor", "poor"]]

labels = ["pos", "pos", "neg", "neg", "neg"]

test = ["A", "good", ",", "good", "plot", "and", "great", "characters", ",", "but", "poor", "acting"]

In [4]:
class MultinomialNaiveBayes:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.vocab_size = 0
        self.classMapping = {}
        self.classes = []
        self.indexMapping = {}
        self.wordCounts = []
        self.wordProbs = []
        self.classCounts = []
        self.class_priors = []

    def fit(self, X, y):
      """
      Fit the model according to the given training data.
      Parameters
      ----------
      X : array-like of shape (n_samples, n_features)
          Training vectors, where n_samples is the number of samples and
          n_features is the number of features.
      y : array-like of shape (n_samples,)
          Target values.
      Returns
      -------
      self : MultinomialNaiveBayes
          Fitted estimator.
      """
      n_samples = len(X)
      token_index = 0
      for i in range(n_samples):
        #Adding new classes
        if y[i] not in self.classes:
          self.classMapping[y[i]] = len(self.classes)
          self.classes.append(y[i])
          self.wordCounts.append(defaultdict(int))
          self.wordProbs.append(defaultdict(int))
          self.classCounts.append(0)
          self.class_priors.append(0)

        #Get the class index and track the number of documents per class
        class_index = self.classMapping[y[i]]
        self.classCounts[class_index] += 1
        sentence = X[i]

        for token in sentence:

          #Adding new token index
          if token not in self.indexMapping:
            self.indexMapping[token] = len(self.indexMapping)
            self.vocab_size += 1

          #Get the frequencies of each token
          token_index = self.indexMapping[token]
          self.wordCounts[class_index][token_index] += 1

      for class_index in range(len(self.classes)):
        total_count = sum(self.wordCounts[class_index].values())
        self.class_priors[class_index] = self.classCounts[class_index] / n_samples

        #Calculate the probabilities of each token in each class
        for token_index in range(self.vocab_size):
          word_count = self.wordCounts[class_index][token_index]
          word_prob = (word_count + self.alpha) / (total_count + self.alpha * self.vocab_size)
          self.wordProbs[class_index][token_index] = word_prob

    def predict(self, sentence, return_probs):
      """
      Perform classification on an array of test vectors X.
      Parameters
      ----------
      X : array-like of shape (n_samples, n_features)
      return_probs : return the raw probabilities of all classes (boolean)
      Returns
      -------
      best_class: class that has highest probs
      class_probs : array, shape = [n_samples]
          Predicted target values for X.
      """
      class_probs = [0] * len(self.classes)

      for class_index in range(len(self.classes)):
        arbitrary_class_prob = self.alpha / (self.classCounts[class_index] + self.alpha * self.vocab_size)
        log_arbitrary_class_prob = math.log(arbitrary_class_prob)
        log_sentence_prob = 0

        for token in sentence:
          if token in self.indexMapping:
            token_index = self.indexMapping[token]
            word_prob = self.wordProbs[class_index][token_index]
            log_word_prob = math.log(word_prob)
            log_sentence_prob += log_word_prob

          else:
            log_sentence_prob += log_arbitrary_class_prob

        log_class_prob = math.log(self.class_priors[class_index])
        class_probs[class_index] = math.exp(log_sentence_prob + log_class_prob)

      if return_probs:
        return class_probs
      else:
        best_class = self.classes[class_probs.index(max(class_probs))]
        return best_class

In [8]:
class BinaryNaiveBayes:
  def __init__(self, alpha = 1.0):
    self.alpha = alpha
    self.vocab_size = 0
    self.classMapping = {}
    self.classes = []
    self.indexMapping = {}
    self.wordCounts = []
    self.wordProbs = []
    self.classCounts = []
    self.class_priors = []

  def fit(self, X, y):
      """
      Fit the model according to the given training data.
      Parameters
      ----------
      X : array-like of shape (n_samples, n_features)
          Training vectors, where n_samples is the number of samples and
          n_features is the number of features.
      y : array-like of shape (n_samples,)
          Target values.
      Returns
      -------
      self : MultinomialNaiveBayes
          Fitted estimator.
      """
      n_samples = len(X)
      token_index = 0
      for i in range(n_samples):

        if y[i] not in self.classes:
          self.classMapping[y[i]] = len(self.classes)
          self.classes.append(y[i])
          self.wordCounts.append(defaultdict(int))
          self.wordProbs.append(defaultdict(int))
          self.classCounts.append(0)
          self.class_priors.append(0)

        class_index = self.classMapping[y[i]]
        self.classCounts[class_index] += 1
        sentence = set(X[i])

        for token in sentence:

          if token not in self.indexMapping:
            self.indexMapping[token] = len(self.indexMapping)
            self.vocab_size += 1

          token_index = self.indexMapping[token]
          self.wordCounts[class_index][token_index] += 1

      for class_index in range(len(self.classes)):
        total_count = self.classCounts[class_index]
        self.class_priors[class_index] = self.classCounts[class_index] / n_samples

        for token_index in range(self.vocab_size):
          word_count = self.wordCounts[class_index][token_index]
          word_prob = (word_count + self.alpha) / (total_count + self.alpha * self.vocab_size)
          self.wordProbs[class_index][token_index] = word_prob

  def predict(self, sentence, return_probs):
      """
      Perform classification on an array of test vectors X.
      Parameters
      ----------
      X : array-like of shape (n_samples, n_features)
      return_probs : return the raw probabilities of all classes (boolean)
      Returns
      -------
      best_class: class that has highest probs
      class_probs : array, shape = [n_samples]
          Predicted target values for X.
      """
      class_probs = [0] * len(self.classes)
      sentence = set(sentence)

      for class_index in range(len(self.classes)):
        arbitrary_class_prob = self.alpha / (self.classCounts[class_index] + self.alpha * self.vocab_size)
        log_arbitrary_class_prob = math.log(arbitrary_class_prob)
        log_sentence_prob = 0

        for token in sentence:
          if token in self.indexMapping:
            token_index = self.indexMapping[token]
            word_prob = self.wordProbs[class_index][token_index]
            log_word_prob = math.log(word_prob)
            log_sentence_prob += log_word_prob

          else:
            log_sentence_prob += log_arbitrary_class_prob

        log_class_prob = math.log(self.class_priors[class_index])
        class_probs[class_index] = math.exp(log_sentence_prob + log_class_prob)

      if return_probs:
        return class_probs
      else:
        best_class = self.classes[class_probs.index(max(class_probs))]
        return best_class

#4.3 Use both naive Bayes models to assign a class (pos or neg) for this sentence:
      A good, good plot and great characters, but poor acting.
#Do the two model agree or disagree?

In [9]:
MNB_model = MultinomialNaiveBayes()
MNB_model.fit(corpus, labels)

BNB = BinaryNaiveBayes()
BNB.fit(corpus, labels)

print(MNB_model.predict(test, return_probs=True))
print(BNB.predict(test, return_probs=True))

print("Multinomial Naive Bayes prediction: ", MNB_model.predict(test, return_probs=False))
print("Binary Naive Bayes prediction: ", BNB.predict(test, return_probs=False))

[9.481481481481494e-09, 1.2702874607864575e-09]
[4.915200000000022e-07, 2.381496723060516e-07]
Multinomial Naive Bayes prediction:  pos
Binary Naive Bayes prediction:  pos
