✨ Add classifier class with Complement Naive Bayes

yoshoku · Mar 20, 2020 · 2864055 · 2864055
1 parent fe4bc96
commit 2864055
Show file tree

Hide file tree

Showing 3 changed files with 158 additions and 0 deletions.
diff --git a/lib/rumale.rb b/lib/rumale.rb
@@ -49,6 +49,7 @@
 require 'rumale/nearest_neighbors/k_neighbors_regressor'
 require 'rumale/naive_bayes/base_naive_bayes'
 require 'rumale/naive_bayes/bernoulli_nb'
+require 'rumale/naive_bayes/complement_nb'
 require 'rumale/naive_bayes/gaussian_nb'
 require 'rumale/naive_bayes/multinomial_nb'
 require 'rumale/tree/node'

diff --git a/lib/rumale/naive_bayes/complement_nb.rb b/lib/rumale/naive_bayes/complement_nb.rb
@@ -0,0 +1,85 @@
+# frozen_string_literal: true
+
+require 'rumale/naive_bayes/base_naive_bayes'
+
+module Rumale
+  module NaiveBayes
+    # ComplementNB is a class that implements Complement Naive Bayes classifier.
+    #
+    # @example
+    #   estimator = Rumale::NaiveBayes::ComplementNB.new(smoothing_param: 1.0)
+    #   estimator.fit(training_samples, training_labels)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - Rennie, J. D. M., Shih, L., Teevan, J., and Karger, D. R., "Tackling the Poor Assumptions of Naive Bayes Text Classifiers," ICML' 03, pp. 616--623, 2013.
+    class ComplementNB < BaseNaiveBayes
+      # Return the class labels.
+      # @return [Numo::Int32] (size: n_classes)
+      attr_reader :classes
+
+      # Return the prior probabilities of the classes.
+      # @return [Numo::DFloat] (shape: [n_classes])
+      attr_reader :class_priors
+
+      # Return the conditional probabilities for features of each class.
+      # @return [Numo::DFloat] (shape: [n_classes, n_features])
+      attr_reader :feature_probs
+
+      # Create a new classifier with Complement Naive Bayes.
+      #
+      # @param smoothing_param [Float] The smoothing parameter.
+      # @param norm [Boolean] The flag indicating whether to normlize the weight vectors.
+      def initialize(smoothing_param: 1.0, norm: false)
+        check_params_numeric(smoothing_param: smoothing_param)
+        check_params_positive(smoothing_param: smoothing_param)
+        check_params_boolean(norm: norm)
+        @params = {}
+        @params[:smoothing_param] = smoothing_param
+        @params[:norm] = norm
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::Int32] (shape: [n_samples]) The categorical variables (e.g. labels)
+      #   to be used for fitting the model.
+      # @return [ComplementNB] The learned classifier itself.
+      def fit(x, y)
+        x = check_convert_sample_array(x)
+        y = check_convert_label_array(y)
+        check_sample_label_size(x, y)
+        n_samples, = x.shape
+        @classes = Numo::Int32[*y.to_a.uniq.sort]
+        @class_priors = Numo::DFloat[*@classes.to_a.map { |l| y.eq(l).count.fdiv(n_samples) }]
+        @class_log_probs = Numo::NMath.log(@class_priors)
+        compl_features = Numo::DFloat[*@classes.to_a.map { |l| x[y.ne(l).where, true].sum(0) }]
+        compl_features += @params[:smoothing_param]
+        n_classes = @classes.size
+        @feature_probs = compl_features / compl_features.sum(1).reshape(n_classes, 1)
+        feature_log_probs = Numo::NMath.log(@feature_probs)
+        @weights = if normalize?
+                     feature_log_probs / feature_log_probs.sum(1).reshape(n_classes, 1)
+                   else
+                     -feature_log_probs
+                   end
+        self
+      end
+
+      # Calculate confidence scores for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence scores per sample for each class.
+      def decision_function(x)
+        x = check_convert_sample_array(x)
+        @class_log_probs + x.dot(@weights.transpose)
+      end
+
+      private
+
+      def normalize?
+        @params[:norm] == true
+      end
+    end
+  end
+end
diff --git a/spec/rumale/naive_bayes/complement_nb_spec.rb b/spec/rumale/naive_bayes/complement_nb_spec.rb
@@ -0,0 +1,72 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe Rumale::NaiveBayes::ComplementNB do
+  let(:x) { Numo::DFloat[[4, 3, 0, 0], [4, 0, 0, 0], [4, 0, 1, 0], [0, 0, 5, 3], [0, 0, 0, 3], [0, 1, 5, 3]] }
+  let(:y) { Numo::Int32[1, 1, 1, -1, -1, -1] }
+  let(:n_samples) { x.shape[0] }
+  let(:n_features) { x.shape[1] }
+  let(:classes) { y.to_a.uniq.sort }
+  let(:n_classes) { classes.size }
+  let(:estimator) { described_class.new(smoothing_param: 1.0, norm: norm).fit(x, y) }
+  let(:probs) { estimator.predict_proba(x) }
+  let(:score) { estimator.score(x, y) }
+  let(:func_vals) { estimator.decision_function(x) }
+  let(:predicted) { estimator.predict(x) }
+  let(:predicted_by_probs) { Numo::Int32[*(Array.new(n_samples) { |n| classes[probs[n, true].max_index] })] }
+  let(:copied) { Marshal.load(Marshal.dump(estimator)) }
+
+  shared_examples 'classification' do
+    it 'classifies two clusters data.', :aggregate_failures do
+      expect(estimator.class_priors.class).to eq(Numo::DFloat)
+      expect(estimator.class_priors.ndim).to eq(1)
+      expect(estimator.class_priors.shape[0]).to eq(n_classes)
+      expect(estimator.feature_probs.class).to eq(Numo::DFloat)
+      expect(estimator.feature_probs.ndim).to eq(2)
+      expect(estimator.feature_probs.shape[0]).to eq(n_classes)
+      expect(estimator.feature_probs.shape[1]).to eq(n_features)
+      expect(estimator.classes.class).to eq(Numo::Int32)
+      expect(estimator.classes.ndim).to eq(1)
+      expect(estimator.classes.shape[0]).to eq(n_classes)
+      expect(func_vals.class).to eq(Numo::DFloat)
+      expect(func_vals.ndim).to eq(2)
+      expect(func_vals.shape[0]).to eq(n_samples)
+      expect(func_vals.shape[1]).to eq(n_classes)
+      expect(predicted.class).to eq(Numo::Int32)
+      expect(predicted.ndim).to eq(1)
+      expect(predicted.shape[0]).to eq(n_samples)
+      expect(predicted).to eq(y)
+      expect(score).to eq(1.0)
+    end
+
+    it 'estimates class probabilities with two clusters dataset.', :aggregate_failures do
+      expect(probs.class).to eq(Numo::DFloat)
+      expect(probs.ndim).to eq(2)
+      expect(probs.shape[0]).to eq(n_samples)
+      expect(probs.shape[1]).to eq(n_classes)
+      expect(predicted_by_probs).to eq(y)
+    end
+  end
+
+  context 'when classifier is defined without normalization' do
+    let(:norm) { false }
+
+    it_behaves_like 'classification'
+  end
+
+  context 'when classifier is defined without normalization' do
+    let(:norm) { true }
+
+    it_behaves_like 'classification'
+
+    it 'dumps and restores itself using Marshal module.', :aggregate_failures do
+      expect(estimator.class).to eq(copied.class)
+      expect(estimator.params).to eq(copied.params)
+      expect(estimator.classes).to eq(copied.classes)
+      expect(estimator.class_priors).to eq(copied.class_priors)
+      expect(estimator.feature_probs).to eq(copied.feature_probs)
+      expect(score).to eq(copied.score(x, y))
+    end
+  end
+end