diff --git a/lib/rumale/decomposition/pca.rb b/lib/rumale/decomposition/pca.rb index 7e51df4f..c403715d 100644 --- a/lib/rumale/decomposition/pca.rb +++ b/lib/rumale/decomposition/pca.rb @@ -12,6 +12,11 @@ module Decomposition # decomposer = Rumale::Decomposition::PCA.new(n_components: 2) # representaion = decomposer.fit_transform(samples) # + # # If Numo::Linalg is installed, you can specify 'evd' for the solver option. + # require 'numo/linalg/autoloader' + # decomposer = Rumale::Decomposition::PCA.new(n_components: 2, solver: 'evd') + # representaion = decomposer.fit_transform(samples) + # # *Reference* # - A. Sharma and K K. Paliwal, "Fast principal component analysis using fixed-point algorithm," Pattern Recognition Letters, 28, pp. 1151--1155, 2007. class PCA @@ -23,7 +28,7 @@ class PCA attr_reader :components # Returns the mean vector. - # @return [Numo::DFloat] (shape: [n_features] + # @return [Numo::DFloat] (shape: [n_features]) attr_reader :mean # Return the random generator. @@ -33,15 +38,19 @@ class PCA # Create a new transformer with PCA. # # @param n_components [Integer] The number of principal components. - # @param max_iter [Integer] The maximum number of iterations. - # @param tol [Float] The tolerance of termination criterion. + # @param solver [String] The algorithm for the optimization ('fpt' or 'evd'). + # 'fpt' uses the fixed-point algorithm. 'evd' performs eigen value decomposition of the covariance matrix of samples. + # @param max_iter [Integer] The maximum number of iterations. If solver = 'evd', this parameter is ignored. + # @param tol [Float] The tolerance of termination criterion. If solver = 'evd', this parameter is ignored. # @param random_seed [Integer] The seed value using to initialize the random generator. - def initialize(n_components: 2, max_iter: 100, tol: 1.0e-4, random_seed: nil) + def initialize(n_components: 2, solver: 'fpt', max_iter: 100, tol: 1.0e-4, random_seed: nil) check_params_integer(n_components: n_components, max_iter: max_iter) + check_params_string(solver: solver) check_params_float(tol: tol) check_params_type_or_nil(Integer, random_seed: random_seed) check_params_positive(n_components: n_components, max_iter: max_iter, tol: tol) @params = {} + @params[:solver] = solver != 'evd' ? 'fpt' : 'evd' @params[:n_components] = n_components @params[:max_iter] = max_iter @params[:tol] = tol @@ -69,14 +78,19 @@ def fit(x, _y = nil) centered_x = x - @mean # optimization. covariance_mat = centered_x.transpose.dot(centered_x) / (n_samples - 1) - @params[:n_components].times do - comp_vec = Rumale::Utils.rand_uniform(n_features, sub_rng) - @params[:max_iter].times do - updated = orthogonalize(covariance_mat.dot(comp_vec)) - break if (updated.dot(comp_vec) - 1).abs < @params[:tol] - comp_vec = updated + if @params[:solver] == 'evd' && enable_linalg? + _, evecs = Numo::Linalg.eigh(covariance_mat, vals_range: (n_features - @params[:n_components])...n_features) + @components = evecs.reverse(1).transpose.dup + else + @params[:n_components].times do + comp_vec = Rumale::Utils.rand_uniform(n_features, sub_rng) + @params[:max_iter].times do + updated = orthogonalize(covariance_mat.dot(comp_vec)) + break if (updated.dot(comp_vec) - 1).abs < @params[:tol] + comp_vec = updated + end + @components = @components.nil? ? comp_vec : Numo::NArray.vstack([@components, comp_vec]) end - @components = @components.nil? ? comp_vec : Numo::NArray.vstack([@components, comp_vec]) end self end diff --git a/spec/decomposition/pca_spec.rb b/spec/decomposition/pca_spec.rb index 58240bea..4bd22ee7 100644 --- a/spec/decomposition/pca_spec.rb +++ b/spec/decomposition/pca_spec.rb @@ -5,38 +5,70 @@ RSpec.describe Rumale::Decomposition::PCA do let(:x) { Marshal.load(File.read(__dir__ + '/../test_samples.dat')) } let(:n_components) { 16 } - let(:decomposer) { described_class.new(n_components: n_components, tol: 1.0e-8, random_seed: 1) } + let(:solver) { 'fpt' } + let(:decomposer) { described_class.new(n_components: n_components, solver: solver, tol: 1.0e-8, random_seed: 1) } let(:transformer) { Rumale::KernelApproximation::RBF.new(gamma: 1.0, n_components: 32, random_seed: 1) } - it 'projects high-dimensinal data into subspace.' do - samples = transformer.fit_transform(x) - sub_samples = decomposer.fit_transform(samples) - - n_samples, n_features = samples.shape - expect(sub_samples.class).to eq(Numo::DFloat) - expect(sub_samples.shape[0]).to eq(n_samples) - expect(sub_samples.shape[1]).to eq(n_components) - - expect(decomposer.components.class).to eq(Numo::DFloat) - expect(decomposer.components.shape[0]).to eq(n_components) - expect(decomposer.components.shape[1]).to eq(n_features) - expect(decomposer.mean.class).to eq(Numo::DFloat) - expect(decomposer.mean.shape[0]).to eq(n_features) - expect(decomposer.mean.shape[1]).to be_nil - - rec_samples = decomposer.inverse_transform(sub_samples) - mse = Numo::NMath.sqrt(((samples - rec_samples)**2).sum(1)).mean - expect(mse).to be <= 0.1 + context 'when solver is fix point algorithm' do + it 'projects high-dimensinal data into subspace.' do + samples = transformer.fit_transform(x) + sub_samples = decomposer.fit_transform(samples) + n_samples, n_features = samples.shape + expect(sub_samples.class).to eq(Numo::DFloat) + expect(sub_samples.shape[0]).to eq(n_samples) + expect(sub_samples.shape[1]).to eq(n_components) + expect(decomposer.components.class).to eq(Numo::DFloat) + expect(decomposer.components.shape[0]).to eq(n_components) + expect(decomposer.components.shape[1]).to eq(n_features) + expect(decomposer.mean.class).to eq(Numo::DFloat) + expect(decomposer.mean.shape[0]).to eq(n_features) + expect(decomposer.mean.shape[1]).to be_nil + rec_samples = decomposer.inverse_transform(sub_samples) + mse = Numo::NMath.sqrt(((samples - rec_samples)**2).sum(1)).mean + expect(mse).to be <= 0.1 + end + + it 'projects data into one-dimensional subspace.' do + liner = described_class.new(n_components: 1, random_seed: 1) + sub_x = liner.fit_transform(x) + rec_x = liner.inverse_transform(sub_x.expand_dims(1)) + expect(rec_x.shape[0]).to eq(x.shape[0]) + expect(rec_x.shape[1]).to eq(x.shape[1]) + mse = Numo::NMath.sqrt(((x - rec_x)**2).sum(1)).mean + expect(mse).to be <= 0.5 + end end - it 'projects data into one-dimensional subspace.' do - liner = described_class.new(n_components: 1, random_seed: 1) - sub_x = liner.fit_transform(x) - rec_x = liner.inverse_transform(sub_x.expand_dims(1)) - expect(rec_x.shape[0]).to eq(x.shape[0]) - expect(rec_x.shape[1]).to eq(x.shape[1]) - mse = Numo::NMath.sqrt(((x - rec_x)**2).sum(1)).mean - expect(mse).to be <= 0.5 + context 'when solver is eigen value decomposition' do + let(:solver) { 'evd' } + + it 'projects high-dimensinal data into subspace.' do + samples = transformer.fit_transform(x) + sub_samples = decomposer.fit_transform(samples) + n_samples, n_features = samples.shape + expect(sub_samples.class).to eq(Numo::DFloat) + expect(sub_samples.shape[0]).to eq(n_samples) + expect(sub_samples.shape[1]).to eq(n_components) + expect(decomposer.components.class).to eq(Numo::DFloat) + expect(decomposer.components.shape[0]).to eq(n_components) + expect(decomposer.components.shape[1]).to eq(n_features) + expect(decomposer.mean.class).to eq(Numo::DFloat) + expect(decomposer.mean.shape[0]).to eq(n_features) + expect(decomposer.mean.shape[1]).to be_nil + rec_samples = decomposer.inverse_transform(sub_samples) + mse = Numo::NMath.sqrt(((samples - rec_samples)**2).sum(1)).mean + expect(mse).to be <= 0.1 + end + + it 'projects data into one-dimensional subspace.' do + liner = described_class.new(n_components: 1, random_seed: 1) + sub_x = liner.fit_transform(x) + rec_x = liner.inverse_transform(sub_x.expand_dims(1)) + expect(rec_x.shape[0]).to eq(x.shape[0]) + expect(rec_x.shape[1]).to eq(x.shape[1]) + mse = Numo::NMath.sqrt(((x - rec_x)**2).sum(1)).mean + expect(mse).to be <= 0.5 + end end it 'dumps and restores itself using Marshal module.' do @@ -45,6 +77,7 @@ copied = Marshal.load(Marshal.dump(decomposer)) expect(decomposer.class).to eq(copied.class) expect(decomposer.params[:n_components]).to eq(copied.params[:n_components]) + expect(decomposer.params[:solver]).to eq(copied.params[:solver]) expect(decomposer.params[:max_iter]).to eq(copied.params[:max_iter]) expect(decomposer.params[:tol]).to eq(copied.params[:tol]) expect(decomposer.params[:random_seed]).to eq(copied.params[:random_seed])