Updates indexing to use batch processing with a config variable #135

Closed
wants to merge 1 commit into
from
@@ -8,6 +8,8 @@ module Sunspot
# Sunspot.config.pagination.default_per_page::
# Solr always paginates its results. This sets Sunspot's default result
# count per page if it is not explicitly specified in the query.
+ # Sunspot.config.indexing.default_batch_size::
+ # Sets the default batch size used when indexing records. Default is 50.
#
module Configuration
class <<self
@@ -28,6 +30,9 @@ def build #:nodoc:
pagination do
default_per_page 30
end
+ indexing do
+ default_batch_size 50
+ end
end
end
@@ -238,7 +238,7 @@ def solr_reindex(options = {})
#
def solr_index(opts={})
options = {
- :batch_size => 50,
+ :batch_size => Sunspot.config.indexing.default_batch_size,
:batch_commit => true,
:include => self.sunspot_options[:include],
:start => opts.delete(:first_id) || 0
@@ -275,13 +275,19 @@ def solr_index(opts={})
# wrong. Usually you will want to rectify the situation by calling
# #clean_index_orphans or #reindex
#
+ # ==== Options (passed as a hash)
+ #
+ # batch_size<Integer>:: Batch size with which to load records.
+ # Default is Sunspot.config.indexing.default_batch_size (50 unless reconfigured).
+ #
# ==== Returns
#
# Array:: Collection of IDs that exist in Solr but not in the database
- def solr_index_orphans
+ def solr_index_orphans(opts={})
+ batch_size = opts[:batch_size] || Sunspot.config.indexing.default_batch_size
count = self.count
indexed_ids = solr_search_ids { paginate(:page => 1, :per_page => count) }.to_set
- all(:select => 'id').each do |object|
+ find_each(:select => 'id', :batch_size => batch_size) do |object|
indexed_ids.delete(object.id)
end
indexed_ids.to_a
@@ -293,8 +299,13 @@ def solr_index_orphans
# circumstances, this should not be necessary; this method is provided
# in case something goes wrong.
#
- def solr_clean_index_orphans
- solr_index_orphans.each do |id|
+ # ==== Options (passed as a hash)
+ #
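+ # batch_size<Integer>:: Batch size with which to load records; passed through to solr_index_orphans.
+ # Default is Sunspot.config.indexing.default_batch_size (50 unless reconfigured).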
+ #
+ def solr_clean_index_orphans(opts={})
+ solr_index_orphans(opts).each do |id|
new do |fake_instance|
fake_instance.id = id
end.solr_remove_from_index
@@ -211,6 +211,12 @@
it 'should return IDs of objects that are in the index but not the database' do
Post.index_orphans.should == [@posts.first.id]
end
+
+ it 'should find the orphans in batches to improve performance' do
+ Post.should_receive(:find_each).with(hash_including(:batch_size => 10)).and_return([])
+ Post.index_orphans(:batch_size => 10)
+ end
+
end
describe 'clean_index_orphans()' do