Skip to content

Commit

Permalink
Add flush option.
Browse files: browse the repository at this point in the history
  • Loading branch information
tonic20 committed Dec 27, 2011
1 parent: 660eeb7 · commit: 6a9efa3
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 9 deletions.
4 changes: 4 additions & 0 deletions lib/anemone/core.rb
Expand Up @@ -56,6 +56,8 @@ class Core
:proxy_port => false, :proxy_port => false,
# HTTP read timeout in seconds # HTTP read timeout in seconds
:read_timeout => nil, :read_timeout => nil,
# Delete data generated by previous crawl
:flush => false,
# skip any link with a rel=nofollow attribute # skip any link with a rel=nofollow attribute
:skip_nofollow_links => false :skip_nofollow_links => false
} }
Expand Down Expand Up @@ -165,6 +167,7 @@ def run
loop do loop do
page = page_queue.deq page = page_queue.deq
@pages.touch_key page.url @pages.touch_key page.url

puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose] puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]


page.has_duplicate = @pages.has_digest?(page.root_url, page.digest) page.has_duplicate = @pages.has_digest?(page.root_url, page.digest)
Expand Down Expand Up @@ -211,6 +214,7 @@ def run
def process_options def process_options
@opts = DEFAULT_OPTS.merge @opts @opts = DEFAULT_OPTS.merge @opts
@opts[:threads] = 1 if @opts[:delay] > 0 @opts[:threads] = 1 if @opts[:delay] > 0

storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash) storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
@pages = PageStore.new(storage) @pages = PageStore.new(storage)
@robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt] @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
Expand Down
3 changes: 2 additions & 1 deletion lib/anemone/page.rb
Expand Up @@ -39,7 +39,8 @@ class Page
# Create a new page # Create a new page
# #
def initialize(url, params = {}, options = nil) def initialize(url, params = {}, options = nil)
@url = url # hack: to upcase encoded urls
@url = URI(URI.encode(URI.decode(url.to_s))) # url
@data = {} @data = {}


@code = params[:code] @code = params[:code]
Expand Down
6 changes: 3 additions & 3 deletions lib/anemone/storage.rb
Expand Up @@ -18,17 +18,17 @@ def self.TokyoCabinet(file = 'anemone.tch')
self::TokyoCabinet.new(file) self::TokyoCabinet.new(file)
end end


def self.MongoDB(mongo_db = nil, collection_name = 'pages') def self.MongoDB(mongo_db = nil, collection_name = 'pages', options = {flush: false})
require 'anemone/storage/mongodb' require 'anemone/storage/mongodb'
mongo_db ||= Mongo::Connection.new.db('anemone') mongo_db ||= Mongo::Connection.new.db('anemone')
raise "First argument must be an instance of Mongo::DB" unless mongo_db.is_a?(Mongo::DB) raise "First argument must be an instance of Mongo::DB" unless mongo_db.is_a?(Mongo::DB)
self::MongoDB.new(mongo_db, collection_name) options.merge! flush: false unless options.has_key? :flush
self::MongoDB.new(mongo_db, collection_name, options)
end end


def self.Redis(opts = {}) def self.Redis(opts = {})
require 'anemone/storage/redis' require 'anemone/storage/redis'
self::Redis.new(opts) self::Redis.new(opts)
end end

end end
end end
13 changes: 8 additions & 5 deletions lib/anemone/storage/mongodb.rb
Expand Up @@ -11,13 +11,16 @@ class MongoDB


BINARY_FIELDS = %w(body headers) BINARY_FIELDS = %w(body headers)


def initialize(mongo_db, collection_name) def initialize(mongo_db, collection_name, options)
@db = mongo_db @db = mongo_db
@collection = @db[collection_name] @collection = @db[collection_name]
@collection.remove
@collection.create_index 'url' if options[:flush]
@collection.create_index 'digest' @collection.remove
@collection.create_index 'data.content' @collection.create_index 'url'
@collection.create_index 'digest'
@collection.create_index 'data.content'
end
end end


def [](url) def [](url)
Expand Down

0 comments on commit 6a9efa3

Please sign in to comment.