Permalink
Browse files

Add flush option.

  • Loading branch information...
tonic20 committed Dec 27, 2011
1 parent 660eeb7 commit 6a9efa38228ba12b8fc953da97078ee57e27160b
Showing 4 changed files with 17 additions and 9 deletions.
  1. +4 −0 lib/anemone/core.rb
  2. +2 −1 lib/anemone/page.rb
  3. +3 −3 lib/anemone/storage.rb
  4. +8 −5 lib/anemone/storage/mongodb.rb
View
@@ -56,6 +56,8 @@ class Core
:proxy_port => false,
# HTTP read timeout in seconds
:read_timeout => nil,
+ # Delete data generated by previous crawl
+ :flush => false,
# skip any link with a rel=nofollow attribute
:skip_nofollow_links => false
}
@@ -165,6 +167,7 @@ def run
loop do
page = page_queue.deq
@pages.touch_key page.url
+
puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
page.has_duplicate = @pages.has_digest?(page.root_url, page.digest)
@@ -211,6 +214,7 @@ def run
def process_options
@opts = DEFAULT_OPTS.merge @opts
@opts[:threads] = 1 if @opts[:delay] > 0
+
storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
@pages = PageStore.new(storage)
@robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
View
@@ -39,7 +39,8 @@ class Page
# Create a new page
#
def initialize(url, params = {}, options = nil)
- @url = url
+ # hack: to upcase encoded urls
+ @url = URI(URI.encode(URI.decode(url.to_s))) # url
@data = {}
@code = params[:code]
View
@@ -18,17 +18,17 @@ def self.TokyoCabinet(file = 'anemone.tch')
self::TokyoCabinet.new(file)
end
- def self.MongoDB(mongo_db = nil, collection_name = 'pages')
+ def self.MongoDB(mongo_db = nil, collection_name = 'pages', options = {flush: false})
require 'anemone/storage/mongodb'
mongo_db ||= Mongo::Connection.new.db('anemone')
raise "First argument must be an instance of Mongo::DB" unless mongo_db.is_a?(Mongo::DB)
- self::MongoDB.new(mongo_db, collection_name)
+ options.merge! flush: false unless options.has_key? :flush
+ self::MongoDB.new(mongo_db, collection_name, options)
end
def self.Redis(opts = {})
require 'anemone/storage/redis'
self::Redis.new(opts)
end
-
end
end
@@ -11,13 +11,16 @@ class MongoDB
BINARY_FIELDS = %w(body headers)
- def initialize(mongo_db, collection_name)
+ def initialize(mongo_db, collection_name, options)
@db = mongo_db
@collection = @db[collection_name]
- @collection.remove
- @collection.create_index 'url'
- @collection.create_index 'digest'
- @collection.create_index 'data.content'
+
+ if options[:flush]
+ @collection.remove
+ @collection.create_index 'url'
+ @collection.create_index 'digest'
+ @collection.create_index 'data.content'
+ end
end
def [](url)

0 comments on commit 6a9efa3

Please sign in to comment.