On page error #29

Merged · 11 commits · Jun 1, 2014
31 changes: 20 additions & 11 deletions lib/polipus.rb
@@ -111,6 +111,7 @@ def initialize(job_name = 'polipus', urls = [], options = {})
       @skip_links_like = []
       @on_page_downloaded = []
       @on_before_save = []
+      @on_page_error = []
       @focus_crawl_block = nil
       @on_crawl_end = []
       @redis_factory = nil
@@ -191,27 +192,28 @@ def takeover
             page = pages.last
           end

-          # Execute on_before_save blocks
-          @on_before_save.each {|e| e.call(page)} unless page.nil?
           execute_plugin 'on_after_download'

-          @logger.warn {"Page #{page.url} has error: #{page.error}"} if page.error
+          if page.error
+            @logger.warn {"Page #{page.url} has error: #{page.error}"}
+            incr_error
+            @on_page_error.each {|e| e.call(page)}
+          end

-          incr_error if page.error
+          # Execute on_before_save blocks
+          @on_before_save.each {|e| e.call(page)}

-          if page && page.storable?
+          if page.storable?
             @storage.add page
           end

-          if page
-            @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
-            @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) downloaded"}
-          end
-
+          @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
+          @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) downloaded"}
+
           incr_pages

           # Execute on_page_downloaded blocks
-          @on_page_downloaded.each {|e| e.call(page)} unless page.nil?
+          @on_page_downloaded.each {|e| e.call(page)}

           if @options[:depth_limit] == false || @options[:depth_limit] > page.depth
             links_for(page).each do |url_to_visit|
@@ -261,6 +263,7 @@ def on_page_downloaded(&block)
       self
     end

+    # A block of code will be executed when the crawl session is over
     def on_crawl_end(&block)
       @on_crawl_end << block
       self
@@ -273,6 +276,12 @@ def on_before_save(&block)
       self
     end

+    # A block of code will be executed whenever a page contains an error
+    def on_page_error(&block)
+      @on_page_error << block
+      self
+    end
+
     # A block of code will be executed
     # on every page downloaded. The code is used to extract urls to visit
     # see links_for method
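The new callback is registered like the existing on_page_downloaded and on_before_save hooks. A minimal sketch of wiring it up from the caller's side (the seed URL and the handler body are illustrative, not part of this PR):

    require 'polipus'

    Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
      # Invoked once per page whose fetch ended in an error,
      # before on_before_save runs for that page
      crawler.on_page_error do |page|
        STDERR.puts "Failed to fetch #{page.url}: #{page.error}"
      end
    end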
17 changes: 13 additions & 4 deletions lib/polipus/page.rb
@@ -17,8 +17,7 @@ class Page
     attr_reader :error
     # Integer response code of the page
     attr_accessor :code
-    # Depth of this page from the root of the crawl. This is not necessarily the
-    # shortest path; use PageStore#shortest_paths! to find that value.
+    # Depth of this page from the root of the crawl.
     attr_accessor :depth
     # URL of the page that brought us to this page
     attr_accessor :referer
@@ -130,6 +129,14 @@ def redirect?
       (300..307).include?(@code)
     end

+    #
+    # Returns +true+ if the page is an HTTP success, returns +false+
+    # otherwise.
+    #
+    def success?
+      (200..206).include?(@code)
+    end
+
     #
     # Returns +true+ if the page was not found (returned 404 code),
     # returns +false+ otherwise.
@@ -192,7 +199,8 @@ def to_hash
         'response_time' => @response_time,
         'fetched' => @fetched,
         'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
-        'fetched_at' => @fetched_at
+        'fetched_at' => @fetched_at,
+        'error' => @error
       }

@@ -230,7 +238,8 @@ def self.from_hash(hash)
         '@response_time' => hash['response_time'].to_i,
         '@fetched' => hash['fetched'],
         '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
-        '@fetched_at' => hash['fetched_at']
+        '@fetched_at' => hash['fetched_at'],
+        '@error' => hash['error']
       }.each do |var, value|
         page.instance_variable_set(var, value)
       end
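With 'error' now included in to_hash and restored by from_hash, a failed page survives serialization, so the state an on_page_error handler sees is preserved across the queue. A small round-trip illustration (the URL, code, and error string are made up; the constructor options mirror the specs below):

    require 'polipus'

    page = Polipus::Page.new('http://example.com/', code: 0, error: 'connection refused')
    hash = page.to_hash                       # now carries 'error' => 'connection refused'
    restored = Polipus::Page.from_hash(hash)
    restored.error     # => "connection refused"
    restored.success?  # => false, since 0 falls outside 200..206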
3 changes: 2 additions & 1 deletion spec/http_spec.rb
@@ -12,6 +12,7 @@
       page.should be_an_instance_of(Polipus::Page)
       page.doc.search("title").text.strip.should eq "SF bay area apts/housing for rent classifieds - craigslist"
       page.fetched_at.should_not be_nil
+      page.fetched?.should be_true
     end
   end

@@ -107,7 +108,7 @@
   describe 'net errors' do
     it 'should handle net errors correctly' do
       VCR.use_cassette('http_errors') do
-        http = Polipus::HTTP.new
+        http = Polipus::HTTP.new(open_timeout:1, read_timeout: 1)
         http.fetch_page("http://www.wrong-domain.lol/").error.should_not be_nil
       end
     end
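The explicit one-second timeouts keep the wrong-domain fetch from hanging the suite at the default timeouts. The same options apply when driving Polipus::HTTP directly; a sketch assuming only the API shown in the diff above:

    require 'polipus'

    http = Polipus::HTTP.new(open_timeout: 1, read_timeout: 1)
    page = http.fetch_page('http://www.wrong-domain.lol/')
    page.error  # => set to the underlying network error instead of raising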
19 changes: 19 additions & 0 deletions spec/page_spec.rb
@@ -48,5 +48,24 @@
     end
   end

+  context 'page error' do
+
+    let(:page) do
+      Polipus::Page.new 'http://www.google.com/', error: 'an error'
+    end
+
+    it 'should serialize an error' do
+      page.to_hash['error'].should eq 'an error'
+    end
+
+  end
+
+  context 'page code' do
+    it 'should identify HTTPSuccess code' do
+      Polipus::Page.new('http://www.google.com/', code: 201).success?.should be_true
+      Polipus::Page.new('http://www.google.com/', code: 404).success?.should be_false
+    end
+
+  end
+
 end
9 changes: 9 additions & 0 deletions spec/polipus_spec.rb
@@ -73,5 +73,14 @@
       cache_hit["http://rubygems.org/gems"].should be 2
     end

+    it "should call on_page_error code blocks when a page has an error" do
+      p = Polipus::PolipusCrawler.new("polipus-rspec", ["http://dasd.adad.dom/"], p_options.merge(open_timeout:1, read_timeout: 1))
+      a_page = nil
+      p.on_page_error {|page| a_page = page}
+      p.takeover
+      a_page.should_not be_nil
+      a_page.error.should_not be_nil
+    end
+
   end
 end