
ignore pkg

1 parent 911e235, commit 7500b35ea2eb78e561265677a92f148c43336b9b, oscardelben committed on Jan 10, 2011
@@ -16,6 +16,8 @@ gem install rawler
== TODO
+* Ignore mailto
+* Login support with Mechanize
* Handle https (now returns 400). See http://stackoverflow.com/questions/1719809/ruby-on-rails-https-post-bad-request
* Export to html
* Handle multiple urls at once
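Neither of the two new TODO items is implemented in this commit. For the Mechanize one, the login step would roughly look like the sketch below; the login URL and form field names are hypothetical placeholders, not anything from the gem:

  require 'mechanize'

  # Hypothetical sketch: log in once, then the agent's cookie jar
  # could be reused for the crawl's subsequent requests.
  agent = Mechanize.new
  page  = agent.get('http://example.com/login') # assumed login page
  form  = page.forms.first
  form['username'] = 'user'                     # assumed field names
  form['password'] = 'secret'
  agent.submit(form)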
Binary file not shown.
Binary file not shown.
@@ -1,12 +0,0 @@
-=== 0.0.2 / 2011-01-10
-
-* 1 major enhancement
-
- * Handle relative urls
-
-=== 0.0.1 / 2011-01-10
-
-* 1 major enhancement
-
- * Birthday!
-
@@ -1,13 +0,0 @@
-History.txt
-Manifest.txt
-README.txt
-Rakefile
-bin/rawler
-lib/rawler/base.rb
-lib/rawler/crawler.rb
-lib/rawler.rb
-spec/spec.opts
-spec/spec_helper.rb
-spec/unit/base_spec.rb
-spec/unit/crawler_spec.rb
-tasks/rspec.rake
@@ -1,47 +0,0 @@
-= rawler
-
-* http://github.com/#{github_username}/#{project_name}
-
-== DESCRIPTION:
-
-Rawler is a Ruby library that crawls your website and reports the status code of each of your links. Useful for finding dead links.
-
-== SYNOPSIS:
-
- rawler http://example.com
-
-== INSTALL:
-
-gem install rawler
-
-== TODO
-
-* Handle https (now returns 400). See http://stackoverflow.com/questions/1719809/ruby-on-rails-https-post-bad-request
-* Export to html
-* Handle multiple urls at once
-* Add user agent
-
-== LICENSE:
-
-(The MIT License)
-
-Copyright (c) 2011 Oscar Del Ben
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-'Software'), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -1,24 +0,0 @@
-# -*- ruby -*-
-
-require 'rubygems'
-require 'hoe'
-
-# Hoe.plugin :compiler
-# Hoe.plugin :gem_prelude_sucks
-# Hoe.plugin :inline
-# Hoe.plugin :racc
-# Hoe.plugin :rubyforge
-
-Hoe.spec 'rawler' do
- # HEY! If you fill these out in ~/.hoe_template/Rakefile.erb then
- # you'll never have to touch them again!
- # (delete this comment too, of course)
-
- developer('Oscar Del Ben', 'info@oscardelben.com')
-
- self.rubyforge_name = 'oscardelben'
-
- extra_deps << ['nokogiri']
-end
-
-# vim: syntax=ruby
@@ -1,11 +0,0 @@
-#!/usr/bin/env ruby
-
-require File.join(File.dirname(__FILE__), '..', '/lib/rawler.rb')
-
-domain = ARGV[0]
-
-if domain.nil?
- puts "Usage: rawler http://example.com"
-end
-
-Rawler::Base.new(domain, $stdout).validate
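One quirk in this deleted executable: when no domain is given, it prints the usage string but still falls through to Rawler::Base.new(nil, $stdout), which then tries to parse a nil URL. A guard along these lines (not part of this commit) would exit early:

  if domain.nil?
    puts "Usage: rawler http://example.com"
    exit 1 # stop here instead of crawling a nil domain
  end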
@@ -1,13 +0,0 @@
-require 'rubygems'
-require 'net/http'
-require 'nokogiri'
-
-$:.unshift(File.dirname(__FILE__)) unless
- $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
-
-module Rawler
- VERSION = '0.0.2'
-
- autoload :Base, "rawler/base"
- autoload :Crawler, "rawler/crawler"
-end
@@ -1,61 +0,0 @@
-module Rawler
-
- class Base
-
- attr_accessor :url, :responses
-
- def initialize(url, output)
- @url = url
- @responses = {}
- $output = output
- end
-
- def validate
- validate_links_in_page(url)
- end
-
- private
-
- def validate_links_in_page(current_url)
- Rawler::Crawler.new(current_url).links.each do |page_url|
- validate_page(page_url)
- end
- end
-
- def validate_page(page_url)
- if not_yet_parsed?(page_url)
- add_status_code(page_url)
- validate_links_in_page(page_url) if same_domain?(page_url)
- end
- end
-
- def add_status_code(link)
- uri = URI.parse(link)
-
- response = nil
-
- Net::HTTP.start(uri.host, uri.port) do |http|
- path = (uri.path.size == 0) ? "/" : uri.path
- response = http.head(path, {'User-Agent'=>'Rawler'})
- end
-
- $output.puts("#{response.code} - #{link}")
- responses[link] = { :status => response.code.to_i }
- rescue Errno::ECONNREFUSED
- puts "Connection refused - '#{link}'"
- rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
- Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
- puts "Connection problems - #{link}"
- end
-
- def same_domain?(link)
- URI.parse(url).host == URI.parse(link).host
- end
-
- def not_yet_parsed?(link)
- responses[link].nil?
- end
-
- end
-
-end
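Two notes on the deleted Base class. The rescue branches write with bare puts rather than $output.puts, so connection errors bypass the injected output object. And add_status_code opens a plain Net::HTTP connection, which is what the "Handle https (now returns 400)" TODO refers to: HEAD requests for https links go out in cleartext. A minimal sketch of SSL support (an assumption, not part of this commit), reusing the link variable from add_status_code:

  require 'net/https'

  uri  = URI.parse(link)
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = (uri.scheme == 'https') # switch on TLS for https links
  path = uri.path.empty? ? '/' : uri.path
  response = http.request_head(path, 'User-Agent' => 'Rawler')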
@@ -1,29 +0,0 @@
-module Rawler
-
- class Crawler
-
- attr_accessor :url, :links
-
- def initialize(url)
- @url = url
- end
-
- def links
- content = Net::HTTP.get(URI.parse(url))
-
- doc = Nokogiri::HTML(content)
- doc.css('a').map { |a| absolute_url(a['href']) }
- rescue Errno::ECONNREFUSED
- $output.puts "Couldn't connect to #{url}"
- []
- end
-
- private
-
- def absolute_url(path)
- URI.parse(url).merge(path.to_s).to_s
- end
-
- end
-
-end
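The links method maps every anchor on the page, so a mailto: href passes through absolute_url unchanged (URI#merge returns an absolute URI as-is) and later trips up the HTTP HEAD request in Base. That is the new "Ignore mailto" TODO. A possible filter, shown here as a sketch rather than the gem's eventual implementation:

  def links
    content = Net::HTTP.get(URI.parse(url))
    doc = Nokogiri::HTML(content)

    hrefs = doc.css('a').map { |a| a['href'].to_s }
    # Drop mailto links before resolving the rest against the page URL.
    hrefs.reject { |href| href =~ /\Amailto:/i }.map { |href| absolute_url(href) }
  rescue Errno::ECONNREFUSED
    $output.puts "Couldn't connect to #{url}"
    []
  end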
@@ -1 +0,0 @@
---colour
@@ -1,10 +0,0 @@
-
-$:.unshift(File.dirname(__FILE__) + '/../lib')
-require 'rawler'
-require 'fakeweb'
-
-FakeWeb.allow_net_connect = false
-
-def register(uri, content, status=200)
- FakeWeb.register_uri(:any, uri, :body => content, :status => status)
-end
@@ -1,93 +0,0 @@
-require File.dirname(__FILE__) + '/../spec_helper.rb'
-
-describe Rawler::Base do
-
- let(:output) { double('output').as_null_object }
- let(:rawler) { Rawler::Base.new('http://example.com', output) }
-
- before(:each) do
- register('http://example.com', site)
- end
-
- describe "validate_links" do
-
- it "should validate links recursively" do
- register('http://example.com/foo1', '<a href="http://external.com/foo">x</a>')
- register('http://example.com/foo2', '')
- register('http://external.com', '')
- register('http://external.com/foo', '')
-
- rawler.validate
-
- rawler.responses['http://example.com/foo1'].should_not be_nil
- rawler.responses['http://example.com/foo2'].should_not be_nil
- rawler.responses['http://external.com'].should_not be_nil
- rawler.responses['http://external.com/foo'].should_not be_nil
- end
-
- it "should not validate links on external pages" do
- register('http://example.com/foo', '<a href="http://external.com/foo">x</a>')
- register('http://external.com/foo', '<a href="http://external.com/bar">x</a>')
- register('http://external.com/bar', '')
-
- rawler.validate
-
- rawler.responses['http://external.com/foo'].should_not be_nil
- rawler.responses['http://external.com/bar'].should be_nil
- end
-
- it "should output results" do
- register('http://example.com/foo1', '<a href="http://external.com/foo">x</a>')
- register('http://example.com/foo2', '')
- register('http://external.com', '')
- register('http://external.com/foo', '', 302)
-
- output.should_receive(:puts).with('200 - http://example.com/foo1')
- output.should_receive(:puts).with('200 - http://example.com/foo2')
- output.should_receive(:puts).with('200 - http://external.com')
- output.should_receive(:puts).with('302 - http://external.com/foo')
-
- rawler.validate
- end
-
- end
-
- describe "get_status_code" do
-
- it "should add to 200 links" do
- url = 'http://example.com/foo'
- register(url, '', 200)
-
- rawler.send(:add_status_code, url)
-
- rawler.responses[url][:status].should == 200
- end
-
- it "should add to 302 links" do
- url = 'http://example.com/foo'
- register(url, '', 302)
-
- rawler.send(:add_status_code, url)
-
- rawler.responses[url][:status].should == 302
- end
-
- end
-
-
- private
-
- def site
- <<-site
- <html>
- <body>
- <a href="http://example.com/foo1">foo1</a>
- <a href="http://example.com/foo2">foo2</a>
-
- <a href="http://external.com">external</a>
- </body>
- </html>
- site
- end
-
-end