Skip to content

Commit

Permalink
Merge branch 'next'
Browse files Browse the repository at this point in the history
  • Loading branch information
chriskite committed Jan 20, 2012
2 parents a7895d0 + 6dda8e8 commit 4b378d5
Show file tree
Hide file tree
Showing 22 changed files with 452 additions and 50 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
*.swp
Gemfile.lock
test.db
test.tch
test.kch
19 changes: 18 additions & 1 deletion CHANGELOG.rdoc
@@ -1,6 +1,23 @@
== 0.7.0 / 2012-01-19

* Major enhancements

* Added support for SQLite3 and Kyoto Cabinet storage

* Minor enhancements

* Added Page#base to use base HTML element
* Use bundler for development dependencies

* Bug fixes

* Encode characters in URLs
* Fix specs to run under rake
* Fix handling of redirect_to in storage adapters

== 0.6.1 / 2011-02-24

*Bug fixes
* Bug fixes

* Fix a bug preventing SSL connections from working

Expand Down
7 changes: 7 additions & 0 deletions CONTRIBUTORS
Expand Up @@ -2,3 +2,10 @@ Many thanks to the following folks who have contributed code to Anemone. In no p

Marc Seeger
Joost Baaij
Laurent Arnoud
zzzhc
Mauro Asprea
Alex Pooley
polysics
Sergey Kojin
Richard Paul
3 changes: 3 additions & 0 deletions Gemfile
@@ -0,0 +1,3 @@
source :rubygems

gemspec
3 changes: 2 additions & 1 deletion README.rdoc
Expand Up @@ -16,7 +16,7 @@ See http://anemone.rubyforge.org for more information.
* Records response time for each page
* CLI program can list all pages in a domain, calculate page depths, and more
* Obey robots.txt
* In-memory or persistent storage of pages during crawl, using TokyoCabinet, MongoDB, or Redis
* In-memory or persistent storage of pages during crawl, using TokyoCabinet, KyotoCabinet, SQLite3, MongoDB, or Redis

== Examples
See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
Expand All @@ -32,5 +32,6 @@ To test and develop this gem, additional requirements are:
* tokyocabinet
* mongo
* redis
* sqlite3
* kyotocabinet-ruby

You will need to have {Tokyo Cabinet}[http://fallabs.com/tokyocabinet/], {MongoDB}[http://www.mongodb.org/], and {Redis}[http://code.google.com/p/redis/] installed on your system and running.
19 changes: 9 additions & 10 deletions Rakefile
@@ -1,26 +1,25 @@
require 'rubygems'
require 'rake'
require 'rspec/core/rake_task'

require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
spec.libs << 'lib' << 'spec'
spec.spec_files = FileList['spec/**/*_spec.rb']
desc "Run all specs"
RSpec::Core::RakeTask.new(:rspec) do |spec|
spec.pattern = 'spec/**/*_spec.rb'
end

Spec::Rake::SpecTask.new(:rcov) do |spec|
spec.libs << 'lib' << 'spec'
RSpec::Core::RakeTask.new(:rcov) do |spec|
spec.pattern = 'spec/**/*_spec.rb'
spec.rcov = true
end

task :default => :spec
task :default => :rspec

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
require 'rdoc/task'
RDoc::Task.new do |rdoc|
version = File.exist?('VERSION') ? File.read('VERSION') : ""

rdoc.rdoc_dir = 'rdoc'
rdoc.title = "anemone #{version}"
rdoc.rdoc_files.include('README*')
rdoc.rdoc_files.include('lib/**/*.rb')
end
end
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
0.6.1
0.7.0
12 changes: 11 additions & 1 deletion anemone.gemspec
@@ -1,6 +1,6 @@
spec = Gem::Specification.new do |s|
s.name = "anemone"
s.version = "0.6.1"
s.version = "0.7.0"
s.author = "Chris Kite"
s.homepage = "http://anemone.rubyforge.org"
s.rubyforge_project = "anemone"
Expand All @@ -14,6 +14,16 @@ spec = Gem::Specification.new do |s|
s.add_dependency("nokogiri", ">= 1.3.0")
s.add_dependency("robots", ">= 0.7.2")

s.add_development_dependency "rake", ">=0.8.7"
s.add_development_dependency "rspec", ">=2.6.0"
s.add_development_dependency "fakeweb", ">=1.3.0"
s.add_development_dependency "redis", ">=2.2.0"
s.add_development_dependency "mongo", ">=1.3.1"
s.add_development_dependency "bson_ext", ">=1.3.1"
s.add_development_dependency "tokyocabinet", ">=1.29"
s.add_development_dependency "kyotocabinet-ruby", ">=1.27.1"
s.add_development_dependency "sqlite3", ">=1.3.4"

s.files = %w[
VERSION
LICENSE.txt
Expand Down
2 changes: 1 addition & 1 deletion lib/anemone/core.rb
Expand Up @@ -9,7 +9,7 @@

module Anemone

VERSION = '0.6.1';
VERSION = '0.7.0';

#
# Convenience method to start a crawl
Expand Down
2 changes: 1 addition & 1 deletion lib/anemone/http.rb
Expand Up @@ -112,7 +112,7 @@ def get(url, referer = nil)

response, response_time = get_response(loc, referer)
code = Integer(response.code)
redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
yield response, code, loc, redirect_to, response_time
limit -= 1
end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
Expand Down
21 changes: 18 additions & 3 deletions lib/anemone/page.rb
Expand Up @@ -62,7 +62,7 @@ def links
doc.search("//a[@href]").each do |a|
u = a['href']
next if u.nil? or u.empty?
abs = to_absolute(URI(u)) rescue next
abs = to_absolute(URI(URI.escape(u))) rescue next
@links << abs if in_domain?(abs)
end
@links.uniq!
Expand Down Expand Up @@ -131,6 +131,21 @@ def not_found?
404 == @code
end

#
# Base URI from the HTML doc head element
# http://www.w3.org/TR/html4/struct/links.html#edef-BASE
#
def base
  # Resolve the <base href> only while nothing truthy has been memoized yet.
  unless @base
    @base =
      if doc
        href = doc.search('//head/base/@href')
        begin
          URI(href.to_s) unless href.nil?
        rescue
          # An unparsable href is treated the same as a missing one.
          nil
        end
      end
  end

  # An empty URI (no usable href in the document) is reported as nil.
  (@base && @base.to_s().empty?) ? nil : @base
end


#
# Converts relative URL *link* into an absolute URL based on the
# location of the page
Expand All @@ -142,7 +157,7 @@ def to_absolute(link)
link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,'')))

relative = URI(link)
absolute = @url.merge(relative)
absolute = base ? base.merge(relative) : @url.merge(relative)

absolute.path = '/' if absolute.path.empty?

Expand Down Expand Up @@ -190,7 +205,7 @@ def self.from_hash(hash)
'@visited' => hash['visited'],
'@depth' => hash['depth'].to_i,
'@referer' => hash['referer'],
'@redirect_to' => URI(hash['redirect_to']),
'@redirect_to' => (!!hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
'@response_time' => hash['response_time'].to_i,
'@fetched' => hash['fetched']
}.each do |var, value|
Expand Down
10 changes: 10 additions & 0 deletions lib/anemone/storage.rb
Expand Up @@ -18,6 +18,11 @@ def self.TokyoCabinet(file = 'anemone.tch')
self::TokyoCabinet.new(file)
end

#
# Kyoto Cabinet storage. The default filename uses the .kch extension,
# because Anemone::Storage::KyotoCabinet raises for any other extension.
#
def self.KyotoCabinet(file = 'anemone.kch')
  require 'anemone/storage/kyoto_cabinet'
  self::KyotoCabinet.new(file)
end

def self.MongoDB(mongo_db = nil, collection_name = 'pages')
require 'anemone/storage/mongodb'
mongo_db ||= Mongo::Connection.new.db('anemone')
Expand All @@ -29,6 +34,11 @@ def self.Redis(opts = {})
require 'anemone/storage/redis'
self::Redis.new(opts)
end

#
# SQLite3 storage backed by the database file *file*.
# The adapter is only loaded when this method is called.
#
def self.SQLite3(file = 'anemone.db')
  require 'anemone/storage/sqlite3'
  adapter = self::SQLite3
  adapter.new(file)
end

end
end
72 changes: 72 additions & 0 deletions lib/anemone/storage/kyoto_cabinet.rb
@@ -0,0 +1,72 @@
# Load the Kyoto Cabinet binding; if it is unavailable, print the load
# error followed by an installation hint, then terminate the process.
begin
  require 'kyotocabinet'
rescue LoadError => load_error
  puts load_error
  puts "You need the kyotocabinet-ruby gem to use Anemone::Storage::KyotoCabinet"
  exit
end

require 'forwardable'

module Anemone
  module Storage
    #
    # Page storage backed by a Kyoto Cabinet database file.
    #
    # Values are serialized with Marshal and then Base64-encoded
    # (pack("m")) before being written; they are decoded on the way out.
    #
    class KyotoCabinet
      extend Forwardable

      # +each+ is not delegated: it is defined below so that values are
      # decoded before they are yielded.
      def_delegators :@db, :close, :size

      #
      # Open (creating it if necessary) the database at *file* and clear it.
      # Raises a RuntimeError unless *file* has the .kch extension.
      #
      def initialize(file)
        raise "KyotoCabinet filename must have .kch extension" if File.extname(file) != '.kch'
        @db = ::KyotoCabinet::DB::new
        @db.open(file, ::KyotoCabinet::DB::OWRITER | ::KyotoCabinet::DB::OCREATE)
        @db.clear
      end

      #
      # Return the decoded value stored under *key*, or nil if there is none
      #
      def [](key)
        value = @db[key]
        load_value(value) if value
      end

      #
      # Store *value* under *key* (Marshal dump, then Base64)
      #
      def []=(key, value)
        @db[key] = [Marshal.dump(value)].pack("m")
      end

      #
      # Yield each key together with its decoded value
      #
      def each
        @db.each do |k, v|
          yield(k, load_value(v))
        end
      end

      #
      # Is there a record stored under *key*?
      #
      def has_key?(key)
        # A direct lookup is enough: every value written by []= is a
        # non-nil string, and a missing key reads back as nil. This avoids
        # collecting every key that merely shares *key* as a prefix.
        !@db[key].nil?
      end

      #
      # Array of all keys in the database
      #
      def keys
        acc = []
        # NOTE(review): each_key appears to yield a one-element array here,
        # hence +first+ -- confirm against the kyotocabinet-ruby binding.
        @db.each_key { |key| acc << key.first }
        acc
      end

      #
      # Remove *key* and return the decoded value it held (nil if absent)
      #
      def delete(key)
        value = self[key]
        @db.delete(key)
        value
      end

      #
      # Store every pair of *hash*; returns self
      #
      def merge!(hash)
        hash.each { |key, value| self[key] = value }
        self
      end

      private

      # Reverse of []=: Base64-decode, then Marshal.load
      def load_value(value)
        Marshal.load(value.unpack("m")[0])
      end

    end
  end
end
90 changes: 90 additions & 0 deletions lib/anemone/storage/sqlite3.rb
@@ -0,0 +1,90 @@
# Load the sqlite3 gem; if it is unavailable, print the underlying load
# error (as the other storage adapters do) followed by an installation
# hint, then terminate the process.
begin
  require 'sqlite3'
rescue LoadError
  puts $!
  puts "You need the sqlite3 gem to use Anemone::Storage::SQLite3"
  exit
end

module Anemone
  module Storage
    #
    # Page storage backed by a SQLite3 database file.
    #
    # Each record is a row of the anemone_storage table: the key is the
    # URL as a string and the data column holds the Marshal dump of the value.
    #
    class SQLite3

      #
      # Open the database at *file* and make sure the table and index exist
      #
      def initialize(file)
        @db = ::SQLite3::Database.new(file)
        create_schema
      end

      #
      # Return the value stored under *url*, or nil if there is none
      #
      def [](url)
        value = @db.get_first_value('SELECT data FROM anemone_storage WHERE key = ?', url.to_s)
        Marshal.load(value) if value
      end

      #
      # Store *value* under *url*, updating the existing row when there is
      # one so that the row keeps its id (and so its position in each/keys)
      #
      def []=(url, value)
        data = Marshal.dump(value)
        if has_key?(url)
          @db.execute('UPDATE anemone_storage SET data = ? WHERE key = ?', data, url.to_s)
        else
          @db.execute('INSERT INTO anemone_storage (data, key) VALUES(?, ?)', data, url.to_s)
        end
      end

      #
      # Remove *url* and return the value it held (nil if absent)
      #
      def delete(url)
        page = self[url]
        @db.execute('DELETE FROM anemone_storage WHERE key = ?', url.to_s)
        page
      end

      #
      # Yield each key and its loaded value, ordered by row id
      #
      def each
        @db.execute("SELECT key, data FROM anemone_storage ORDER BY id") do |row|
          value = Marshal.load(row[1])
          yield row[0], value
        end
      end

      #
      # Store every pair of *hash*; returns self
      #
      def merge!(hash)
        hash.each { |key, value| self[key] = value }
        self
      end

      #
      # Number of stored records
      #
      def size
        @db.get_first_value('SELECT COUNT(id) FROM anemone_storage')
      end

      #
      # Array of all keys, ordered by row id
      #
      def keys
        @db.execute("SELECT key FROM anemone_storage ORDER BY id").map{|t| t[0]}
      end

      #
      # Is there a record stored under *url*?
      #
      def has_key?(url)
        !!@db.get_first_value('SELECT id FROM anemone_storage WHERE key = ?', url.to_s)
      end

      #
      # Close the underlying database handle
      #
      def close
        @db.close
      end

      private

      # Create the storage table and its key index unless they already exist
      def create_schema
        @db.execute_batch <<-SQL
          create table if not exists anemone_storage (
            id INTEGER PRIMARY KEY ASC,
            key TEXT,
            data BLOB
          );
          create index if not exists anemone_key_idx on anemone_storage (key);
        SQL
      end

    end
  end
end

1 change: 1 addition & 0 deletions lib/anemone/storage/tokyo_cabinet.rb
@@ -1,6 +1,7 @@
# Load the Tokyo Cabinet binding; if it is unavailable, print the load
# error followed by an installation hint, then terminate the process.
begin
  require 'tokyocabinet'
rescue LoadError => load_error
  puts load_error
  puts "You need the tokyocabinet gem to use Anemone::Storage::TokyoCabinet"
  exit
end
Expand Down

0 comments on commit 4b378d5

Please sign in to comment.