Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Starting to work better with sites and top-level sitemaps
- Loading branch information
Showing
16 changed files
with
232 additions
and
116 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
module Seota
  # Raised when a URL cannot be resolved to a successful HTTP response.
  #
  # Derives from StandardError (rather than Exception) so that ordinary
  # `rescue` clauses can handle it and it is not mixed up with
  # interpreter-level exceptions such as Interrupt or SystemExit.
  class UnusableResource < StandardError
    # @param url [String] the URL that could not be used
    def initialize(url)
      super("#{url} is an unusable resource")
    end
  end

  # A remote document fetched over HTTP, after following 301/302 redirects.
  class Resource
    # Upper bound on redirect hops, so a redirect loop cannot run forever.
    MAX_REDIRECTS = 10

    # Builds a Resource, returning nil instead of raising when the URL
    # does not resolve to a successful response.
    #
    # @param url [String]
    # @return [Resource, nil]
    def self.new_or_nil(url)
      new(url)
    rescue UnusableResource
      nil
    end

    # url is the URL as originally requested; uri is the parsed URI that
    # finally answered with a success status (after redirects).
    attr_reader :url, :uri

    # @param url [String] URL to fetch
    # @raise [UnusableResource] when no successful response can be obtained
    def initialize(url)
      @url = url
      @uri, @response = follow(url)
    end

    # @param header_key [String] response header name
    # @return [String, nil] the header value of the final response
    def [](header_key)
      @response[header_key]
    end

    # @return [String] body of the final response
    def body
      @response.body
    end

    private

    # Fetches +url+, following 301/302 redirects for at most MAX_REDIRECTS hops.
    #
    # @param url [String]
    # @return [Array(URI, Net::HTTPResponse)] final URI and its success response
    # @raise [UnusableResource] on a non-success status, a redirect without a
    #   Location header, or too many redirects
    def follow(url)
      current = url
      (MAX_REDIRECTS + 1).times do
        uri = URI.parse(current)
        response = Net::HTTP.get_response(uri)
        case response
        when Net::HTTPSuccess
          return [uri, response]
        when Net::HTTPMovedPermanently, Net::HTTPFound
          location = response["location"]
          raise UnusableResource, current if location.nil?

          # Location may be relative; resolve it against the URI just fetched.
          current = uri.merge(location).to_s
        else
          raise UnusableResource, current
        end
      end
      raise UnusableResource, url
    end

    # Returns this resource's URL with its path component swapped for
    # +new_path+. Works on the parsed URI rather than via a regex, so paths
    # containing regex metacharacters (e.g. "/c++") are handled correctly.
    # Any query string or fragment on the URI is left in place.
    #
    # @param new_path [String] absolute path, e.g. "/sitemap.xml"
    # @return [String]
    def replace_path(new_path)
      replaced = uri.dup
      replaced.path = new_path
      replaced.to_s
    end
  end # Resource
end # Seota
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
module Seota
  # The robots.txt document of a site, fetched as a Resource.
  class Robots < Resource
    # Matches one "Sitemap: <url>" line. Field names in robots.txt are
    # case-insensitive and the whitespace after the colon is optional, so
    # both are tolerated here. Only spaces/tabs are allowed around the
    # colon so that a match can never run on into the following line.
    SITEMAP_DIRECTIVE = /^[ \t]*sitemap:[ \t]*(.+)$/i

    # @param url [String] site URL; "/robots.txt" is appended (replacing a
    #   single trailing slash when present)
    def initialize(url)
      super(url.sub(/\/?$/, "/robots.txt"))
    end

    # Sitemap URLs declared in the robots.txt body, in document order.
    # Values are stripped so a trailing "\r" from CRLF line endings (or
    # stray spaces) does not end up inside the URL; blank values are dropped.
    #
    # @return [Array<String>]
    def sitemap_urls
      @sitemap_urls ||= body.scan(SITEMAP_DIRECTIVE).flatten.map(&:strip).reject(&:empty?)
    end
  end # Robots
end # Seota
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,35 @@ | ||
# NOTE(review): this is a diff hunk whose +/- markers were lost, so lines of
# the previous Site implementation are interleaved with the new one (note the
# two `class Site` headers and the two `initialize` definitions). It is not
# loadable Ruby as shown.
module Seota | ||
# NOTE(review): the next four lines look like the pre-commit version of the
# class (plain Site that only parsed the requested URL) — confirm against the diff.
class Site | ||
attr_reader :uri | ||
def initialize(requested_url) | ||
@uri = URI.parse(requested_url) | ||
# Site as a Resource, with validations requiring a robots.txt and at least
# one sitemap.
class Site < Resource | ||
include WhyValidationsSuckIn96::ValidationSupport | ||
|
||
setup_validations do | ||
validates_presence_of :robots, :message => "should provide a robots.txt" | ||
validates_length_of :sitemaps, :minimum => 1, :message => "should provide a sitemap of some sort" | ||
end | ||
|
||
# Parses +url+, substitutes '/' for a trailing occurrence of its path and
# passes the result to Resource#initialize. The path is interpolated into
# the regex unescaped.
def initialize(url) | ||
uri = URI.parse(url) | ||
super(uri.to_s.sub(%r[#{uri.path}$], '/')) | ||
end | ||
|
||
# NOTE(review): base_uri and resource reference pad_with_slash and appear to
# belong to the pre-commit version — confirm.
def base_uri; "#{uri.scheme}://#{uri.host}"; end | ||
def resource(path) "#{base_uri}#{pad_with_slash(path)}"; end | ||
# Memoized Robots for this site's url, or nil when Robots.new_or_nil returns
# nil. A nil result is not cached by `||=`, so the lookup is retried on the
# next call.
def robots | ||
@robots ||= Robots.new_or_nil(url) | ||
end | ||
|
||
# NOTE(review): resource_exists? has no closing `end` in this hunk; presumably
# part of the pre-commit version. It issues a HEAD request for the padded
# path and returns the argument on a success response, nil otherwise.
def resource_exists?(resource) | ||
response = nil | ||
Net::HTTP.start(uri.host, uri.port) { |http| response = http.head(pad_with_slash(resource)) } | ||
response.kind_of?(Net::HTTPSuccess) ? resource : nil | ||
# Memoized result of find_sitemaps.
def sitemaps | ||
@sitemaps ||= find_sitemaps | ||
end | ||
private | ||
# NOTE(review): pad_with_slash also lacks its `end` here; presumably pre-commit.
# It removes a leading "/" from each line of path and prefixes a single "/".
def pad_with_slash(path) | ||
"/#{path.gsub(/^\//, '')}" | ||
# Builds Sitemap objects from the URLs listed by robots when robots is
# present, otherwise from default_sitemap_urls. An empty list from robots is
# truthy, so the defaults are not tried in that case. URLs for which
# Sitemap.new_or_nil returns nil are dropped by compact.
def find_sitemaps | ||
sitemap_urls = (robots && robots.sitemap_urls) || default_sitemap_urls | ||
sitemap_urls.map do |sitemap_url| | ||
# Writes every candidate URL to stdout — NOTE(review): looks like leftover debug output.
puts sitemap_url | ||
Sitemap.new_or_nil(sitemap_url) | ||
end.compact | ||
end | ||
|
||
# Memoized fallback candidates: this site's URL with the path replaced by
# /sitemap.xml and /sitemap.txt (via replace_path).
def default_sitemap_urls | ||
@default_sitemap_urls ||= [replace_path("/sitemap.xml"), replace_path("/sitemap.txt")] | ||
end | ||
end # Site | ||
end # Seota |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,22 +1,32 @@ | ||
# NOTE(review): this is a diff hunk whose +/- markers were lost, so lines of
# the previous Sitemap implementation are interleaved with the new one (two
# `class Sitemap` headers, two `initialize` definitions). It is not loadable
# Ruby as shown.
module Seota | ||
# NOTE(review): presumably the pre-commit class header — confirm against the diff.
class Sitemap | ||
# Raised by Sitemap#page_urls when no format-specific support was mixed in.
# Subclasses Exception, so a bare `rescue` (StandardError) will not catch it.
class UnprocessableSitemap < Exception; end | ||
|
||
# page_urls implementation for XML sitemaps; expects the including object
# to provide +body+.
module XmlSitemapSupport | ||
# Memoized list returned by find_page_urls.
def page_urls | ||
@page_urls ||= find_page_urls | ||
end | ||
private | ||
# Parses body with Nokogiri::XML and returns the text content of every
# node matched by "urlset url loc".
def find_page_urls | ||
document = Nokogiri::XML(body) | ||
document.search("urlset url loc").map { |loc| loc.content } | ||
end | ||
end # XmlSitemapSupport | ||
|
||
class Sitemap < Resource | ||
|
||
# TODO: gz sitemap files and sitemap indexes | ||
attr_reader :path | ||
|
||
# NOTE(review): initialize(site) has no closing `end` of its own in this
# hunk and relies on site.resource_exists? / site.resource; presumably the
# pre-commit version — confirm.
def initialize(site) | ||
# TODO: may convert these conditions to individual modules | ||
if @path = site.resource_exists?("/sitemap.xml") | ||
sitemap = Nokogiri::XML(open(site.resource(path))) | ||
@pages = sitemap.search("urlset url loc").map { |loc| loc.content } | ||
elsif @path = site.resource_exists?("/sitemap.txt") | ||
@pages = [] | ||
# Fetches +url+ through Resource#initialize, then, when the content-type
# header ends in "/xml", includes XmlSitemapSupport into this instance's
# singleton class so that page_urls is overridden for this object only.
def initialize(url) | ||
super(url) | ||
# Writes the content-type header to stdout — NOTE(review): looks like leftover debug output.
puts self["content-type"] | ||
if self["content-type"] =~ /\/xml$/ | ||
(class << self; self; end).instance_eval { include XmlSitemapSupport } | ||
end | ||
end | ||
|
||
# NOTE(review): pages reads @pages, which is only assigned in initialize(site);
# presumably pre-commit. It wraps each stored URL in a Page.
def pages | ||
(@pages || []).map { |page_url| Page.new(page_url) } | ||
# Default page_urls: always raises, because no format support was mixed in.
def page_urls | ||
raise UnprocessableSitemap, "Unable to determine format of #{url}" | ||
end | ||
|
||
# True when @path is set. NOTE(review): @path is only assigned in
# initialize(site), so this presumably belongs to the pre-commit version.
def found?; !@path.nil?; end | ||
end # Sitemap | ||
|
||
end # Seota |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.