Starting to work better with sites and top-level sitemaps
gus committed Jan 27, 2010
1 parent f788e88 commit cc26bca
Showing 16 changed files with 232 additions and 116 deletions.
50 changes: 31 additions & 19 deletions app.rb
@@ -9,28 +9,40 @@ class App < Sinatra::Base
   # enable :dump_errors
   enable :static
 
-  get("/") { haml(:index) }
-
-  get %r{^/analyze/([\w.-]+)$} do |domain|
-    sitemap = Sitemap.new(Site.new("http://#{domain}"))
+  before do
     content_type 'application/json', :charset => 'utf-8'
-    {:sitemap => sitemap.pages.map(&:path)}.to_json
   end
 
-  get %r{^/analyze/([\w.-]+)/page/(.*)$} do |domain, path|
-    page = Page.new("http://#{domain}", path)
-    content_type 'application/json', :charset => 'utf-8'
-    result = {:uri => page.uri, :exists => page.exists?, :valid => false}
-    if page.exists?
-      result = result.merge({:valid => page.valid?,
-        :title => {:value => page.title, :failures => page.failures_on(:title)},
-        :description => {:value => page.description, :failures => page.failures_on(:description)},
-        :keywords => {:value => page.keywords, :failures => page.failures_on(:keywords)},
-        :single_word_density => page.single_word_density,
-        :double_word_density => page.double_word_density
-      })
-    end
-    result.to_json
+  error UnusableResource do
+    status 427
+    {:error => request.env['sinatra.error'].message}.to_json
   end
 
+  get("/") do
+    content_type 'text/html', :charset => 'utf-8'
+    haml(:index)
+  end
+
+  get %r{^/analyze/([\w.-]+)/sitemap/(.*)$} do |domain, url|
+    sitemap = Sitemap.new(url)
+    {:pages => sitemap.page_urls}.to_json
+  end
+
+  get %r{^/analyze/([\w.-]+)/page/(.*)$} do |domain, url|
+    page = Page.new(url)
+    {:uri => page.uri, :valid => page.valid?,
+     :title => {:value => page.title, :failures => page.failures_on(:title)},
+     :description => {:value => page.description, :failures => page.failures_on(:description)},
+     :keywords => {:value => page.keywords, :failures => page.failures_on(:keywords)},
+     :single_word_density => page.single_word_density,
+     :double_word_density => page.double_word_density
+    }.to_json
+  end
+
+  get %r{^/analyze/([\w.-]+)$} do |domain|
+    site = Site.new("http://#{domain}")
+    {:sitemaps => site.sitemaps.map(&:url)}.to_json
+  end
 
-end # App
+end
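
The routes now emit JSON by default (the before filter sets the content type) and surface fetch failures through the UnusableResource error handler. A rough sketch of exercising them with rack-test; the example.com domain and the response shapes in the comments are hypothetical:

    require 'rack/test'
    require 'json'

    include Rack::Test::Methods
    def app; App; end

    get '/analyze/example.com'                 # top level: discover sitemaps
    JSON.parse(last_response.body)['sitemaps'] # e.g. ["http://example.com/sitemap.xml"]

    get '/analyze/example.com/sitemap/http://example.com/sitemap.xml'
    JSON.parse(last_response.body)['pages']    # page URLs listed in that sitemap

    get '/analyze/example.com/page/http://example.com/'
    JSON.parse(last_response.body)['title']    # {"value" => ..., "failures" => [...]}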
47 changes: 28 additions & 19 deletions bin/seota
@@ -13,25 +13,34 @@ puts "Preparing to analyze #{site.base_uri}"
 #
 # Collection page urls from sitemap
 
-sitemap = Seota::Sitemap.new(site)
-(puts " ! No sitemap found"; exit(1)) unless sitemap.found?
-puts " + Using sitemap: #{sitemap.path}"
+robots = nil
+
+begin
+  robots = Seota::Robots.new(site)
+  # robots.sitemaps.each do |sitemap|
+  #   puts " + Using sitemap: #{sitemap.url}"
+  # end
+rescue Seota::NoRobotsFound
+  puts " ! No robots.txt found"
+  exit(1)
+end
 
 #
 # Analyze each page
 
-sitemap.pages.each do |page|
-  if page.exists?
-    puts " > #{page.uri}"
-    puts " Title: #{page.title}"
-    puts " Meta(description): #{page.description}"
-    puts " Meta(keywords): #{page.keywords}"
-    puts " Word Count: #{page.word_count}"
-    # puts " Single word density: #{page.single_word_density.inspect}"
-    # puts " Double word density: #{page.double_word_density.inspect}"
-    puts " Anchors with titles: 0"
-    puts " Anchors without titles: 0"
-  else
-    puts " ! Page not found: #{page.uri}"
-  end
-end # sitemap_resource
+robots.sitemaps.each do |sitemap|
+  puts " + Using sitemap: #{sitemap.url}"
+  sitemap.pages.each do |page|
+    if page.exists?
+      puts " > #{page.uri}"
+      puts " Title: #{page.title}"
+      puts " Meta(description): #{page.description}"
+      puts " Meta(keywords): #{page.keywords}"
+      # puts " Single word density: #{page.single_word_density.inspect}"
+      # puts " Double word density: #{page.double_word_density.inspect}"
+      puts " Anchors with titles: 0"
+      puts " Anchors without titles: 0"
+    else
+      puts " ! Page not found: #{page.uri}"
+    end
+  end # sitemap.pages
+end
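
Note that the script as committed calls robots.sitemaps and rescues Seota::NoRobotsFound, while the new Robots class exposes sitemap_urls and the Resource base raises UnusableResource. A hedged sketch of the same walk written against the committed APIs (example.com is made up):

    robots = Seota::Robots.new_or_nil("http://example.com")
    (puts " ! No robots.txt found"; exit(1)) if robots.nil?

    robots.sitemap_urls.each do |sitemap_url|
      sitemap = Seota::Sitemap.new_or_nil(sitemap_url)
      next if sitemap.nil?
      puts " + Using sitemap: #{sitemap.url}"
      sitemap.page_urls.each { |page_url| puts " > #{page_url}" }  # XML sitemaps only
    end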
3 changes: 3 additions & 0 deletions lib/seota.rb
@@ -3,8 +3,11 @@
 require 'net/http'
 require 'nokogiri'
 require 'open-uri'
+require 'whyvalidationssuckin96'
 
+require 'seota/resource'
 require 'seota/site'
+require 'seota/robots'
 require 'seota/sitemap'
 require 'seota/wordable'
 require 'seota/page'
37 changes: 2 additions & 35 deletions lib/seota/page.rb
@@ -1,10 +1,6 @@
-require 'whyvalidationssuckin96'
-
 module Seota
-  class Page
+  class Page < Resource
     include Seota::Wordable
-
-    attr_reader :uri
     include WhyValidationsSuckIn96::ValidationSupport
 
     setup_validations do
@@ -18,42 +14,13 @@ def failures_on(attribute)
       failed_validations.select {|v| v.attribute == attribute }.map(&:message)
     end
 
-    def initialize(*uri_parts)
-      @uri = URI.join(*uri_parts)
-    end
-
-    def path; uri.path.gsub(/^$/, "/"); end
-
     def title; document.css("head title").inner_text; end
     def description; read_meta_attribute("description", "content"); end
     def keywords; read_meta_attribute("keywords", "content"); end
 
-    def exists?
-      response = nil
-      Net::HTTP.start(uri.host, uri.port) { |http| response = http.head(path) }
-      response.kind_of?(Net::HTTPSuccess)
-    end
-
-    # TODO: Push this stuff into a word analysis class
-    def word_count; get_words.length; end
-
-    def single_word_density
-      get_words.inject(Hash.new(0)) { |dict, word| dict[word] += 1; dict }
-    end
-
-    def double_word_density
-      dict = Hash.new(0)
-      get_words[0..-2].each_with_index do |word, i|
-        dict["#{word} #{get_words[i+1]}"] += 1
-      end
-      dict
-    end
     private
-    def document; @document ||= Nokogiri::HTML(uri.open); end
-
-    def get_words
-      @words ||= parse_words(document.css("body").inner_text)
-    end
+    def document; @document ||= Nokogiri::HTML(body); end
 
     def read_meta_attribute(name, attribute_name)
       node = document.css("head meta[name=#{name}]")
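
Page now rides on Resource: the network fetch happens once in Resource#initialize, and document parses the already-downloaded body rather than re-opening the URI. A sketch of the resulting usage, with a hypothetical URL:

    page = Seota::Page.new("http://example.com/about")  # raises Seota::UnusableResource on a non-2xx
    if page.valid?                                      # runs the whyvalidationssuckin96 checks
      puts page.title
    else
      puts page.failures_on(:title).inspect             # failure messages for :title only
    end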
46 changes: 46 additions & 0 deletions lib/seota/resource.rb
@@ -0,0 +1,46 @@
+module Seota
+  class UnusableResource < Exception
+    def initialize(url)
+      super("#{url} is an unusable resource")
+    end
+  end
+
+  class Resource
+    def self.new_or_nil(url)
+      new(url)
+    rescue UnusableResource
+      nil
+    end
+
+    attr_reader :url, :uri
+
+    def initialize(url)
+      @url = url
+      @uri, @response = follow(url)
+    end
+
+    def [](header_key)
+      @response[header_key]
+    end
+
+    def body
+      @response.body
+    end
+    private
+    def follow(url)
+      uri = URI.parse(url)
+      response = Net::HTTP.get_response(uri)
+      if [Net::HTTPMovedPermanently, Net::HTTPFound].include?(response.class)
+        follow(response["location"])
+      elsif response.kind_of?(Net::HTTPSuccess)
+        [uri, response]
+      else
+        raise UnusableResource, url
+      end
+    end
+
+    def replace_path(new_path)
+      uri.to_s.sub(%r[#{uri.path}$], new_path)
+    end
+  end # Resource
+end # Seota
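
Resource centralizes fetching: follow chases 301/302 Location headers recursively (with no loop guard yet), hands back the final URI and response on success, and raises UnusableResource otherwise; new_or_nil turns that exception into nil. A quick sketch with hypothetical URLs:

    res = Seota::Resource.new("http://example.com/old-home")  # redirects followed transparently
    res.uri.to_s              # the final URI after any redirects
    res["content-type"]       # response header access via Resource#[]
    res.body                  # the downloaded body

    Seota::Resource.new_or_nil("http://example.com/missing")  # => nil rather than raising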
11 changes: 11 additions & 0 deletions lib/seota/robots.rb
@@ -0,0 +1,11 @@
+module Seota
+  class Robots < Resource
+    def initialize(url)
+      super(url.sub(/\/?$/, "/robots.txt"))
+    end
+
+    def sitemap_urls
+      @sitemaps_urls ||= body.scan(/^(?:Sitemap:\s+)(.+)$/).flatten
+    end
+  end # Robots
+end # Seota
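
sitemap_urls scans the fetched robots.txt body for Sitemap: directives, one URL per line. Worked directly against the same regex, with a made-up robots.txt body:

    body = "User-agent: *\nDisallow:\nSitemap: http://example.com/sitemap.xml\nSitemap: http://example.com/news.xml\n"
    body.scan(/^(?:Sitemap:\s+)(.+)$/).flatten
    # => ["http://example.com/sitemap.xml", "http://example.com/news.xml"]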
38 changes: 26 additions & 12 deletions lib/seota/site.rb
@@ -1,21 +1,35 @@
 module Seota
-  class Site
-    attr_reader :uri
-    def initialize(requested_url)
-      @uri = URI.parse(requested_url)
+  class Site < Resource
+    include WhyValidationsSuckIn96::ValidationSupport
+
+    setup_validations do
+      validates_presence_of :robots, :message => "should provide a robots.txt"
+      validates_length_of :sitemaps, :minimum => 1, :message => "should provide a sitemap of some sort"
+    end
+
+    def initialize(url)
+      uri = URI.parse(url)
+      super(uri.to_s.sub(%r[#{uri.path}$], '/'))
     end
 
-    def base_uri; "#{uri.scheme}://#{uri.host}"; end
-    def resource(path) "#{base_uri}#{pad_with_slash(path)}"; end
+    def robots
+      @robots ||= Robots.new_or_nil(url)
+    end
 
-    def resource_exists?(resource)
-      response = nil
-      Net::HTTP.start(uri.host, uri.port) { |http| response = http.head(pad_with_slash(resource)) }
-      response.kind_of?(Net::HTTPSuccess) ? resource : nil
+    def sitemaps
+      @sitemaps ||= find_sitemaps
     end
     private
-    def pad_with_slash(path)
-      "/#{path.gsub(/^\//, '')}"
+    def find_sitemaps
+      sitemap_urls = (robots && robots.sitemap_urls) || default_sitemap_urls
+      sitemap_urls.map do |sitemap_url|
+        puts sitemap_url
+        Sitemap.new_or_nil(sitemap_url)
+      end.compact
+    end
+
+    def default_sitemap_urls
+      @default_sitemap_urls ||= [replace_path("/sitemap.xml"), replace_path("/sitemap.txt")]
     end
   end # Site
 end # Seota
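
Site now normalizes whatever URL it receives down to the site root before the Resource fetch, then discovers sitemaps from robots.txt, falling back to /sitemap.xml and /sitemap.txt. A sketch, with a hypothetical URL:

    site = Seota::Site.new("http://example.com/deep/page.html")
    site.url         # => "http://example.com/" (the path collapses to "/")
    site.robots      # a Robots resource, or nil when robots.txt is unusable
    site.sitemaps    # usable Sitemap instances; failed candidates are compacted away
    site.valid?      # robots.txt present and at least one sitemap found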
36 changes: 23 additions & 13 deletions lib/seota/sitemap.rb
@@ -1,22 +1,32 @@
 module Seota
-  class Sitemap
+  class UnprocessableSitemap < Exception; end
+
+  module XmlSitemapSupport
+    def page_urls
+      @page_urls ||= find_page_urls
+    end
+    private
+    def find_page_urls
+      document = Nokogiri::XML(body)
+      document.search("urlset url loc").map { |loc| loc.content }
+    end
+  end # XmlSitemapSupport
+
+  class Sitemap < Resource
 
     # TODO: gz sitemap files and sitemap indexes
-    attr_reader :path
 
-    def initialize(site)
-      # TODO: may convert these conditions to individual modules
-      if @path = site.resource_exists?("/sitemap.xml")
-        sitemap = Nokogiri::XML(open(site.resource(path)))
-        @pages = sitemap.search("urlset url loc").map { |loc| loc.content }
-      elsif @path = site.resource_exists?("/sitemap.txt")
-        @pages = []
-      end
+    def initialize(url)
+      super(url)
+      puts self["content-type"]
+      if self["content-type"] =~ /\/xml$/
+        (class << self; self; end).instance_eval { include XmlSitemapSupport }
+      end
     end
 
-    def pages
-      (@pages || []).map { |page_url| Page.new(page_url) }
+    def page_urls
+      raise UnprocessableSitemap, "Unable to determine format of #{url}"
     end
-
-    def found?; !@path.nil?; end
   end # Sitemap
 
 end # Seota
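
The constructor sniffs the Content-Type header and, for XML responses, includes XmlSitemapSupport into that one instance's singleton class (effectively sitemap.extend(XmlSitemapSupport)), so page_urls is overridden per instance while every other format falls through to the raising default. A sketch with hypothetical URLs:

    xml = Seota::Sitemap.new("http://example.com/sitemap.xml")  # served as text/xml
    xml.page_urls   # <urlset><url><loc> entries via XmlSitemapSupport

    txt = Seota::Sitemap.new("http://example.com/sitemap.txt")  # served as text/plain
    txt.page_urls   # raises Seota::UnprocessableSitemap; no text support yet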
18 changes: 17 additions & 1 deletion lib/seota/wordable.rb
@@ -3,7 +3,23 @@ module Wordable
     COMMON_WORDS = %w[the of and to a in that is was he for it with as his on be at by i this had not are but
       from or have an they which one you were her all she there would their we him been has when who will
       more no if]
 
+    def single_word_density
+      get_words.inject(Hash.new(0)) { |dict, word| dict[word] += 1; dict }
+    end
+
+    def double_word_density
+      dict = Hash.new(0)
+      get_words[0..-2].each_with_index do |word, i|
+        dict["#{word} #{get_words[i+1]}"] += 1
+      end
+      dict
+    end
+
+    def get_words
+      @words ||= parse_words(document.css("body").inner_text)
+    end
+
     def parse_words(str)
       to_parse = str.downcase
       to_parse.gsub(/\W/i, ' ').gsub(/ +/, ' ').strip.split.reject { |word| COMMON_WORDS.include?(word) }
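
The density helpers moved here from Page: single_word_density counts word occurrences and double_word_density counts adjacent bigrams, both over the stop-word-filtered list from get_words. A worked sketch using a hypothetical probe class that stubs out the document-backed get_words:

    class WordProbe
      include Seota::Wordable
      def initialize(words); @words = words; end
      def get_words; @words; end  # stands in for the document-backed version
    end

    probe = WordProbe.new(%w[quick brown fox quick blue fox])
    probe.single_word_density  # => {"quick"=>2, "brown"=>1, "fox"=>2, "blue"=>1}
    probe.double_word_density  # => {"quick brown"=>1, "brown fox"=>1, "fox quick"=>1, "quick blue"=>1, "blue fox"=>1}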
10 changes: 3 additions & 7 deletions public/javascripts/app/page.js
@@ -100,12 +100,8 @@ Seota.Page.prototype = $.extend({}, {
     container.empty();
     container.append(Mustache.to_html("<h5><a href='{{ uri }}' target='_blank'>{{ uri }}</a></h5>",
                                       {"uri": this.details.uri}));
-    if (this.details.exists) {
-      var meta_div = $("<div></div>").addClass("half").
-        append(this._title_tag()).append(this._description_tag()).append(this._keywords_tag());
-      container.append(meta_div).append($("<div></div>").addClass("half").append(this._densities_tag()));
-    } else {
-      container.append($("<p></p>").text("Page could not be found"));
-    }
+    var meta_div = $("<div></div>").addClass("half").
+      append(this._title_tag()).append(this._description_tag()).append(this._keywords_tag());
+    container.append(meta_div).append($("<div></div>").addClass("half").append(this._densities_tag()));
   }
 });
