Starting to work better with sites and top-level sitemaps
gus committed Jan 27, 2010
1 parent f788e88 commit cc26bca
Showing 16 changed files with 232 additions and 116 deletions.
50 changes: 31 additions & 19 deletions app.rb
@@ -9,28 +9,40 @@ class App < Sinatra::Base
   # enable :dump_errors
   enable :static
 
-  get("/") { haml(:index) }
-
-  get %r{^/analyze/([\w.-]+)$} do |domain|
-    sitemap = Sitemap.new(Site.new("http://#{domain}"))
+  before do
     content_type 'application/json', :charset => 'utf-8'
-    {:sitemap => sitemap.pages.map(&:path)}.to_json
   end
 
-  get %r{^/analyze/([\w.-]+)/page/(.*)$} do |domain, path|
-    page = Page.new("http://#{domain}", path)
-    content_type 'application/json', :charset => 'utf-8'
-    result = {:uri => page.uri, :exists => page.exists?, :valid => false}
-    if page.exists?
-      result = result.merge({:valid => page.valid?,
-        :title => {:value => page.title, :failures => page.failures_on(:title)},
-        :description => {:value => page.description, :failures => page.failures_on(:description)},
-        :keywords => {:value => page.keywords, :failures => page.failures_on(:keywords)},
-        :single_word_density => page.single_word_density,
-        :double_word_density => page.double_word_density
-      })
-    end
-    result.to_json
+  error UnusableResource do
+    status 427
+    {:error => request.env['sinatra.error'].message}.to_json
   end
 
+  get("/") do
+    content_type 'text/html', :charset => 'utf-8'
+    haml(:index)
+  end
+
+  get %r{^/analyze/([\w.-]+)/sitemap/(.*)$} do |domain, url|
+    sitemap = Sitemap.new(url)
+    {:pages => sitemap.page_urls}.to_json
+  end
+
+  get %r{^/analyze/([\w.-]+)/page/(.*)$} do |domain, url|
+    page = Page.new(url)
+    {:uri => page.uri, :valid => page.valid?,
+     :title => {:value => page.title, :failures => page.failures_on(:title)},
+     :description => {:value => page.description, :failures => page.failures_on(:description)},
+     :keywords => {:value => page.keywords, :failures => page.failures_on(:keywords)},
+     :single_word_density => page.single_word_density,
+     :double_word_density => page.double_word_density
+    }.to_json
+  end
+
+  get %r{^/analyze/([\w.-]+)$} do |domain|
+    site = Site.new("http://#{domain}")
+    {:sitemaps => site.sitemaps.map(&:url)}.to_json
+  end
 
-end # App
+end
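
The routes now emit JSON by default (the before filter sets the content type) and surface fetch failures through the UnusableResource error handler. A rough sketch of exercising them with rack-test; the example.com domain and the response shapes in the comments are hypothetical:

    require 'rack/test'
    require 'json'

    include Rack::Test::Methods
    def app; App; end

    get '/analyze/example.com'                 # top level: discover sitemaps
    JSON.parse(last_response.body)['sitemaps'] # e.g. ["http://example.com/sitemap.xml"]

    get '/analyze/example.com/sitemap/http://example.com/sitemap.xml'
    JSON.parse(last_response.body)['pages']    # page URLs listed in that sitemap

    get '/analyze/example.com/page/http://example.com/'
    JSON.parse(last_response.body)['title']    # {"value" => ..., "failures" => [...]}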
47 changes: 28 additions & 19 deletions bin/seota
@@ -13,25 +13,34 @@ puts "Preparing to analyze #{site.base_uri}"
 #
 # Collection page urls from sitemap
 
-sitemap = Seota::Sitemap.new(site)
-(puts " ! No sitemap found"; exit(1)) unless sitemap.found?
-puts " + Using sitemap: #{sitemap.path}"
+robots = nil
+
+begin
+  robots = Seota::Robots.new(site)
+  # robots.sitemaps.each do |sitemap|
+  #   puts " + Using sitemap: #{sitemap.url}"
+  # end
+rescue Seota::NoRobotsFound
+  puts " ! No robots.txt found"
+  exit(1)
+end
 
 #
 # Analyze each page
 
-sitemap.pages.each do |page|
-  if page.exists?
-    puts " > #{page.uri}"
-    puts " Title: #{page.title}"
-    puts " Meta(description): #{page.description}"
-    puts " Meta(keywords): #{page.keywords}"
-    puts " Word Count: #{page.word_count}"
-    # puts " Single word density: #{page.single_word_density.inspect}"
-    # puts " Double word density: #{page.double_word_density.inspect}"
-    puts " Anchors with titles: 0"
-    puts " Anchors without titles: 0"
-  else
-    puts " ! Page not found: #{page.uri}"
-  end
-end # sitemap_resource
+robots.sitemaps.each do |sitemap|
+  puts " + Using sitemap: #{sitemap.url}"
+  sitemap.pages.each do |page|
+    if page.exists?
+      puts " > #{page.uri}"
+      puts " Title: #{page.title}"
+      puts " Meta(description): #{page.description}"
+      puts " Meta(keywords): #{page.keywords}"
+      # puts " Single word density: #{page.single_word_density.inspect}"
+      # puts " Double word density: #{page.double_word_density.inspect}"
+      puts " Anchors with titles: 0"
+      puts " Anchors without titles: 0"
+    else
+      puts " ! Page not found: #{page.uri}"
+    end
+  end # sitemap.pages
+end
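
Note that the script as committed calls robots.sitemaps and rescues Seota::NoRobotsFound, while the new Robots class exposes sitemap_urls and the Resource base raises UnusableResource. A hedged sketch of the same walk written against the committed APIs (example.com is made up):

    robots = Seota::Robots.new_or_nil("http://example.com")
    (puts " ! No robots.txt found"; exit(1)) if robots.nil?

    robots.sitemap_urls.each do |sitemap_url|
      sitemap = Seota::Sitemap.new_or_nil(sitemap_url)
      next if sitemap.nil?
      puts " + Using sitemap: #{sitemap.url}"
      sitemap.page_urls.each { |page_url| puts " > #{page_url}" }  # XML sitemaps only
    end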
3 changes: 3 additions & 0 deletions lib/seota.rb
@@ -3,8 +3,11 @@
 require 'net/http'
 require 'nokogiri'
 require 'open-uri'
+require 'whyvalidationssuckin96'
 
+require 'seota/resource'
 require 'seota/site'
+require 'seota/robots'
 require 'seota/sitemap'
 require 'seota/wordable'
 require 'seota/page'
37 changes: 2 additions & 35 deletions lib/seota/page.rb
@@ -1,10 +1,6 @@
-require 'whyvalidationssuckin96'
-
 module Seota
-  class Page
+  class Page < Resource
     include Seota::Wordable
-
-    attr_reader :uri
     include WhyValidationsSuckIn96::ValidationSupport
 
     setup_validations do
@@ -18,42 +14,13 @@ def failures_on(attribute)
       failed_validations.select {|v| v.attribute == attribute }.map(&:message)
     end
 
-    def initialize(*uri_parts)
-      @uri = URI.join(*uri_parts)
-    end
-
-    def path; uri.path.gsub(/^$/, "/"); end
-
     def title; document.css("head title").inner_text; end
     def description; read_meta_attribute("description", "content"); end
     def keywords; read_meta_attribute("keywords", "content"); end
 
-    def exists?
-      response = nil
-      Net::HTTP.start(uri.host, uri.port) { |http| response = http.head(path) }
-      response.kind_of?(Net::HTTPSuccess)
-    end
-
-    # TODO: Push this stuff into a word analysis class
-    def word_count; get_words.length; end
-
-    def single_word_density
-      get_words.inject(Hash.new(0)) { |dict, word| dict[word] += 1; dict }
-    end
-
-    def double_word_density
-      dict = Hash.new(0)
-      get_words[0..-2].each_with_index do |word, i|
-        dict["#{word} #{get_words[i+1]}"] += 1
-      end
-      dict
-    end
     private
-    def document; @document ||= Nokogiri::HTML(uri.open); end
-
-    def get_words
-      @words ||= parse_words(document.css("body").inner_text)
-    end
+    def document; @document ||= Nokogiri::HTML(body); end
 
     def read_meta_attribute(name, attribute_name)
       node = document.css("head meta[name=#{name}]")
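
Page now rides on Resource: the network fetch happens once in Resource#initialize, and document parses the already-downloaded body rather than re-opening the URI. A sketch of the resulting usage, with a hypothetical URL:

    page = Seota::Page.new("http://example.com/about")  # raises Seota::UnusableResource on a non-2xx
    if page.valid?                                      # runs the whyvalidationssuckin96 checks
      puts page.title
    else
      puts page.failures_on(:title).inspect             # failure messages for :title only
    end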
46 changes: 46 additions & 0 deletions lib/seota/resource.rb
@@ -0,0 +1,46 @@
+module Seota
+  class UnusableResource < Exception
+    def initialize(url)
+      super("#{url} is an unusable resource")
+    end
+  end
+
+  class Resource
+    def self.new_or_nil(url)
+      new(url)
+    rescue UnusableResource
+      nil
+    end
+
+    attr_reader :url, :uri
+
+    def initialize(url)
+      @url = url
+      @uri, @response = follow(url)
+    end
+
+    def [](header_key)
+      @response[header_key]
+    end
+
+    def body
+      @response.body
+    end
+    private
+    def follow(url)
+      uri = URI.parse(url)
+      response = Net::HTTP.get_response(uri)
+      if [Net::HTTPMovedPermanently, Net::HTTPFound].include?(response.class)
+        follow(response["location"])
+      elsif response.kind_of?(Net::HTTPSuccess)
+        [uri, response]
+      else
+        raise UnusableResource, url
+      end
+    end
+
+    def replace_path(new_path)
+      uri.to_s.sub(%r[#{uri.path}$], new_path)
+    end
+  end # Resource
+end # Seota
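
Resource centralizes fetching: follow chases 301/302 Location headers recursively (with no loop guard yet), hands back the final URI and response on success, and raises UnusableResource otherwise; new_or_nil turns that exception into nil. A quick sketch with hypothetical URLs:

    res = Seota::Resource.new("http://example.com/old-home")  # redirects followed transparently
    res.uri.to_s              # the final URI after any redirects
    res["content-type"]       # response header access via Resource#[]
    res.body                  # the downloaded body

    Seota::Resource.new_or_nil("http://example.com/missing")  # => nil rather than raising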
11 changes: 11 additions & 0 deletions lib/seota/robots.rb
@@ -0,0 +1,11 @@
+module Seota
+  class Robots < Resource
+    def initialize(url)
+      super(url.sub(/\/?$/, "/robots.txt"))
+    end
+
+    def sitemap_urls
+      @sitemaps_urls ||= body.scan(/^(?:Sitemap:\s+)(.+)$/).flatten
+    end
+  end # Robots
+end # Seota
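
sitemap_urls scans the fetched robots.txt body for Sitemap: directives, one URL per line. Worked directly against the same regex, with a made-up robots.txt body:

    body = "User-agent: *\nDisallow:\nSitemap: http://example.com/sitemap.xml\nSitemap: http://example.com/news.xml\n"
    body.scan(/^(?:Sitemap:\s+)(.+)$/).flatten
    # => ["http://example.com/sitemap.xml", "http://example.com/news.xml"]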
38 changes: 26 additions & 12 deletions lib/seota/site.rb
@@ -1,21 +1,35 @@
 module Seota
-  class Site
-    attr_reader :uri
-    def initialize(requested_url)
-      @uri = URI.parse(requested_url)
+  class Site < Resource
+    include WhyValidationsSuckIn96::ValidationSupport
+
+    setup_validations do
+      validates_presence_of :robots, :message => "should provide a robots.txt"
+      validates_length_of :sitemaps, :minimum => 1, :message => "should provide a sitemap of some sort"
+    end
+
+    def initialize(url)
+      uri = URI.parse(url)
+      super(uri.to_s.sub(%r[#{uri.path}$], '/'))
     end
 
-    def base_uri; "#{uri.scheme}://#{uri.host}"; end
-    def resource(path) "#{base_uri}#{pad_with_slash(path)}"; end
+    def robots
+      @robots ||= Robots.new_or_nil(url)
+    end
 
-    def resource_exists?(resource)
-      response = nil
-      Net::HTTP.start(uri.host, uri.port) { |http| response = http.head(pad_with_slash(resource)) }
-      response.kind_of?(Net::HTTPSuccess) ? resource : nil
+    def sitemaps
+      @sitemaps ||= find_sitemaps
     end
     private
-    def pad_with_slash(path)
-      "/#{path.gsub(/^\//, '')}"
+    def find_sitemaps
+      sitemap_urls = (robots && robots.sitemap_urls) || default_sitemap_urls
+      sitemap_urls.map do |sitemap_url|
+        puts sitemap_url
+        Sitemap.new_or_nil(sitemap_url)
+      end.compact
+    end
+
+    def default_sitemap_urls
+      @default_sitemap_urls ||= [replace_path("/sitemap.xml"), replace_path("/sitemap.txt")]
     end
   end # Site
 end # Seota
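
Site now normalizes whatever URL it receives down to the site root before the Resource fetch, then discovers sitemaps from robots.txt, falling back to /sitemap.xml and /sitemap.txt. A sketch, with a hypothetical URL:

    site = Seota::Site.new("http://example.com/deep/page.html")
    site.url         # => "http://example.com/" (the path collapses to "/")
    site.robots      # a Robots resource, or nil when robots.txt is unusable
    site.sitemaps    # usable Sitemap instances; failed candidates are compacted away
    site.valid?      # robots.txt present and at least one sitemap found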
36 changes: 23 additions & 13 deletions lib/seota/sitemap.rb
@@ -1,22 +1,32 @@
 module Seota
-  class Sitemap
+  class UnprocessableSitemap < Exception; end
+
+  module XmlSitemapSupport
+    def page_urls
+      @page_urls ||= find_page_urls
+    end
+    private
+    def find_page_urls
+      document = Nokogiri::XML(body)
+      document.search("urlset url loc").map { |loc| loc.content }
+    end
+  end # XmlSitemapSupport
+
+  class Sitemap < Resource
 
     # TODO: gz sitemap files and sitemap indexes
-    attr_reader :path
 
-    def initialize(site)
-      # TODO: may convert these conditions to individual modules
-      if @path = site.resource_exists?("/sitemap.xml")
-        sitemap = Nokogiri::XML(open(site.resource(path)))
-        @pages = sitemap.search("urlset url loc").map { |loc| loc.content }
-      elsif @path = site.resource_exists?("/sitemap.txt")
-        @pages = []
-      end
+    def initialize(url)
+      super(url)
+      puts self["content-type"]
+      if self["content-type"] =~ /\/xml$/
+        (class << self; self; end).instance_eval { include XmlSitemapSupport }
+      end
     end
 
-    def pages
-      (@pages || []).map { |page_url| Page.new(page_url) }
+    def page_urls
+      raise UnprocessableSitemap, "Unable to determine format of #{url}"
     end
-
-    def found?; !@path.nil?; end
   end # Sitemap
 
 end # Seota
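
The constructor sniffs the Content-Type header and, for XML responses, includes XmlSitemapSupport into that one instance's singleton class (effectively sitemap.extend(XmlSitemapSupport)), so page_urls is overridden per instance while every other format falls through to the raising default. A sketch with hypothetical URLs:

    xml = Seota::Sitemap.new("http://example.com/sitemap.xml")  # served as text/xml
    xml.page_urls   # <urlset><url><loc> entries via XmlSitemapSupport

    txt = Seota::Sitemap.new("http://example.com/sitemap.txt")  # served as text/plain
    txt.page_urls   # raises Seota::UnprocessableSitemap; no text support yet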
18 changes: 17 additions & 1 deletion lib/seota/wordable.rb
@@ -3,7 +3,23 @@ module Wordable
     COMMON_WORDS = %w[the of and to a in that is was he for it with as his on be at by i this had not are but
       from or have an they which one you were her all she there would their we him been has when who will
       more no if]
 
+    def single_word_density
+      get_words.inject(Hash.new(0)) { |dict, word| dict[word] += 1; dict }
+    end
+
+    def double_word_density
+      dict = Hash.new(0)
+      get_words[0..-2].each_with_index do |word, i|
+        dict["#{word} #{get_words[i+1]}"] += 1
+      end
+      dict
+    end
+
+    def get_words
+      @words ||= parse_words(document.css("body").inner_text)
+    end
+
     def parse_words(str)
       to_parse = str.downcase
       to_parse.gsub(/\W/i, ' ').gsub(/ +/, ' ').strip.split.reject { |word| COMMON_WORDS.include?(word) }
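
The density helpers moved here from Page: single_word_density counts word occurrences and double_word_density counts adjacent bigrams, both over the stop-word-filtered list from get_words. A worked sketch using a hypothetical probe class that stubs out the document-backed get_words:

    class WordProbe
      include Seota::Wordable
      def initialize(words); @words = words; end
      def get_words; @words; end  # stands in for the document-backed version
    end

    probe = WordProbe.new(%w[quick brown fox quick blue fox])
    probe.single_word_density  # => {"quick"=>2, "brown"=>1, "fox"=>2, "blue"=>1}
    probe.double_word_density  # => {"quick brown"=>1, "brown fox"=>1, "fox quick"=>1, "quick blue"=>1, "blue fox"=>1}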
10 changes: 3 additions & 7 deletions public/javascripts/app/page.js
@@ -100,12 +100,8 @@ Seota.Page.prototype = $.extend({}, {
     container.empty();
     container.append(Mustache.to_html("<h5><a href='{{ uri }}' target='_blank'>{{ uri }}</a></h5>",
                                       {"uri": this.details.uri}));
-    if (this.details.exists) {
-      var meta_div = $("<div></div>").addClass("half").
-        append(this._title_tag()).append(this._description_tag()).append(this._keywords_tag());
-      container.append(meta_div).append($("<div></div>").addClass("half").append(this._densities_tag()));
-    } else {
-      container.append($("<p></p>").text("Page could not be found"));
-    }
+    var meta_div = $("<div></div>").addClass("half").
+      append(this._title_tag()).append(this._description_tag()).append(this._keywords_tag());
+    container.append(meta_div).append($("<div></div>").addClass("half").append(this._densities_tag()));
   }
 });
