improve show loading tests and structure

tardate · Oct 22, 2011 · 7c1d642 · 7c1d642
1 parent df11b22
commit 7c1d642
Show file tree

Hide file tree

Showing 6 changed files with 611 additions and 49 deletions.
diff --git a/lib/navd/scraper/show_loader.rb b/lib/navd/scraper/show_loader.rb
@@ -3,16 +3,15 @@ class ShowLoader
     attr_accessor :number, :spider, :uri, :found, :published, :errors
     attr_reader :attributes, :show_notes
 
-    attr_reader :p_shownotes_page # Nokogiri::HTML::Document of the main shownotes page being processed
+    attr_reader :p_shownotes_main # Nokogiri::HTML::Document of the main shownotes page being processed
 
     # +number+ - show number to load
     def initialize(number)
       @errors = []
       @number = number
-      @spider = Navd::Scraper::Spider.new
       @attributes = {}
       @show_notes = []
-      @p_shownotes_page = nil
+      @spider = Navd::Scraper::Spider.new
     end
 
     # Returns true if show details have been scraped without error
@@ -23,7 +22,7 @@ def loaded?
     # Loads all the show details for given show
     # If an error is encountered, +@errors+ will be present
     def scan_show_assets
-      @uri, @p_shownotes_page = spider.get_page_for_show(number)
+      @uri, @p_shownotes_main = spider.get_page_for_show(number)
       if spider.errors.present?
         @errors += spider.errors
         return false
@@ -43,32 +42,22 @@ def scan_show_assets
         :published_date => published_date,
         :cover_art_url => cover_art_url,
         :assets_url => assets_url,
-        :url =>episode_url
+        :url => episode_url,
+        :credits => credits
       })
-      # TODO: credits
       show_notes
       errors.empty?
     end
 
+
     # Returns an array of hashes with show note detail (:name,:meme_name,:description,:url)
-    # +show_note_details_uri+ URI to show note details page (e.g. http://349.nashownotes.com/shownotes)
+    # def extract_show_notes_from_page(page)
     def show_notes
-      @show_notes ||= extract_notes_from_page(get_all_notes_page)
-    end
-
-    # TODO: need some refactoring and exception handling
-    def get_all_notes_page
-      assets_page = spider.get_page(show_note_details_uri)
-      notes_uri = show_note_assets_uri.merge(extract_show_notes_link(assets_page))
-      notes_page = spider.get_page(notes_uri)
-      all_notes_uri = show_note_assets_uri.merge(extract_nodes(notes_page,:all_notes)[:href])
-      spider.get_page(all_notes_uri)
-    end
-
-    def extract_notes_from_page(page)
+      return @show_notes if @show_notes
+      # TODO: test for p_shownotes_detail_all
       notes = []
       current_meme = nil
-      page.at_css('ul.ulDirectory').children.each do |n|
+      p_shownotes_detail_all.at_css('ul.ulDirectory').children.each do |n|
         if n.name=='li' && n[:class]=='directoryItem'
           current_meme = n.text
         elsif n.name=='ul' && n[:class]=='ulDirectory'
@@ -88,8 +77,14 @@ def extract_notes_from_page(page)
           end
         end
       end
-      notes
+      @show_notes = notes
+    end
+
+    # TODO: parse/extract credits
+    def credits
+      @credits ||= nil
     end
+    protected :credits
 
     # Returns the show date (as extracted from the audio file name)
     # Dodgy approach, but seems the most reliable way of automatically getting the show date
@@ -106,48 +101,81 @@ def published_date
     # Returns the link to audio file from the main shownotes page
     # e.g. http://m.podshow.com/media/15412/episodes/299798/noagenda-299798-10-20-2011.mp3
     def mp3_url
-      @mp3_url ||= extract_nodes(p_shownotes_page,:mp3)[:href]
+      @mp3_url ||= extract_nodes(p_shownotes_main,:mp3)[:href]
     end
     protected :mp3_url
 
+    # Returns the link to official show page
+    # e.g. http://blog.curry.com/stories/2011/10/20/na34920111020.html
+    def episode_url
+      @episode_url ||= extract_nodes(p_shownotes_main,:web)[:href]
+    end
+    protected :episode_url
+
     # Returns the link to cover art
     # e.g. http://dropbox.curry.com/ShowNotesArchive/2011/10/NA-349-2011-10-20/Assets/ns349art.png
     def cover_art_url
-      @cover_art_url ||= extract_nodes(p_shownotes_page,:cover_art)[:href]
+      @cover_art_url ||= extract_nodes(p_shownotes_main,:cover_art)[:href]
     end
     protected :cover_art_url
 
     # Returns the link to show assets page
     # e.g. http://349.nashownotes.com/assets
     def assets_url
-      @assets_url ||= uri.merge(extract_nodes(p_shownotes_page,:assets)[:href]).to_s
+      @assets_url ||= uri.merge(extract_nodes(p_shownotes_main,:assets)[:href]).to_s
     end
     protected :assets_url
 
-    # Returns the URI for link to show notes detail page
+    # Returns the URI to show notes menu page
     # e.g. http://349.nashownotes.com/shownotes
-    def show_note_details_uri
-      @show_note_details_uri ||= uri.merge(extract_nodes(p_shownotes_page,:notes)[:href])
+    def shownotes_menu_uri
+      @shownotes_menu_uri ||= uri.merge(extract_nodes(p_shownotes_main,:notes)[:href])
+    end
+    # Returns Nokogiri::HTML::Document of the main shownotes menu page being processed
+    def p_shownotes_menu
+      @p_shownotes_menu ||= spider.get_page(shownotes_menu_uri)
     end
-    protected :show_note_details_uri
+    protected :shownotes_menu_uri, :p_shownotes_menu
 
+    # e.g. http://349.nashownotes.com/shownotes/na34920111020Shownotes
+    def shownotes_detail_main_uri
+      @shownotes_detail_main_uri ||= uri.merge(extract_nodes(p_shownotes_menu,:notes)[:href])
+    end
+    def p_shownotes_detail_main
+      @p_show_note_detail_main ||= spider.get_page(shownotes_detail_main_uri)
+    end
+    protected :shownotes_detail_main_uri, :p_shownotes_detail_main
+
+    # e.g. http://349.nashownotes.com/shownotes/na34920111020Shownotes/expandAllTopics
+    def shownotes_detail_all_uri
+      @shownotes_detail_all_uri ||= uri.merge(extract_nodes(p_shownotes_detail_main,:all_notes)[:href])
+    end
+    def p_shownotes_detail_all
+      @p_shownotes_detail_all ||= spider.get_page(shownotes_detail_all_uri)
+    end
+    protected :shownotes_detail_all_uri, :p_shownotes_detail_all
+
+    # Returns the URI to show credits page
     # http://349.nashownotes.com/shownotes/na34920111020Credits
-    # http://349.nashownotes.com/shownotes/na34920111020Shownotes
-
-    # Returns the link to official show page
-    # e.g. http://blog.curry.com/stories/2011/10/20/na34920111020.html
-    def episode_url
-      @episode_url ||= extract_nodes(p_shownotes_page,:web)[:href]
+    def credits_uri
+      @credits_uri ||= uri.merge(extract_nodes(p_shownotes_menu,:credits)[:href])
     end
-    protected :episode_url
+    def p_credits
+      @p_credits ||= spider.get_page(credits_uri)
+    end
+    protected :credits_uri, :p_credits
 
+    # http://349.nashownotes.com/shownotes/na34920111020Shownotes
+
     # Set of algorithms to extracts parts of a page
     # +page+ is a Nokogiri::HTML::Document
     # +item+ - symbol for note type required
     def extract_nodes(page,item)
       result = case item
       when :all_notes
         page.css('a.directoryLink').select{|n| n.text[/expand all/i] }.first
+      when :credits
+        page.css('a.directoryLink').select{|n| n.text[/credits/i] }.first
       when :assets
         page.css('a.directoryLink').select{|n| n.text[/Assets/] }.first
       when :cover_art

diff --git a/spec/models/scraper/control_spec.rb b/spec/models/scraper/control_spec.rb
@@ -1,4 +1,6 @@
 require 'spec_helper'
+require 'support/scraper_mocks'
+include ScraperMocksHelper
 
 describe "Navd::Scraper::Control" do
   let(:scraper_control) { Navd::Scraper::Control.new }
@@ -19,7 +21,7 @@
     subject { scraper_control.load_show(show_number) }
     before {
       Navd::Scraper::Spider.any_instance.stub(:get_page).and_return(Nokogiri::HTML(published_show_page_html))
-      Navd::Scraper::ShowLoader.any_instance.stub(:extract_show_notes).and_return([])
+      Navd::Scraper::ShowLoader.any_instance.stub(:show_notes).and_return([])
     }
     it "should create show record" do
       expect { subject }.to change { Show.count }.from(0).to(1)

diff --git a/spec/models/scraper/show_loader_spec.rb b/spec/models/scraper/show_loader_spec.rb
@@ -1,4 +1,6 @@
 require 'spec_helper'
+require 'support/scraper_mocks'
+include ScraperMocksHelper
 
 describe "Navd::Scraper::ShowLoader" do
   let(:show_number) { 333 }
@@ -24,24 +26,48 @@
         :published_date=>Date.parse('2011-8-25'),
         :cover_art_url=>"http://dropbox.curry.com/ShowNotesArchive/2011/08/NA-333-2011-08-25/Assets/na333art.png",
         :assets_url=>"http://333.nashownotes.com/assets",
-        :url=>"http://blog.curry.com/stories/2011/08/25/na33320110825.html"
+        :url=>"http://blog.curry.com/stories/2011/08/25/na33320110825.html",
+        :credits=>nil
       } }
       before {
         show_loader.spider.stub(:get_page).and_return(Nokogiri::HTML(published_show_page_html))
-        show_loader.stub(:extract_show_notes).and_return([])
+        show_loader.stub(:show_notes).and_return([])
         show_loader.scan_show_assets
       }
       its(:found) { should be_true }
       its(:published) { should be_true }
       its(:errors) { should be_empty }
       its(:attributes) { should eql(expected_attributes) }
+
+      describe "#shownotes_menu_uri [protected]" do
+        subject { show_loader.send(:shownotes_menu_uri) }
+        let(:expected) { URI.parse('http://333.nashownotes.com/shownotes') }
+        it { should eql(expected) }
+      end
+
+      context "parsing shownotes menu" do
+        before {
+          show_loader.stub(:p_shownotes_menu).and_return(Nokogiri::HTML(shownotes_menu_page_html))
+        }
+        describe "#shownotes_detail_main_uri [protected]" do
+          subject { show_loader.send(:shownotes_detail_main_uri) }
+          let(:expected) { URI.parse('http://333.nashownotes.com/shownotes/na33320110825Shownotes') }
+          it { should eql(expected) }
+        end
+        describe "#credits_uri [protected]" do
+          subject { show_loader.send(:credits_uri) }
+          let(:expected) { URI.parse('http://333.nashownotes.com/shownotes/na33320110825Credits') }
+          it { should eql(expected) }
+        end
+      end
+
     end
 
     context "unpublished show" do
       let(:show_number) { 33 }
       before {
         show_loader.spider.stub(:get_page).and_return(Nokogiri::HTML(unpublished_show_page_html))
-        show_loader.stub(:extract_show_notes).and_return([])
+        show_loader.stub(:show_notes).and_return([])
         show_loader.scan_show_assets
       }
       its(:found) { should be_true }
@@ -52,14 +78,13 @@
     context "bad url" do
       before {
         show_loader.spider.stub(:get_uri_for_show).and_return('bad://karma')
-        show_loader.stub(:extract_show_notes).and_return([])
+        show_loader.stub(:show_notes).and_return([])
         show_loader.scan_show_assets
       }
       its(:found) { should be_false }
       its(:published) { should be_false }
       its(:errors) { should_not be_empty }
     end
   end
-
 
 end
diff --git a/spec/models/scraper/spider_spec.rb b/spec/models/scraper/spider_spec.rb
@@ -1,4 +1,6 @@
 require 'spec_helper'
+require 'support/scraper_mocks'
+include ScraperMocksHelper
 
 describe "Navd::Scraper::Spider" do
   let(:spider) { Navd::Scraper::Spider.new }
@@ -35,7 +37,13 @@
   describe "#get_page" do
     let(:uri) { spider.get_uri_for_show(333) }
     subject { spider.get_page(uri) }
-    it { should be_a(Nokogiri::HTML::Document) }
+    context "valid uri" do
+      before {
+        # Bypassing true integration tests that make real web calls for now ...
+        spider.stub(:get_page).and_return(Nokogiri::HTML(published_show_page_html))
+      }
+      it { should be_a(Nokogiri::HTML::Document) }
+    end
     context "bad uri" do
       let(:uri) { 'bad:://karma' }
       it { should be_nil }

diff --git a/spec/models/scraper/support_spec.rb b/spec/models/scraper/support_spec.rb
@@ -21,7 +21,6 @@ class ScraperSupportTestHarness
     describe "#normalize_uri" do
       it "should return a URI object" do
         subject.normalize_uri('http://example.net').should be_a(URI)
-
       end
     end
   end