Skip to content
This repository has been archived by the owner on Nov 8, 2022. It is now read-only.

Commit

Permalink
improve show loading tests and structure
Browse files Browse the repository at this point in the history
  • Loading branch information
tardate committed Oct 22, 2011
1 parent df11b22 commit 7c1d642
Show file tree
Hide file tree
Showing 6 changed files with 611 additions and 49 deletions.
100 changes: 64 additions & 36 deletions lib/navd/scraper/show_loader.rb
Expand Up @@ -3,16 +3,15 @@ class ShowLoader
attr_accessor :number, :spider, :uri, :found, :published, :errors
attr_reader :attributes, :show_notes

attr_reader :p_shownotes_page # Nokogiri::HTML::Document of the main shownotes page being processed
attr_reader :p_shownotes_main # Nokogiri::HTML::Document of the main shownotes page being processed

# +number+ - show number to load
def initialize(number)
@errors = []
@number = number
@spider = Navd::Scraper::Spider.new
@attributes = {}
@show_notes = []
@p_shownotes_page = nil
@spider = Navd::Scraper::Spider.new
end

# Returns true if show details have been scraped without error
Expand All @@ -23,7 +22,7 @@ def loaded?
# Loads all the show details for given show
# If an error is encountered, +@errors+ will be present
def scan_show_assets
@uri, @p_shownotes_page = spider.get_page_for_show(number)
@uri, @p_shownotes_main = spider.get_page_for_show(number)
if spider.errors.present?
@errors += spider.errors
return false
Expand All @@ -43,32 +42,22 @@ def scan_show_assets
:published_date => published_date,
:cover_art_url => cover_art_url,
:assets_url => assets_url,
:url =>episode_url
:url => episode_url,
:credits => credits
})
# TODO: credits
show_notes
errors.empty?
end


# Returns an array of hashes with show note detail (:name,:meme_name,:description,:url)
# +show_note_details_uri+ URI to show note details page (e.g. http://349.nashownotes.com/shownotes)
# def extract_show_notes_from_page(page)
def show_notes
@show_notes ||= extract_notes_from_page(get_all_notes_page)
end

# TODO: need some refactoring and exception handling
def get_all_notes_page
assets_page = spider.get_page(show_note_details_uri)
notes_uri = show_note_assets_uri.merge(extract_show_notes_link(assets_page))
notes_page = spider.get_page(notes_uri)
all_notes_uri = show_note_assets_uri.merge(extract_nodes(notes_page,:all_notes)[:href])
spider.get_page(all_notes_uri)
end

def extract_notes_from_page(page)
return @show_notes if @show_notes
# TODO: test for p_shownotes_detail_all
notes = []
current_meme = nil
page.at_css('ul.ulDirectory').children.each do |n|
p_shownotes_detail_all.at_css('ul.ulDirectory').children.each do |n|
if n.name=='li' && n[:class]=='directoryItem'
current_meme = n.text
elsif n.name=='ul' && n[:class]=='ulDirectory'
Expand All @@ -88,8 +77,14 @@ def extract_notes_from_page(page)
end
end
end
notes
@show_notes = notes
end

# Placeholder for the show credits.
# TODO: parse/extract credits - nothing is scraped yet, so unless @credits
# has been assigned a truthy value this simply returns nil.
def credits
  return @credits if @credits
  @credits = nil
end
protected :credits

# Returns the show date (as extracted from the audio file name)
# Dodgy approach, but seems the most reliable way of automatically getting the show date
Expand All @@ -106,48 +101,81 @@ def published_date
# Returns the link to audio file from the main shownotes page
# e.g. http://m.podshow.com/media/15412/episodes/299798/noagenda-299798-10-20-2011.mp3
def mp3_url
@mp3_url ||= extract_nodes(p_shownotes_page,:mp3)[:href]
@mp3_url ||= extract_nodes(p_shownotes_main,:mp3)[:href]
end
protected :mp3_url

# Returns the href of the official show page link found on the main
# shownotes page (looked up via extract_nodes with the :web item).
# e.g. http://blog.curry.com/stories/2011/10/20/na34920111020.html
# The value is cached in @episode_url after the first successful lookup.
def episode_url
  return @episode_url if @episode_url
  web_link = extract_nodes(p_shownotes_main, :web)
  @episode_url = web_link[:href]
end
protected :episode_url

# Returns the link to cover art
# e.g. http://dropbox.curry.com/ShowNotesArchive/2011/10/NA-349-2011-10-20/Assets/ns349art.png
def cover_art_url
@cover_art_url ||= extract_nodes(p_shownotes_page,:cover_art)[:href]
@cover_art_url ||= extract_nodes(p_shownotes_main,:cover_art)[:href]
end
protected :cover_art_url

# Returns the link to show assets page
# e.g. http://349.nashownotes.com/assets
def assets_url
@assets_url ||= uri.merge(extract_nodes(p_shownotes_page,:assets)[:href]).to_s
@assets_url ||= uri.merge(extract_nodes(p_shownotes_main,:assets)[:href]).to_s
end
protected :assets_url

# Returns the URI for link to show notes detail page
# Returns the URI to show notes menu page
# e.g. http://349.nashownotes.com/shownotes
def show_note_details_uri
@show_note_details_uri ||= uri.merge(extract_nodes(p_shownotes_page,:notes)[:href])
def shownotes_menu_uri
@shownotes_menu_uri ||= uri.merge(extract_nodes(p_shownotes_main,:notes)[:href])
end
# Returns the parsed shownotes menu page (a Nokogiri::HTML::Document when the
# spider succeeds), fetched from shownotes_menu_uri.
# The page is fetched once and cached in @p_shownotes_menu.
def p_shownotes_menu
  return @p_shownotes_menu if @p_shownotes_menu
  @p_shownotes_menu = spider.get_page(shownotes_menu_uri)
end
protected :show_note_details_uri
protected :shownotes_menu_uri, :p_shownotes_menu

# Returns the URI of the main show notes detail page, built by resolving the
# :notes link found on the shownotes menu page against the show's base uri.
# e.g. http://349.nashownotes.com/shownotes/na34920111020Shownotes
# Cached in @shownotes_detail_main_uri after the first call.
def shownotes_detail_main_uri
  return @shownotes_detail_main_uri if @shownotes_detail_main_uri
  base = uri
  notes_link = extract_nodes(p_shownotes_menu, :notes)
  @shownotes_detail_main_uri = base.merge(notes_link[:href])
end
# Returns the parsed main show notes detail page (a Nokogiri::HTML::Document
# when the spider succeeds), fetched from shownotes_detail_main_uri.
# The page is fetched once and cached. The cache variable is named after the
# method (@p_shownotes_detail_main) so it follows the same @p_shownotes_*
# convention as the other page accessors in this class.
def p_shownotes_detail_main
  @p_shownotes_detail_main ||= spider.get_page(shownotes_detail_main_uri)
end
protected :shownotes_detail_main_uri, :p_shownotes_detail_main

# Returns the URI of the "expand all" variant of the show notes detail page,
# built by resolving the :all_notes link found on the main detail page against
# the show's base uri.
# e.g. http://349.nashownotes.com/shownotes/na34920111020Shownotes/expandAllTopics
# Cached in @shownotes_detail_all_uri after the first call.
def shownotes_detail_all_uri
  return @shownotes_detail_all_uri if @shownotes_detail_all_uri
  base = uri
  expand_all_link = extract_nodes(p_shownotes_detail_main, :all_notes)
  @shownotes_detail_all_uri = base.merge(expand_all_link[:href])
end
# Returns the parsed "expand all" show notes detail page, fetched from
# shownotes_detail_all_uri. Fetched once and cached in @p_shownotes_detail_all.
def p_shownotes_detail_all
  return @p_shownotes_detail_all if @p_shownotes_detail_all
  @p_shownotes_detail_all = spider.get_page(shownotes_detail_all_uri)
end
protected :shownotes_detail_all_uri, :p_shownotes_detail_all

# Returns the URI to show credits page
# e.g. http://349.nashownotes.com/shownotes/na34920111020Credits
# http://349.nashownotes.com/shownotes/na34920111020Shownotes

# Returns the link to official show page
# e.g. http://blog.curry.com/stories/2011/10/20/na34920111020.html
def episode_url
@episode_url ||= extract_nodes(p_shownotes_page,:web)[:href]
# Builds the credits page URI by resolving the :credits link found on the
# shownotes menu page against the show's base uri.
# Cached in @credits_uri after the first call.
def credits_uri
  return @credits_uri if @credits_uri
  base = uri
  credits_link = extract_nodes(p_shownotes_menu, :credits)
  @credits_uri = base.merge(credits_link[:href])
end
protected :episode_url
# Returns the parsed credits page, fetched from credits_uri.
# Fetched once and cached in @p_credits.
def p_credits
  return @p_credits if @p_credits
  @p_credits = spider.get_page(credits_uri)
end
protected :credits_uri, :p_credits

# http://349.nashownotes.com/shownotes/na34920111020Shownotes

# Set of algorithms to extract parts of a page
# +page+ is a Nokogiri::HTML::Document
# +item+ - symbol for note type required
def extract_nodes(page,item)
result = case item
when :all_notes
page.css('a.directoryLink').select{|n| n.text[/expand all/i] }.first
when :credits
page.css('a.directoryLink').select{|n| n.text[/credits/i] }.first
when :assets
page.css('a.directoryLink').select{|n| n.text[/Assets/] }.first
when :cover_art
Expand Down
4 changes: 3 additions & 1 deletion spec/models/scraper/control_spec.rb
@@ -1,4 +1,6 @@
require 'spec_helper'
require 'support/scraper_mocks'
include ScraperMocksHelper

describe "Navd::Scraper::Control" do
let(:scraper_control) { Navd::Scraper::Control.new }
Expand All @@ -19,7 +21,7 @@
subject { scraper_control.load_show(show_number) }
before {
Navd::Scraper::Spider.any_instance.stub(:get_page).and_return(Nokogiri::HTML(published_show_page_html))
Navd::Scraper::ShowLoader.any_instance.stub(:extract_show_notes).and_return([])
Navd::Scraper::ShowLoader.any_instance.stub(:show_notes).and_return([])
}
it "should create show record" do
expect { subject }.to change { Show.count }.from(0).to(1)
Expand Down
35 changes: 30 additions & 5 deletions spec/models/scraper/show_loader_spec.rb
@@ -1,4 +1,6 @@
require 'spec_helper'
require 'support/scraper_mocks'
include ScraperMocksHelper

describe "Navd::Scraper::ShowLoader" do
let(:show_number) { 333 }
Expand All @@ -24,24 +26,48 @@
:published_date=>Date.parse('2011-8-25'),
:cover_art_url=>"http://dropbox.curry.com/ShowNotesArchive/2011/08/NA-333-2011-08-25/Assets/na333art.png",
:assets_url=>"http://333.nashownotes.com/assets",
:url=>"http://blog.curry.com/stories/2011/08/25/na33320110825.html"
:url=>"http://blog.curry.com/stories/2011/08/25/na33320110825.html",
:credits=>nil
} }
before {
show_loader.spider.stub(:get_page).and_return(Nokogiri::HTML(published_show_page_html))
show_loader.stub(:extract_show_notes).and_return([])
show_loader.stub(:show_notes).and_return([])
show_loader.scan_show_assets
}
its(:found) { should be_true }
its(:published) { should be_true }
its(:errors) { should be_empty }
its(:attributes) { should eql(expected_attributes) }

# Checks the menu URI derived for show 333.
# The method is protected on ShowLoader, so it is invoked through `send`.
# NOTE(review): appears to rely on the enclosing `before` block having stubbed
# the spider and run scan_show_assets - confirm against the full spec file.
describe "#shownotes_menu_uri [protected]" do
subject { show_loader.send(:shownotes_menu_uri) }
let(:expected) { URI.parse('http://333.nashownotes.com/shownotes') }
it { should eql(expected) }
end

# Exercises the URIs that are derived from links on the shownotes menu page.
# p_shownotes_menu is stubbed so no page fetch happens; the fixture
# shownotes_menu_page_html is presumably supplied by ScraperMocksHelper - verify.
context "parsing shownotes menu" do
before {
show_loader.stub(:p_shownotes_menu).and_return(Nokogiri::HTML(shownotes_menu_page_html))
}
# URI of the main show notes detail page (protected method, called via `send`)
describe "#shownotes_detail_main_uri [protected]" do
subject { show_loader.send(:shownotes_detail_main_uri) }
let(:expected) { URI.parse('http://333.nashownotes.com/shownotes/na33320110825Shownotes') }
it { should eql(expected) }
end
# URI of the credits page (protected method, called via `send`)
describe "#credits_uri [protected]" do
subject { show_loader.send(:credits_uri) }
let(:expected) { URI.parse('http://333.nashownotes.com/shownotes/na33320110825Credits') }
it { should eql(expected) }
end
end

end

context "unpublished show" do
let(:show_number) { 33 }
before {
show_loader.spider.stub(:get_page).and_return(Nokogiri::HTML(unpublished_show_page_html))
show_loader.stub(:extract_show_notes).and_return([])
show_loader.stub(:show_notes).and_return([])
show_loader.scan_show_assets
}
its(:found) { should be_true }
Expand All @@ -52,14 +78,13 @@
context "bad url" do
before {
show_loader.spider.stub(:get_uri_for_show).and_return('bad://karma')
show_loader.stub(:extract_show_notes).and_return([])
show_loader.stub(:show_notes).and_return([])
show_loader.scan_show_assets
}
its(:found) { should be_false }
its(:published) { should be_false }
its(:errors) { should_not be_empty }
end
end


end
10 changes: 9 additions & 1 deletion spec/models/scraper/spider_spec.rb
@@ -1,4 +1,6 @@
require 'spec_helper'
require 'support/scraper_mocks'
include ScraperMocksHelper

describe "Navd::Scraper::Spider" do
let(:spider) { Navd::Scraper::Spider.new }
Expand Down Expand Up @@ -35,7 +37,13 @@
describe "#get_page" do
let(:uri) { spider.get_uri_for_show(333) }
subject { spider.get_page(uri) }
it { should be_a(Nokogiri::HTML::Document) }
# Happy path for #get_page without touching the network.
# NOTE(review): the stub below replaces spider.get_page itself, which is the
# method the subject calls, so this example only verifies the stubbed return
# value rather than the real fetch/parse logic.
context "valid uri" do
before {
# Bypassing true integration tests that make real web calls for now ...
spider.stub(:get_page).and_return(Nokogiri::HTML(published_show_page_html))
}
it { should be_a(Nokogiri::HTML::Document) }
end
context "bad uri" do
let(:uri) { 'bad:://karma' }
it { should be_nil }
Expand Down
1 change: 0 additions & 1 deletion spec/models/scraper/support_spec.rb
Expand Up @@ -21,7 +21,6 @@ class ScraperSupportTestHarness
describe "#normalize_uri" do
it "should return a URI object" do
subject.normalize_uri('http://example.net').should be_a(URI)

end
end
end
Expand Down

0 comments on commit 7c1d642

Please sign in to comment.