Skip to content

Commit

Permalink
Update scraper.rb
Browse files Browse the repository at this point in the history
  • Loading branch information
yourneighbourhood committed Jul 5, 2018
1 parent aa05790 commit 485e361
Showing 1 changed file with 35 additions and 39 deletions.
74 changes: 35 additions & 39 deletions scraper.rb
@@ -1,46 +1,43 @@
require 'scraperwiki'
require 'mechanize'

# Scraping from Masterview 2.0

def scrape_page(page, comment_url)
page.at("table.rgMasterTable").search("tr.rgRow,tr.rgAltRow").each do |tr|
tds = tr.search('td').map{|t| t.inner_html.gsub("\r\n", "").strip}
day, month, year = tds[2].split("/").map{|s| s.to_i}
record = {
"info_url" => (page.uri + tr.search('td').at('a')["href"]).to_s,
"council_reference" => tds[1],
"date_received" => Date.new(year, month, day).to_s,
"description" => tds[3].gsub("&amp;", "&").split("<br>")[1].squeeze(" ").strip,
"address" => tds[3].gsub("&amp;", "&").split("<br>")[0].gsub("\r", " ").gsub("<b>","").gsub("</b>","").squeeze(" ").strip,
"date_scraped" => Date.today.to_s,
"comment_url" => comment_url
}
#p record
if (ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty? rescue true)
ScraperWiki.save_sqlite(['council_reference'], record)
else
puts "Skipping already saved record " + record['council_reference']
# Scrapes one page of search results and stores each application it finds.
#
# Every <tr> inside the results grid is read as one application:
#   cell 1 => "<council reference> - <description>"
#   cell 2 => address
#   cell 3 => date received, formatted dd/mm/yyyy
# The first link in the row is resolved against the page URI to build info_url.
#
# Rows that cannot be parsed (any StandardError raised while reading, checking
# or saving the row) are skipped silently so that one odd row does not stop
# the rest of the page from being processed.
#
# @param page [#at, #uri] the fetched results page (a Mechanize page in practice)
# @return [nil, Object] nil when the page has no results grid, otherwise the
#   collection of rows that was iterated
def scrape_page(page)
  table = page.at("table#ctl00_cphContent_ctl01_ctl00_RadGrid1_ctl00 tbody")
  # A page without the results grid has nothing to scrape; calling `search`
  # on nil here would raise outside the per-row rescue and abort the run.
  return unless table

  table.search("tr").each do |tr|
    begin
      cells = tr.search('td')
      tds = cells.map { |t| t.inner_text.gsub("\r\n", "").strip }
      day, month, year = tds[3].split("/").map(&:to_i)
      # Only the first " - " separates the reference from the description;
      # any later " - " belongs to the description and is put back by the join.
      reference, *description_parts = tds[1].split(" - ")
      record = {
        "info_url" => (page.uri + cells.at('a')["href"]).to_s,
        "council_reference" => reference.squeeze(" ").strip,
        "date_received" => Date.new(year, month, day).to_s,
        "description" => description_parts.join(" - ").squeeze(" ").strip,
        "address" => tds[2].squeeze(" ").strip,
        "date_scraped" => Date.today.to_s
      }

      # Double any single quote so a reference such as "O'Brien" stays inside
      # the SQL string literal instead of terminating it early.
      quoted_reference = record['council_reference'].gsub("'", "''")
      already_saved =
        begin
          !ScraperWiki.select("* from data where `council_reference`='#{quoted_reference}'").empty?
        rescue StandardError
          # A failing lookup is treated as "not saved yet", as before.
          false
        end

      ScraperWiki.save_sqlite(['council_reference'], record) unless already_saved
    rescue StandardError
      # Best-effort: skip rows that do not look like an application.
      next
    end
  end
end


# Implement a click on a link that understands stupid asp.net doPostBack
def click(page, doc)
return nil if doc.nil?

js = doc["href"] || doc["onclick"]
if js =~ /javascript:__doPostBack\('(.*)','(.*)'\)/
event_target = $1
event_argument = $2
form = page.form_with(id: "aspnetForm")
form["__EVENTTARGET"] = event_target
form["__EVENTARGUMENT"] = event_argument
form.submit
elsif js =~ /return false;__doPostBack\('(.*)','(.*)'\)/
nil
else
# TODO: Just follow the link like it's a normal link
begin
js = doc["href"] || doc["onclick"]
if js =~ /javascript:__doPostBack\('(.*)','(.*)'\)/
event_target = $1
event_argument = $2
form = page.form_with(id: "aspnetForm")
form["__EVENTTARGET"] = event_target
form["__EVENTARGUMENT"] = event_argument
form.submit
elsif js =~ /return false;__doPostBack\('(.*)','(.*)'\)/
nil
else
# TODO: Just follow the link like it's a normal link
raise
end
rescue
Expand All @@ -61,9 +58,8 @@ def click(page, doc)
# Append the last day of the month as the `2` query parameter (dd/mm/yyyy);
# a day of -1 in Date.new counts back from the end of the month.
# NOTE(review): `matches[0]` presumably holds [year, month] captured from the
# requested period — the line that fills it is outside this view; confirm.
period = period + "&2=" + Date.new(matches[0][0].to_i, matches[0][1].to_i, -1).strftime("%d/%m/%Y")

puts "Getting data in `" + periodstr + "`."

# NOTE(review): this URL hard-codes `1=thismonth`; whether `period` built
# above is meant to be used in its place is not visible here — confirm.
url = "http://pdonline.moretonbay.qld.gov.au/Modules/applicationmaster/default.aspx?page=found&1=thismonth&6=F"
comment_url = "mailto:mbrc@moretonbay.qld.gov.au"

agent = Mechanize.new

Expand Down

0 comments on commit 485e361

Please sign in to comment.