Skip to content

Commit

Permalink
Update scraper.rb
Browse files Browse the repository at this point in the history
  • Loading branch information
yourneighbourhood committed Apr 22, 2020
1 parent c1fdc1a commit 1b7bd36
Showing 1 changed file with 65 additions and 25 deletions.
90 changes: 65 additions & 25 deletions scraper.rb
Original file line number Diff line number Diff line change
@@ -1,25 +1,65 @@
# This is a template for a Ruby scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

# require 'scraperwiki'
# require 'mechanize'
#
# agent = Mechanize.new
#
# # Read in a page
# page = agent.get("http://foo.com")
#
# # Find somehing on the page using css selectors
# p page.at('div.content')
#
# # Write out to the sqlite database using scraperwiki library
# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
#
# # An arbitrary query against the database
# ScraperWiki.select("* from data where 'name'='peter'")

# You don't have to do things with the Mechanize or ScraperWiki libraries.
# You can use whatever gems you want: https://morph.io/documentation/ruby
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".
require 'date'
require 'mechanize'
require 'scraperwiki'

# Choose the search period for the DA query. Defaults to 'thisweek';
# override by setting MORPH_PERIOD to 'thismonth' or 'lastmonth'
# (any other value falls back to the default).
period = case ENV['MORPH_PERIOD']
         when 'thismonth', 'lastmonth' then ENV['MORPH_PERIOD']
         else 'thisweek'
         end
puts "Getting '#{period}' data, changeable via MORPH_PERIOD environment"

# Base site address plus the search and contact endpoints used below.
url_base = 'http://pdonline.redland.qld.gov.au'
da_url = "#{url_base}/Pages/XC.Track/SearchApplication.aspx?d=#{period}" \
         '&k=LodgementDate&t=BD,BW,BA,MC,MCU,OPW,BWP,APS,MCSS,OP,EC,SB,SBSS,PD,BX,ROL,QRAL'
comment_url = 'mailto:rcc@redland.qld.gov.au?subject=Development Application Enquiry: '

# Build the browser agent. Gzip is disabled because the council web site
# returns a bad 'encoded-content: gzip,gzip' header.
agent = Mechanize.new
agent.request_headers = { 'Accept-Encoding' => '' }

# The council site gates searches behind a terms-of-use page: submit the
# "I Agree" form once before requesting any application data.
terms_page = agent.get("#{url_base}/Common/Common/terms.aspx")
terms_form = terms_page.forms.first
terms_form['ctl00$ctMain$BtnAgree'] = 'I Agree'
terms_form.submit

# Fetch the development-application search results; each application sits
# inside a 'div.result' container.
page = agent.get(da_url)
results = page.search('div.result')

# Extract each development application from the search results and save any
# that are not already recorded in the 'data' table.
results.each do |result|
  # Application number (first 'a.search' link), whitespace collapsed.
  council_reference = result.search('a.search')[0].inner_text.strip.split.join(" ")

  # Split the result block's text once on line breaks; by inspection of the
  # page layout, line 4 holds the description and line 6 the lodgement date.
  lines = result.inner_text.split(/\r?\n/)
  description = lines[4].strip.split.join(" ")
  date_received = Date.parse(lines[6].strip)

  # Link to the application's detail page. NOTE: the original used
  # String#sub!, which returns nil when '../..' is absent and would make
  # the concatenation raise TypeError; the non-destructive sub is safe.
  info_url = url_base + result.search('a.search')[0]['href'].sub('../..', '')

  record = {
    'council_reference' => council_reference,
    'address' => result.search('strong')[0].inner_text.strip.split.join(" "),
    'description' => description,
    'info_url' => info_url,
    'comment_url' => comment_url + council_reference,
    'date_scraped' => Date.today.to_s,
    'date_received' => date_received
  }

  # Save only records not seen before. The inline rescue covers the first
  # run, when the 'data' table does not yet exist.
  if (ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty? rescue true)
    puts "Saving record " + record['council_reference'] + ", " + record['address']
    ScraperWiki.save_sqlite(['council_reference'], record)
  else
    puts "Skipping already saved record " + record['council_reference']
  end
end

0 comments on commit 1b7bd36

Please sign in to comment.