Update scraper.rb

yourneighbourhood · Apr 14, 2020 · dd32967 · dd32967
1 parent 81053bf
commit dd32967
Showing 1 changed file with 100 additions and 25 deletions.
diff --git a/scraper.rb b/scraper.rb
@@ -1,25 +1,100 @@
-# This is a template for a Ruby scraper on morph.io (https://morph.io)
-# including some code snippets below that you should find helpful
-
-# require 'scraperwiki'
-# require 'mechanize'
-#
-# agent = Mechanize.new
-#
-# # Read in a page
-# page = agent.get("http://foo.com")
-#
-# # Find somehing on the page using css selectors
-# p page.at('div.content')
-#
-# # Write out to the sqlite database using scraperwiki library
-# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
-#
-# # An arbitrary query against the database
-# ScraperWiki.select("* from data where 'name'='peter'")
-
-# You don't have to do things with the Mechanize or ScraperWiki libraries.
-# You can use whatever gems you want: https://morph.io/documentation/ruby
-# All that matters is that your final data is written to an SQLite database
-# called "data.sqlite" in the current working directory which has at least a table
-# called "data".
+<?php
+require_once 'vendor/autoload.php';
+require_once 'vendor/openaustralia/scraperwiki/scraperwiki.php';
+
+use PGuardiario\PGBrowser;
+use Sunra\PhpSimple\HtmlDomParser;
+
+date_default_timezone_set('Australia/Sydney');
+
+# Default to 'thisweek', use MORPH_PERIOD to change to 'thismonth' or 'lastmonth' for data recovery
+switch(getenv('MORPH_PERIOD')) {
+    case 'thismonth' :
+        $period = 'thismonth';
+        break;
+    case 'lastmonth' :
+        $period = 'lastmonth';
+        break;
+    default         :
+        $period = 'thisweek';
+        break;
+}
+print "Getting data for `" .$period. "`, changable via MORPH_PERIOD environment\n";
+
+$url_base = "https://da.bundaberg.qld.gov.au/modules/ApplicationMaster/";
+$term_url = "https://da.bundaberg.qld.gov.au/modules/common/Default.aspx?page=disclaimer";
+$da_page  = $url_base . "default.aspx?page=found&1=" .$period. "&4a=333,322,321,324,323,325&6=F";
+$comment_base = "mailto:CEO@bundaberg.qld.gov.au?subject=Development Application Enquiry: ";
+
+# Agreed Terms
+$browser = new PGBrowser();
+$page = $browser->get($term_url);
+$form = $page->form();
+if ( empty($form) ) {
+    print "Distil networks kicks in..... :-( \n";
+    exit(0);
+}
+$form->set('ctl00$cphContent$ctl00$btnOk', 'Agree');
+$page = $form->submit();
+sleep(mt_rand(3, 5));
+
+$page = $browser->get($url_base . 'Default.aspx?page=search');
+sleep(mt_rand(3, 5));
+
+$page = $browser->get($da_page);
+$dom = HtmlDomParser::str_get_html($page->html);
+
+$NumPages = count($dom->find('div[class=rgWrap rgNumPart] a'));
+if ($NumPages === 0) {
+    $NumPages = 1;
+}
+
+for ($i = 1; $i <= $NumPages; $i++) {
+    print "Scraping page $i of $NumPages \n";
+
+    # If more than a single page, fetch the page
+    if ($i > 1) {
+        $form = $page->form();
+        sleep(mt_rand(3, 5));
+        $page = $form->doPostBack($dom->find('div[class=rgWrap rgNumPart] a', $i-1)->href);
+        $dom  = HtmlDomParser::str_get_html($page->html);
+    }
+
+    $results = $dom->find("tr[class=rgRow], tr[class=rgAltRow]");
+    if ( empty($results) ) {
+      print "Distil networks kicks in..... :-( \n";
+      exit(0);
+    }
+
+    # The usual, look for the data set and if needed, save it
+    foreach ($results as $result) {
+        # Slow way to transform the date but it works
+        $date_received = explode(' ', (trim($result->children(2)->plaintext)), 2);
+        $date_received = explode('/', $date_received[0]);
+        $date_received = "$date_received[2]-$date_received[1]-$date_received[0]";
+
+        # Prep a bit more, ready to add these to the array
+        $tempstr = explode('<br/>', $result->children(3)->innertext);
+
+        # Put all information in an array
+        $record = array (
+            'council_reference' => trim($result->children(1)->plaintext),
+            'address'           => trim(preg_replace('/\s+/', ' ', $tempstr[0]) . ", QLD"),
+            'description'       => preg_replace('/\s+/', ' ', $tempstr[1]),
+            'info_url'          => $url_base . trim($result->find('a',0)->href),
+            'comment_url'       => $comment_base . trim($result->children(1)->plaintext),
+            'date_scraped'      => date('Y-m-d'),
+            'date_received'     => date('Y-m-d', strtotime($date_received))
+        );
+
+        # Check if record exist, if not, INSERT, else do nothing
+        $existingRecords = scraperwiki::select("* from data where `council_reference`='" . $record['council_reference'] . "'");
+        if (count($existingRecords) == 0) {
+            print ("Saving record " . $record['council_reference'] . " - " . $record['address']. "\n");
+#            print_r ($record);
+            scraperwiki::save(['council_reference'], $record);
+        } else {
+            print ("Skipping already saved record " . $record['council_reference'] . "\n");
+        }
+    }
+}