Skip to content

Commit

Permalink
initial scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
tmtmtmtm committed Sep 9, 2015
0 parents commit e03d29a
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@

.cache/*

*.swp

*.sqlite
16 changes: 16 additions & 0 deletions Gemfile
@@ -0,0 +1,16 @@
# Gems listed here are installed and made available to the morph.io scraper.
# Add further libraries, or pin different versions, as needed.
# Documentation: https://morph.io/documentation/ruby

source 'https://rubygems.org'

ruby '2.0.0'

gem 'scraperwiki',
    git: 'https://github.com/openaustralia/scraperwiki-ruby.git',
    branch: 'morph_defaults'
gem 'execjs'
gem 'pry'
gem 'colorize'
gem 'nokogiri'
gem 'open-uri-cached'
gem 'fuzzy_match'
gem 'wikidata-client', '~> 0.0.7', require: 'wikidata'
55 changes: 55 additions & 0 deletions Gemfile.lock
@@ -0,0 +1,55 @@
GIT
remote: https://github.com/openaustralia/scraperwiki-ruby.git
revision: fc50176812505e463077d5c673d504a6a234aa78
branch: morph_defaults
specs:
scraperwiki (3.0.1)
httpclient
sqlite_magic

GEM
remote: https://rubygems.org/
specs:
coderay (1.1.0)
colorize (0.7.7)
excon (0.45.4)
execjs (2.5.2)
faraday (0.9.1)
multipart-post (>= 1.2, < 3)
faraday_middleware (0.10.0)
faraday (>= 0.7.4, < 0.10)
fuzzy_match (2.1.0)
hashie (3.4.2)
httpclient (2.6.0.1)
method_source (0.8.2)
mini_portile (0.6.2)
multipart-post (2.0.0)
nokogiri (1.6.6.2)
mini_portile (~> 0.6.0)
open-uri-cached (0.0.5)
pry (0.10.1)
coderay (~> 1.1.0)
method_source (~> 0.8.1)
slop (~> 3.4)
slop (3.6.0)
sqlite3 (1.3.10)
sqlite_magic (0.0.3)
sqlite3
wikidata-client (0.0.7)
excon (~> 0.40)
faraday (~> 0.9)
faraday_middleware (~> 0.9)
hashie (~> 3.3)

PLATFORMS
ruby

DEPENDENCIES
colorize
execjs
fuzzy_match
nokogiri
open-uri-cached
pry
scraperwiki!
wikidata-client (~> 0.0.7)
1 change: 1 addition & 0 deletions README.md
@@ -0,0 +1 @@
This is a scraper that runs on [Morph](https://morph.io). To get started, [see the documentation](https://morph.io/documentation).
48 changes: 48 additions & 0 deletions scraper.rb
@@ -0,0 +1,48 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'scraperwiki'
require 'nokogiri'
require 'colorize'
require 'pry'
require 'open-uri/cached'
# Point the open-uri cache at a local .cache directory.
OpenURI::Cache.cache_path = '.cache'

class String
  # Returns a copy of the string in which every run of whitespace
  # (anything matching [[:space:]]) is collapsed to a single space and
  # leading/trailing whitespace is removed.
  def tidy
    collapsed = gsub(/[[:space:]]+/, ' ')
    collapsed.strip
  end
end

# Fetches +url+ and parses the response body as an HTML document.
#
# The URL is opened through its parsed URI object instead of Kernel#open:
# Kernel#open treats a string beginning with "|" as a command to run, and it
# no longer accepts URLs from Ruby 3.0 onwards. The block form closes the
# response handle as soon as the body has been read.
#
# NOTE(review): open-uri/cached is expected to hook the same OpenURI entry
# point that URI#open uses, so caching should still apply -- confirm.
#
# @param url [String] absolute URL of the page to fetch
# @return [Nokogiri::HTML::Document] the parsed document
def noko_for(url)
  Nokogiri::HTML(URI.parse(url).open(&:read))
end

# Scrapes the members table at +url+ and saves one record per member row.
#
# Rows are the <tr> siblings that follow the last <tr> containing the text
# "CONSTITUENCY REPRESENTED". Each kept row yields a record with the keys
# :id, :name, :constituency, :party, :image, :term and :source, saved via
# ScraperWiki.save_sqlite with [:id, :term] as the unique key.
#
# @param url [String] absolute URL of the members listing page
# @param term [Integer] value stored in each record's :term field
# @raise [RuntimeError] if the page contains no header row to anchor on
def scrape_list(url, term: 2012)
  noko = noko_for(url)
  header = noko.xpath('//tr[contains(.,"CONSTITUENCY REPRESENTED")]').last
  # Fail with a clear message instead of a NoMethodError on nil when the
  # expected header row is missing from the page.
  raise "No 'CONSTITUENCY REPRESENTED' header row found at #{url}" unless header

  header.xpath('following-sibling::tr').each do |tr|
    tds = tr.css('td')
    next if tds.count < 4

    # Link in the first cell; only its URL is recorded (the linked page is
    # not fetched). Rows without a link are skipped.
    source = tds[0].css('a/@href').text
    next if source.to_s.empty?
    source = URI.join(url, source).to_s

    data = {
      # Last path segment of the link, up to its first hyphen.
      id: source.split('/').last.split('-').first,
      name: tds[0].text.sub('Hon. ', '').tidy,
      constituency: tds[1].text.tidy,
      party: tds[2].text.tidy,
      image: tds[3].css('img/@src').text,
      term: term,
      source: source,
    }
    # Resolve the image path against the page URL; left empty when absent.
    data[:image] = URI.join(url, data[:image]).to_s unless data[:image].to_s.empty?

    ScraperWiki.save_sqlite([:id, :term], data)
  end
end

# Entry point: scrape the present-members page on nationalassembly.gov.bz.
scrape_list('http://nationalassembly.gov.bz/index.php/hor-lowerhouse/present-members-house')

0 comments on commit e03d29a

Please sign in to comment.