Initial scraper
tmtmtmtm committed Jul 17, 2015
Commit ab078cc (0 parents)
Showing 5 changed files with 112 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@

.cache/*

*.swp

*.sqlite
15 changes: 15 additions & 0 deletions Gemfile
@@ -0,0 +1,15 @@
# It's easy to add more libraries or choose different versions. Any libraries
# specified here will be installed and made available to your morph.io scraper.
# Find out more: https://morph.io/documentation/ruby

source "https://rubygems.org"

ruby "2.0.0"

gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
gem "execjs"
gem "pry"
gem "colorize"
gem "nokogiri"
gem "open-uri-cached"
gem "fuzzy_match"
42 changes: 42 additions & 0 deletions Gemfile.lock
@@ -0,0 +1,42 @@
GIT
  remote: https://github.com/openaustralia/scraperwiki-ruby.git
  revision: fc50176812505e463077d5c673d504a6a234aa78
  branch: morph_defaults
  specs:
    scraperwiki (3.0.1)
      httpclient
      sqlite_magic

GEM
  remote: https://rubygems.org/
  specs:
    coderay (1.1.0)
    colorize (0.7.7)
    execjs (2.5.2)
    fuzzy_match (2.1.0)
    httpclient (2.6.0.1)
    method_source (0.8.2)
    mini_portile (0.6.2)
    nokogiri (1.6.6.2)
      mini_portile (~> 0.6.0)
    open-uri-cached (0.0.5)
    pry (0.10.1)
      coderay (~> 1.1.0)
      method_source (~> 0.8.1)
      slop (~> 3.4)
    slop (3.6.0)
    sqlite3 (1.3.10)
    sqlite_magic (0.0.3)
      sqlite3

PLATFORMS
  ruby

DEPENDENCIES
  colorize
  execjs
  fuzzy_match
  nokogiri
  open-uri-cached
  pry
  scraperwiki!
1 change: 1 addition & 0 deletions README.md
@@ -0,0 +1 @@
This is a scraper that runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation).
48 changes: 48 additions & 0 deletions scraper.rb
@@ -0,0 +1,48 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'scraperwiki'
require 'nokogiri'
require 'open-uri'
require 'colorize'

require 'pry'
require 'open-uri/cached'
OpenURI::Cache.cache_path = '.cache'

# Fetch a page (cached on disk via open-uri-cached) and parse it with Nokogiri.
def noko_for(url)
  Nokogiri::HTML(open(url).read)
end

# Map the party heading text to a canonical [name, abbreviation] pair.
# Returns nil (after printing a warning) for an unrecognised party.
def party_info(text)
  if text =~ /Fiji First/i
    return ["Fiji First", "FF"]
  elsif text =~ /SODELPA/
    return ["Social Democratic Liberal Party", "SODELPA"]
  elsif text =~ /NATIONAL FEDERATION PARTY/
    return ["National Federation Party", "NFP"]
  else
    warn "Unknown party: #{text}"
  end
end

def scrape_list(url)
  noko = noko_for(url)

  # Each member is listed in a <td> containing a photo; the nearest preceding
  # <strong> heading names the party the member belongs to.
  noko.xpath('.//td[img]').each do |td|
    party, party_id = party_info(td.xpath('preceding::strong[1]').text)
    data = {
      name: td.text.gsub(/[[:space:]]+/, ' ').strip,
      image: td.css('img/@src').text,
      party: party,
      party_id: party_id,
      term: '2014',
      source: url,
    }
    data[:image] = URI.join(url, data[:image]).to_s unless data[:image].to_s.empty?
    # puts data
    ScraperWiki.save_sqlite([:name, :term], data)
  end
end

scrape_list('http://www.parliament.gov.fj/Members/Parliamentery-Parties.aspx')
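A note on the output: ScraperWiki.save_sqlite upserts each row keyed on [:name, :term] into the scraper's local SQLite store (on morph.io this is data.sqlite, table "data"). A minimal sanity check after a run, assuming those gem defaults, might look like the following sketch:

require 'scraperwiki'

# Read back everything save_sqlite stored in the default 'data' table.
rows = ScraperWiki.select('* FROM data')
puts "#{rows.count} members scraped"
rows.first(3).each { |r| puts "#{r['name']} (#{r['party_id']})" }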
