Skip to content

Commit

Permalink
initial scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
tmtmtmtm committed Sep 21, 2015
0 parents commit 696b0bb
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@

.cache/*

*.swp

*.sqlite
16 changes: 16 additions & 0 deletions Gemfile
@@ -0,0 +1,16 @@
# It's easy to add more libraries or choose different versions. Any libraries
# specified here will be installed and made available to your morph.io scraper.
# Find out more: https://morph.io/documentation/ruby

# Resolve gems against the public RubyGems index.
source 'https://rubygems.org'

# Interpreter version this scraper declares.
ruby '2.0.0'

# ScraperWiki client, taken from the morph_defaults branch of the
# openaustralia fork rather than from RubyGems.
gem 'scraperwiki',
    git: 'https://github.com/openaustralia/scraperwiki-ruby.git',
    branch: 'morph_defaults'

# Libraries available to the scraper.
gem 'execjs'
gem 'pry'
gem 'colorize'
gem 'nokogiri'
gem 'open-uri-cached'
gem 'fuzzy_match'

# The gem is named wikidata-client but is required as 'wikidata'.
gem 'wikidata-client', '~> 0.0.7', require: 'wikidata'
55 changes: 55 additions & 0 deletions Gemfile.lock
@@ -0,0 +1,55 @@
GIT
remote: https://github.com/openaustralia/scraperwiki-ruby.git
revision: fc50176812505e463077d5c673d504a6a234aa78
branch: morph_defaults
specs:
scraperwiki (3.0.1)
httpclient
sqlite_magic

GEM
remote: https://rubygems.org/
specs:
coderay (1.1.0)
colorize (0.7.7)
excon (0.45.4)
execjs (2.5.2)
faraday (0.9.1)
multipart-post (>= 1.2, < 3)
faraday_middleware (0.10.0)
faraday (>= 0.7.4, < 0.10)
fuzzy_match (2.1.0)
hashie (3.4.2)
httpclient (2.6.0.1)
method_source (0.8.2)
mini_portile (0.6.2)
multipart-post (2.0.0)
nokogiri (1.6.6.2)
mini_portile (~> 0.6.0)
open-uri-cached (0.0.5)
pry (0.10.1)
coderay (~> 1.1.0)
method_source (~> 0.8.1)
slop (~> 3.4)
slop (3.6.0)
sqlite3 (1.3.10)
sqlite_magic (0.0.3)
sqlite3
wikidata-client (0.0.7)
excon (~> 0.40)
faraday (~> 0.9)
faraday_middleware (~> 0.9)
hashie (~> 3.3)

PLATFORMS
ruby

DEPENDENCIES
colorize
execjs
fuzzy_match
nokogiri
open-uri-cached
pry
scraperwiki!
wikidata-client (~> 0.0.7)
1 change: 1 addition & 0 deletions README.md
@@ -0,0 +1 @@
This is a scraper that runs on [Morph](https://morph.io). To get started, [see the documentation](https://morph.io/documentation).
38 changes: 38 additions & 0 deletions scraper.rb
@@ -0,0 +1,38 @@
#!/bin/env ruby
# encoding: utf-8

require 'scraperwiki'
require 'nokogiri'
require 'colorize'
require 'pry'
require 'open-uri/cached'
# Point open-uri/cached at a local '.cache' directory for its cache files.
OpenURI::Cache.cache_path = '.cache'

class String
  # Returns a copy of the string in which every run of whitespace
  # (anything matched by [[:space:]]) is reduced to a single space and
  # the ends are stripped.
  def tidy
    collapsed = gsub(/[[:space:]]+/, ' ')
    collapsed.strip
  end
end

# Fetch +url+ and parse the response body as HTML.
#
# Uses Kernel#open, which accepts URLs here because open-uri/cached is
# required at the top of the file.
#
# @param url [String] address of the page to download
# @return the document produced by Nokogiri::HTML for the response body
def noko_for(url)
  # Block form: the handle is closed as soon as the body has been read,
  # rather than being left open until it is garbage-collected.
  open(url) { |io| Nokogiri::HTML(io.read) }
end

# Scrape the membership tables on the page at +url+ and save one record
# per table row.
#
# Only tables containing a header cell whose text is exactly
# "Circonscription" are considered, and only rows that have <td> cells.
# Each record is saved through ScraperWiki.save_sqlite with
# name + area + term as the unique key.
#
# @param url [String] page to scrape (also stored in each record's :source)
def scrape_list(url)
  page = noko_for(url)
  member_rows = page.xpath('//table[.//th[text()="Circonscription"]]//tr[td]')

  member_rows.each do |row|
    # The term is the first run of digits in the last h3 headline span
    # found before this row.
    heading = row.xpath('preceding::h3/span[@class="mw-headline"]').last
    term_number = heading.text[/(\d+)/, 1]

    cells = row.css('td')
    name_cell = cells[0]

    record = {
      name: name_cell.text.tidy,
      # Titles of links in the name cell, skipping links whose class is "new".
      wikiname: name_cell.xpath('.//a[not(@class="new")]/@title').text,
      party: cells[1].text.tidy,
      area: cells[2].text.tidy,
      term: term_number,
      source: url.to_s,
    }

    ScraperWiki.save_sqlite(%i[name area term], record)
  end
end

# Entry point: run the scraper against the French Wikipedia article below.
scrape_list('https://fr.wikipedia.org/wiki/Conseil_territorial_de_Saint-Pierre-et-Miquelon')

0 comments on commit 696b0bb

Please sign in to comment.