Skip to content

Commit

Permalink
initial scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
tmtmtmtm committed Sep 12, 2015
0 parents commit a9213c1
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@

.cache/*

*.swp

*.sqlite
16 changes: 16 additions & 0 deletions Gemfile
@@ -0,0 +1,16 @@
# It's easy to add more libraries or choose different versions. Any libraries
# specified here will be installed and made available to your morph.io scraper.
# Find out more: https://morph.io/documentation/ruby

source "https://rubygems.org"

ruby "2.0.0"

gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
gem "execjs"
gem "pry"
gem "colorize"
gem "nokogiri"
gem "open-uri-cached"
gem "fuzzy_match"
# Quoting normalized to double quotes for consistency with the rest of the file.
gem "wikidata-client", "~> 0.0.7", require: "wikidata"
55 changes: 55 additions & 0 deletions Gemfile.lock
@@ -0,0 +1,55 @@
GIT
remote: https://github.com/openaustralia/scraperwiki-ruby.git
revision: fc50176812505e463077d5c673d504a6a234aa78
branch: morph_defaults
specs:
scraperwiki (3.0.1)
httpclient
sqlite_magic

GEM
remote: https://rubygems.org/
specs:
coderay (1.1.0)
colorize (0.7.7)
excon (0.45.4)
execjs (2.5.2)
faraday (0.9.1)
multipart-post (>= 1.2, < 3)
faraday_middleware (0.10.0)
faraday (>= 0.7.4, < 0.10)
fuzzy_match (2.1.0)
hashie (3.4.2)
httpclient (2.6.0.1)
method_source (0.8.2)
mini_portile (0.6.2)
multipart-post (2.0.0)
nokogiri (1.6.6.2)
mini_portile (~> 0.6.0)
open-uri-cached (0.0.5)
pry (0.10.1)
coderay (~> 1.1.0)
method_source (~> 0.8.1)
slop (~> 3.4)
slop (3.6.0)
sqlite3 (1.3.10)
sqlite_magic (0.0.3)
sqlite3
wikidata-client (0.0.7)
excon (~> 0.40)
faraday (~> 0.9)
faraday_middleware (~> 0.9)
hashie (~> 3.3)

PLATFORMS
ruby

DEPENDENCIES
colorize
execjs
fuzzy_match
nokogiri
open-uri-cached
pry
scraperwiki!
wikidata-client (~> 0.0.7)
1 change: 1 addition & 0 deletions README.md
@@ -0,0 +1 @@
This is a scraper that runs on [Morph](https://morph.io). To get started, [see the documentation](https://morph.io/documentation).
84 changes: 84 additions & 0 deletions scraper.rb
@@ -0,0 +1,84 @@
#!/bin/env ruby
# encoding: utf-8

require 'scraperwiki'
require 'nokogiri'
require 'colorize'
require 'pry'
require 'open-uri/cached'
# Cache every open-uri fetch on disk so repeated runs do not re-download pages.
OpenURI::Cache.cache_path = '.cache'

class String
  # Collapse every run of (Unicode) whitespace into a single ASCII space,
  # then trim leading/trailing whitespace.
  def tidy
    gsub(/[[:space:]]+/, ' ').strip
  end
end

# Fetch +url+ (via the open-uri disk cache) and return the parsed
# Nokogiri HTML document.
def noko_for(url)
  body = open(url).read
  Nokogiri::HTML(body)
end

# Scrape one listing page of parliamentarians for +term+, save each member
# (merged with their detail page) to SQLite keyed on [:id, :term], and
# recurse through any "next page" pagination link.
def scrape_term(term, url)
  noko = noko_for(url)
  noko.css('li.parl-i').each do |li|
    etrap = li.css('.parl-i--d').text.tidy

    if etrap.downcase.include? 'okrug'
      # Entries look like: Saýlaw etrap №12-nji "Name" saýlaw okrugy
      # First capture is the district number, second is its name.
      # BUGFIX: targets were swapped (area got the number, area_id the name),
      # the reverse of the convention used in the else branch below.
      area_id, area = li.css('.parl-i--d').text.match(/Saýlaw etrap №(\d+)-nj[iy] "(.*?)" saýlaw okrugy/).captures
    else
      area_id = li.css('.parl-i--d').text[/Saýlaw etrap №(\d+)/, 1]
      area = etrap
    end

    person = {
      id: li.css('a/@href').text.split('/').last,
      name: li.css('a').text.tidy,
      area: area,
      area_id: area_id,
      image: li.css('img/@src').text,
      term: term[:id],
      source: li.css('a/@href').text,
    }
    # Absolutize relative image/source URLs against the term's base URL.
    person[:image] = URI.join(term[:source], person[:image]).to_s unless person[:image].to_s.empty?
    person[:source] = URI.join(term[:source], person[:source]).to_s unless person[:source].to_s.empty?
    data = person.merge(scrape_person(person))
    ScraperWiki.save_sqlite([:id, :term], data)
  end

  unless (next_page = noko.css('.p-l .p-i--next a/@href')).empty?
    scrape_term(term, URI.join(url, next_page.text))
  end
end

# Fetch a member's detail page (person[:source]) and return a Hash of
# extra biographical fields: :district, :party and :birth_year.
def scrape_person(person)
  noko = noko_for(person[:source])
  # Removed the unused `data =` local; the hash literal is the return value.
  {
    district: noko.css('.bio-i--rg .bio-i--cnt').text,
    party: noko.css('.bio-i--lbl .bio-i--cnt').text,
    birth_year: noko.css('.bio-i--by .bio-i--cnt').text,
  }
end

# Parliamentary terms to scrape, newest first, each with its listing URL.
[
  {
    id: 5,
    name: '5th Convocation',
    start_date: '2013',
    source: 'http://mejlis2.bushluk.com/tm/parliamentaries/search/?convocation=14118',
  },
  {
    id: 4,
    name: '4th Convocation',
    start_date: '2009',
    end_date: '2013',
    source: 'http://mejlis2.bushluk.com/tm/parliamentaries/search/?convocation=2159',
  },
].each do |term|
  puts term
  # ScraperWiki.save_sqlite([:id], term, 'terms')
  scrape_term(term, term[:source])
end


0 comments on commit a9213c1

Please sign in to comment.