Commit

migrate to Scraped
tmtmtmtm committed Jul 18, 2018
1 parent c170514 commit 9543157
Showing 4 changed files with 146 additions and 27 deletions.
10 changes: 8 additions & 2 deletions Gemfile
@@ -2,22 +2,28 @@

source 'https://rubygems.org'

ruby '2.3.3'
ruby '2.4.1'

git_source(:github) { |repo_name| "https://github.com/#{repo_name}.git" }

gem 'combine_popolo_memberships', github: 'everypolitician/combine_popolo_memberships'
gem 'execjs'
gem 'minitest'
gem 'minitest-around'
gem 'minitest-vcr'
gem 'nokogiri'
gem 'open-uri-cached'
gem 'pry'
gem 'rake'
gem 'rest-client'
gem 'rubocop'
gem 'scraped', github: 'everypolitician/scraped'
gem 'scraped', github: 'everypolitician/scraped', branch: 'scraper-class'
gem 'scraped_page_archive', github: 'everypolitician/scraped_page_archive'
gem 'scraper_test', github: 'everypolitician/scraper_test'
gem 'scraperwiki', github: 'openaustralia/scraperwiki-ruby',
branch: 'morph_defaults'
gem 'table_unspanner', github: 'everypolitician/table_unspanner'
gem 'vcr'
gem 'webmock'
gem 'wikidata-fetcher', github: 'everypolitician/wikidata-fetcher'
gem 'wikidata_ids_decorator', github: 'everypolitician/wikidata_ids_decorator'
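
Among these dependency changes, the one that drives the migration is pinning scraped to the scraper-class branch, which appears to supply the Scraped::Scraper entry point used in the rewritten scraper.rb below (hence the branch pin). A minimal sketch of that pattern, using a hypothetical page class and URL, looks like this:

    require 'scraped'
    require 'scraperwiki'

    # Hypothetical page class, for illustration only
    class ExamplePage < Scraped::HTML
      field :people do
        noko.css('li').map { |li| { name: li.text.strip } }
      end
    end

    # Map a URL to a page class and store one field, keyed on :name
    # (this mirrors the call at the bottom of the new scraper.rb)
    Scraped::Scraper.new('https://example.com/members' => ExamplePage)
                    .store(:people, index: %i[name])
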
94 changes: 92 additions & 2 deletions Gemfile.lock
@@ -1,6 +1,13 @@
GIT
remote: https://github.com/everypolitician/combine_popolo_memberships.git
revision: 5769841c55abc712ce59835a0fb944b1de12d982
specs:
combine_popolo_memberships (0.2.0)

GIT
remote: https://github.com/everypolitician/scraped.git
revision: 7fd43913456af49d32e882d0cb79f3da1f5117ef
revision: ecb23adeca95fba5356509d6445d528e212b3905
branch: scraper-class
specs:
scraped (0.6.2)
field_serializer (>= 0.3.0)
@@ -15,13 +22,48 @@ GIT
git (~> 1.3.0)
vcr-archive (~> 0.3.0)

GIT
remote: https://github.com/everypolitician/scraper_test.git
revision: 9b4326c1e04ea7c8caf368d99bb5ac3711726199
specs:
scraper_test (0.1.0)
minitest (~> 5.0)
pry
vcr (>= 3.0.3)
webmock (>= 2.0)

GIT
remote: https://github.com/everypolitician/table_unspanner.git
revision: a70a98a104a75b470f4ea339fdd728366a40b4d8
specs:
table_unspanner (0.1.0)
nokogiri

GIT
remote: https://github.com/everypolitician/wikidata-fetcher.git
revision: 0bef78c6f8070a66211c5a6ef78417f66483a50a
specs:
wikidata-fetcher (0.21.0)
colorize
diskcached
json
mediawiki_api
nokogiri
require_all
rest-client
scraperwiki
wikidata-client (~> 0.0.7)
wikisnakker

GIT
remote: https://github.com/everypolitician/wikidata_ids_decorator.git
revision: 17265d480df935631957a8188de38bdd42cbd29f
specs:
wikidata_ids_decorator (0.2.0)
pry
scraped
wikidata-fetcher

GIT
remote: https://github.com/openaustralia/scraperwiki-ruby.git
revision: fc50176812505e463077d5c673d504a6a234aa78
@@ -38,14 +80,38 @@ GEM
public_suffix (>= 2.0.2, < 4.0)
ast (2.4.0)
coderay (1.1.2)
colorize (0.8.1)
crack (0.4.3)
safe_yaml (~> 1.0.0)
diskcached (1.1.3)
domain_name (0.5.20180417)
unf (>= 0.0.5, < 1.0.0)
excon (0.62.0)
execjs (2.7.0)
faraday (0.15.2)
multipart-post (>= 1.2, < 3)
faraday-cookie_jar (0.0.6)
faraday (>= 0.7.4)
http-cookie (~> 1.0.0)
faraday_middleware (0.12.2)
faraday (>= 0.7.4, < 1.0)
field_serializer (0.3.0)
git (1.3.0)
hashdiff (0.3.7)
hashie (3.5.7)
http-cookie (1.0.3)
domain_name (~> 0.5)
httpclient (2.8.3)
jaro_winkler (1.5.1)
json (2.1.0)
mediawiki_api (0.7.1)
faraday (~> 0.9, >= 0.9.0)
faraday-cookie_jar (~> 0.0, >= 0.0.6)
faraday_middleware (~> 0.10, >= 0.10.0)
method_source (0.9.0)
mime-types (3.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2016.0521)
mini_portile2 (2.3.0)
minispec-metadata (2.0.0)
minitest
@@ -56,6 +122,8 @@ GEM
minispec-metadata (~> 2.0)
minitest (>= 4.7.5)
vcr (>= 2.9)
multipart-post (2.0.0)
netrc (0.11.0)
nokogiri (1.8.4)
mini_portile2 (~> 2.3.0)
open-uri-cached (0.0.5)
@@ -70,6 +138,10 @@ GEM
rainbow (3.0.0)
rake (12.3.1)
require_all (2.0.0)
rest-client (2.0.2)
http-cookie (>= 1.0.2, < 2.0)
mime-types (>= 1.16, < 4.0)
netrc (~> 0.8)
rubocop (0.58.1)
jaro_winkler (~> 1.5.1)
parallel (~> 1.10)
@@ -83,6 +155,9 @@ GEM
sqlite3 (1.3.13)
sqlite_magic (0.0.6)
sqlite3
unf (0.1.4)
unf_ext
unf_ext (0.0.7.5)
unicode-display_width (1.4.0)
vcr (3.0.3)
vcr-archive (0.3.0)
@@ -92,28 +167,43 @@ GEM
addressable (>= 2.3.6)
crack (>= 0.3.2)
hashdiff
wikidata-client (0.0.12)
excon (~> 0.40)
faraday (~> 0.9)
faraday_middleware (~> 0.9)
hashie (~> 3.3)
wikisnakker (0.9.1)
require_all
yajl-ruby
yajl-ruby (1.4.0)

PLATFORMS
ruby

DEPENDENCIES
combine_popolo_memberships!
execjs
minitest
minitest-around
minitest-vcr
nokogiri
open-uri-cached
pry
rake
rest-client
rubocop
scraped!
scraped_page_archive!
scraper_test!
scraperwiki!
table_unspanner!
vcr
webmock
wikidata-fetcher!
wikidata_ids_decorator!

RUBY VERSION
ruby 2.3.3p222
ruby 2.4.1p111

BUNDLED WITH
1.16.2
3 changes: 2 additions & 1 deletion Rakefile
@@ -1,6 +1,7 @@
# frozen_string_literal: true

require 'rubocop/rake_task'

RuboCop::RakeTask.new

task default: %w(rubocop)
task default: %w[rubocop]
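
The Rakefile tweak is purely stylistic: %w[] and %w() build the same array of words, and square brackets are the delimiter that RuboCop's default Style/PercentLiteralDelimiters configuration expects for %w literals. For example:

    %w(rubocop) == %w[rubocop] # => true; both are ["rubocop"]
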
66 changes: 44 additions & 22 deletions scraper.rb 100644 → 100755
@@ -1,37 +1,59 @@
#!/bin/env ruby
# encoding: utf-8
# frozen_string_literal: true

require 'pry'
require 'scraped'
require 'scraperwiki'
require 'wikidata_ids_decorator'

require 'open-uri/cached'
OpenURI::Cache.cache_path = '.cache'

def noko_for(url)
Nokogiri::HTML(open(url).read)
class MembersPage < Scraped::HTML
decorator WikidataIdsDecorator::Links

field :members do
member_rows.map { |row| fragment(row => MemberRow).to_h }
end

private

def member_tables
noko.xpath('//table[.//th[contains(.,"MP")]]')
end

def member_rows
member_tables.xpath('.//tr[td[2]]')
end
end

def scrape_list(url)
noko = noko_for(url)
rows = noko.xpath('//table[.//th[contains(.,"MP")]]//tr[td]')
raise 'No rows' if rows.count.zero?
rows.each do |tr|
td = tr.css('td')
data = {
name: td[1].text.tidy,
wikiname: td[1].xpath('.//a[not(@class="new")]/@title').text,
party: td[2].text.tidy,
party_wikiname: td[2].xpath('.//a[not(@class="new")]/@title').text,
area: td[1].xpath('preceding::h3/span[@class="mw-headline"]').last.text,
term: 5,
source: url,
}
puts data.reject { |_, v| v.to_s.empty? }.sort_by { |k, _| k }.to_h if ENV['MORPH_DEBUG']
ScraperWiki.save_sqlite(%i(name area party term), data)
class MemberRow < Scraped::HTML
field :id do
tds[1].css('a/@wikidata').map(&:text).first
end

field :name do
tds[1].text.tidy
end

field :party do
tds[2].text.tidy
end

field :party_id do
tds[2].css('a/@wikidata').map(&:text).first
end

field :area do
noko.xpath('preceding::h3/span[@class="mw-headline"]').last.text
end

private

def tds
noko.css('td')
end
end

ScraperWiki.sqliteexecute('DELETE FROM data') rescue nil
scrape_list('https://en.wikipedia.org/wiki/List_of_MPs_of_the_National_Assembly_of_Cambodia')
url = 'https://en.wikipedia.org/wiki/List_of_MPs_of_the_National_Assembly_of_Cambodia'
Scraped::Scraper.new(url => MembersPage).store(:members, index: %i[name party area])
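
In the migrated scraper, the WikidataIdsDecorator::Links decorator annotates the page's links with a wikidata attribute, which is why MemberRow can read the member and party IDs via css('a/@wikidata'). Each qualifying table row then becomes one hash of MemberRow's fields, and Scraped::Scraper writes those hashes to the ScraperWiki data table keyed on name, party and area. Roughly, a stored record takes this shape (the values here are invented for illustration):

    {
      id:       'Q00000001',       # Wikidata item for the member, if linked
      name:     'Example Member',
      party:    'Example Party',
      party_id: 'Q00000002',       # Wikidata item for the party, if linked
      area:     'Example Constituency'
    }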
