Skip to content

Commit

Permalink
Modernise scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
tmtmtmtm committed Feb 24, 2016
1 parent fa29198 commit 640484f
Showing 1 changed file with 5 additions and 33 deletions.
38 changes: 5 additions & 33 deletions scraper.rb
@@ -1,39 +1,11 @@
#!/bin/env ruby
# encoding: utf-8

require 'rest-client'
require 'scraperwiki'
require 'wikidata/fetcher'
require 'nokogiri'
require 'colorize'
require 'pry'
require 'open-uri/cached'
OpenURI::Cache.cache_path = '.cache'

# Page holding the member list, and the XPath used to pull names out of it.
# The XPath selects the `title` attribute of every link whose class is not
# exactly "new", taken from the first cell of each data row of any table
# that has a "Nama" header cell.
# NOTE(review): presumably class "new" marks links to missing pages — confirm.
member_list_url = 'https://id.wikipedia.org/wiki/Daftar_anggota_DPR_RI_2014–2019'
linked_title_xpath = '//table[.//tr[th[.="Nama"]]]//tr[td]//td[1]//a[not(@class="new")]/@title'

names = EveryPolitician::Wikidata.wikipedia_xpath(url: member_list_url, xpath: linked_title_xpath)

# Fetches +url+ and returns its body parsed by Nokogiri::HTML.
#
# The URL is unescaped and then re-escaped before being opened, so an
# already percent-encoded URL is not escaped a second time.
#
# NOTE(review): URI.escape / URI.unescape were removed in Ruby 3.0 — confirm
# the Ruby version this script is expected to run on.
def noko_for(url)
  normalised_url = URI.escape(URI.unescape(url))
  html = open(normalised_url).read
  Nokogiri::HTML(html)
end

# Returns the text of every matching link `title` attribute on the page at
# +url+, in the order the XPath yields them.
#
# Matches are links whose class is not exactly "new", taken from the first
# cell of each data row of any table that has a "Nama" header cell.
# Aborts the process with "No names" when nothing matches.
def wikinames_from(url)
  title_attrs = noko_for(url).xpath('//table[.//tr[th[.="Nama"]]]//tr[td]//td[1]//a[not(@class="new")]/@title')
  titles = title_attrs.map(&:text)
  abort "No names" if titles.empty?
  titles
end

# Resolves each page title in +names+ to an ID via WikiData.ids_from_pages,
# fetches the data for that ID and saves it with ScraperWiki.save_sqlite,
# keyed on :id.
#
# names - page titles, passed to WikiData.ids_from_pages together with the
#         'id' language code (presumably Indonesian Wikipedia — confirm).
#
# Each saved record also carries the page title it was resolved from under
# :original_wikiname. A title whose fetch raises a StandardError, or returns
# nothing, is reported on stderr and skipped; the remaining titles are still
# processed.
def fetch_info(names)
  WikiData.ids_from_pages('id', names).each do |name, id|
    data =
      begin
        WikiData::Fetcher.new(id: id).data('id')
      rescue StandardError
        nil # best-effort: the failure is reported by the warning below
      end

    unless data
      # Name the page that failed. The old message interpolated `p`, which
      # is Kernel#p called with no arguments and always evaluates to nil.
      warn "No data for #{name}"
      next
    end

    data[:original_wikiname] = name
    ScraperWiki.save_sqlite([:id], data)
  end
end

# Scrape the member page titles and fetch-and-save the data for each one.
member_titles = wikinames_from('https://id.wikipedia.org/wiki/Daftar_anggota_DPR_RI_2014%E2%80%932019')
fetch_info(member_titles)

# When MORPH_REBUILDER_URL is set, POST an empty payload to it and echo the
# response on stderr.
rebuilder_url = ENV['MORPH_REBUILDER_URL']
warn RestClient.post(rebuilder_url, {}) if rebuilder_url

# Hand the scraped page titles (under the :id key) to the shared
# Wikidata scraper, with output: false.
EveryPolitician::Wikidata.scrape_wikidata(
  names: { id: names },
  output: false
)

0 comments on commit 640484f

Please sign in to comment.