Skip to content

Commit

Permalink
Modernise scraper
Browse files: browse the repository at this point in the history
  • Loading branch information
tmtmtmtm committed Jan 14, 2016
1 parent a5dc43f commit b6fdfd5
Showing 1 changed file with 6 additions and 26 deletions.
32 changes: 6 additions & 26 deletions scraper.rb
Original file line number Diff line number Diff line change
@@ -1,32 +1,12 @@
#!/bin/env ruby
# encoding: utf-8

require 'scraperwiki'
require 'wikidata/fetcher'
require 'nokogiri'
require 'open-uri'
require 'pry'
require 'rest-client'

# Download +url+ and hand the response body to Nokogiri's HTML parser.
#
# URI.open (supplied by open-uri) is used instead of a bare +open+ because
# Kernel#open no longer fetches URLs as of Ruby 3.0. The block form makes sure
# the underlying stream is closed as soon as its contents have been read.
#
# @param url [String] address of the page to fetch
# @return the value returned by Nokogiri::HTML for the fetched markup
def noko_for(url)
  Nokogiri::HTML(URI.open(url, &:read))
end

# Return the link titles found in the member table of the page at +url+.
#
# The XPath picks tables that have a header cell containing "Nume", then, for
# every row with data cells, the +title+ attribute of each anchor in the second
# cell. Anchors whose class is exactly "new" are skipped (presumably links to
# pages that do not exist yet -- confirm against the wiki markup).
#
# @param url [String] address of the page to inspect
# @return [Array] the text of every matching +title+ attribute
def wikinames(url)
  title_attrs = noko_for(url).xpath(
    '//table[.//th[contains(.,"Nume")]]//tr[td]/td[2]//a[not(@class="new")]/@title'
  )
  title_attrs.map { |attr| attr.text }
end

# Gather the article titles listed on the legislature page; there is nothing
# to do if the page yielded none.
names = wikinames('https://ro.wikipedia.org/wiki/Legislatura_2014-2018_(Republica_Moldova)')
abort "No names" if names.empty?

# Resolve each title to an id, fetch its Romanian-language data and store it.
# Each yielded +page+ is an entry whose last element is the id.
WikiData.ids_from_pages('ro', names).each do |page|
  data = begin
    WikiData::Fetcher.new(id: page.last).data('ro')
  rescue StandardError => e
    # One failing entry must not stop the run, but say why it failed instead
    # of silently discarding the error.
    warn "Fetch error for #{page}: #{e.class}: #{e.message}"
    nil
  end
  unless data
    warn "No data for #{page}"
    next
  end
  ScraperWiki.save_sqlite([:id], data)
end

# POST to the rebuilder hook only when one is configured in the environment.
rebuilder_url = ENV['MORPH_REBUILDER_URL']
warn RestClient.post(rebuilder_url, {}) if rebuilder_url

# Page to read and the XPath that selects the +title+ attribute of each
# non-"new" link in the second cell of the table headed "Nume".
page_url = 'https://ro.wikipedia.org/wiki/Legislatura_2014-2018_(Republica_Moldova)'
title_xpath = '//table[.//th[contains(.,"Nume")]]//tr[td]/td[2]//a[not(@class="new")]/@title'

# Hand the page and selector to the library, pass the result on keyed by the
# :ro language code, then emit whatever notify_rebuilder returns via warn.
names = EveryPolitician::Wikidata.wikipedia_xpath(url: page_url, xpath: title_xpath)
EveryPolitician::Wikidata.scrape_wikidata(names: { ro: names })
warn EveryPolitician::Wikidata.notify_rebuilder

0 comments on commit b6fdfd5

Please sign in to comment.