Skip to content

Commit

Permalink
Initial scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
tmtmtmtm committed Dec 21, 2015
1 parent 44190f1 commit f793576
Showing 1 changed file with 41 additions and 0 deletions.
41 changes: 41 additions & 0 deletions scraper.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/env ruby
# encoding: utf-8

require 'rest-client'
require 'scraperwiki'
require 'wikidata/fetcher'
require 'nokogiri'
require 'colorize'
require 'pry'
require 'open-uri/cached'
OpenURI::Cache.cache_path = '.cache'


def noko_for(url)
Nokogiri::HTML(open(URI.escape(URI.unescape(url))).read)
end

def wikinames_from(url)
noko = noko_for(url)
sd = 'Список депутатов'
names = noko.xpath('//h2/span[.="Список депутатов"]//following::table[1]//td[1]//a[not(@class="new")]/@title').map(&:text).uniq
raise "No names found in #{url}" if names.count.zero?
return names
end

def fetch_info(names)
WikiData.ids_from_pages('ru', names).each do |name, id|
data = WikiData::Fetcher.new(id: id).data('ru') rescue nil
unless data
warn "No data for #{p}"
next
end
data[:original_wikiname] = name
ScraperWiki.save_sqlite([:id], data)
end
end

fetch_info wikinames_from('https://ru.wikipedia.org/wiki/%D0%93%D0%BE%D1%81%D1%83%D0%B4%D0%B0%D1%80%D1%81%D1%82%D0%B2%D0%B5%D0%BD%D0%BD%D0%B0%D1%8F_%D0%B4%D1%83%D0%BC%D0%B0_%D0%A4%D0%B5%D0%B4%D0%B5%D1%80%D0%B0%D0%BB%D1%8C%D0%BD%D0%BE%D0%B3%D0%BE_%D1%81%D0%BE%D0%B1%D1%80%D0%B0%D0%BD%D0%B8%D1%8F_%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D0%B9%D1%81%D0%BA%D0%BE%D0%B9_%D0%A4%D0%B5%D0%B4%D0%B5%D1%80%D0%B0%D1%86%D0%B8%D0%B8_VI_%D1%81%D0%BE%D0%B7%D1%8B%D0%B2%D0%B0')

warn RestClient.post ENV['MORPH_REBUILDER_URL'], {} if ENV['MORPH_REBUILDER_URL']

0 comments on commit f793576

Please sign in to comment.