Skip to content

Commit

Permalink
initial scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
tmtmtmtm committed Oct 10, 2015
1 parent 474279a commit e547bdb
Showing 1 changed file with 40 additions and 0 deletions.
40 changes: 40 additions & 0 deletions scraper.rb
@@ -0,0 +1,40 @@
#!/bin/env ruby
# encoding: utf-8

require 'scraperwiki'
require 'nokogiri'
require 'colorize'
require 'pry'
require 'open-uri/cached'
OpenURI::Cache.cache_path = '.cache'

class String
  # Returns a new string in which every run of whitespace (anything matched
  # by the POSIX [[:space:]] class) is squashed into a single space, with
  # leading and trailing whitespace removed. The receiver is not modified.
  def tidy
    collapsed = gsub(/[[:space:]]+/, ' ')
    collapsed.strip
  end
end

# Fetches +url+ and parses the response body as an HTML document.
#
# The URL is opened through open-uri's URI#open instead of Kernel#open:
# Kernel#open no longer handles URLs as of Ruby 3.0, and on older Rubies it
# would execute a string starting with "|" as a shell command. The block
# form is used so the underlying IO is closed once the body has been read.
#
# @param url [String] absolute http(s)/ftp URL of the page to fetch
# @return the document produced by Nokogiri::HTML for the fetched body
# @raise [URI::InvalidURIError] if +url+ cannot be parsed as a URI
def noko_for(url)
  Nokogiri::HTML(URI.parse(url).open(&:read))
end

# Scrapes the membership table(s) on the page at +url+ and stores one record
# per table row via ScraperWiki.save_sqlite.
#
# Rows are taken from every table that has a header cell containing "MP";
# only rows with at least one <td> are considered. For each row the second
# cell supplies the member and the third cell supplies the party. The
# "wikiname" values are the concatenated title attributes of the anchors in
# the cell whose class is not exactly "new". The area is the text of the
# last h3 "mw-headline" span that precedes the member cell in the document.
#
# @param url [String] page to scrape; also recorded as each record's source
# @raise [RuntimeError] "No rows" when no matching table rows are found
def scrape_list(url)
  page = noko_for(url)
  member_rows = page.xpath('//table[.//th[contains(.,"MP")]]//tr[td]')
  raise "No rows" if member_rows.empty?

  member_rows.each do |row|
    cells = row.css('td')
    member_cell = cells[1]
    party_cell = cells[2]

    record = {
      name: member_cell.text.tidy,
      wikiname: member_cell.xpath('.//a[not(@class="new")]/@title').text,
      party: party_cell.text.tidy,
      party_wikiname: party_cell.xpath('.//a[not(@class="new")]/@title').text,
      area: member_cell.xpath('preceding::h3/span[@class="mw-headline"]').last.text,
      term: 5, # legislative term is fixed for this list
      source: url,
    }

    # name + area + party + term together form the record's unique key
    ScraperWiki.save_sqlite(%i[name area party term], record)
  end
end

# Script entry point: scrape the English Wikipedia list of MPs of the National Assembly of Cambodia.
scrape_list("https://en.wikipedia.org/wiki/List_of_MPs_of_the_National_Assembly_of_Cambodia")

0 comments on commit e547bdb

Please sign in to comment.