Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
initial scraper
  • Loading branch information
tmtmtmtm committed Sep 13, 2015
0 parents commit 2c5e35d
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@

.cache/*

*.swp

*.sqlite
17 changes: 17 additions & 0 deletions Gemfile
@@ -0,0 +1,17 @@
# Gems declared in this file are installed and made available to the morph.io
# scraper. Add further libraries, or pick different versions, as needed.
# Documentation: https://morph.io/documentation/ruby

source 'https://rubygems.org'

ruby '2.0.0'

# Gems resolved from the default source above.
gem 'colorize'
gem 'mediawiki_api'
gem 'nokogiri'
gem 'open-uri-cached'
gem 'pry'
gem 'rest-client'

# Gems installed directly from their git repositories.
gem 'scraperwiki',
    git: 'https://github.com/openaustralia/scraperwiki-ruby.git',
    branch: 'morph_defaults'
gem 'wikidata-fetcher', '>=0.4.0',
    git: 'https://github.com/everypolitician/wikidata-fetcher.git'

83 changes: 83 additions & 0 deletions Gemfile.lock
@@ -0,0 +1,83 @@
GIT
remote: https://github.com/everypolitician/wikidata-fetcher.git
revision: 914fea8b17b047a143ed3667d650f1fd0e221f1f
specs:
wikidata-fetcher (0.4.0)
colorize
diskcached
mediawiki_api
wikidata-client (~> 0.0.7)

GIT
remote: https://github.com/openaustralia/scraperwiki-ruby.git
revision: fc50176812505e463077d5c673d504a6a234aa78
branch: morph_defaults
specs:
scraperwiki (3.0.1)
httpclient
sqlite_magic

GEM
remote: https://rubygems.org/
specs:
coderay (1.1.0)
colorize (0.7.7)
diskcached (1.1.2)
domain_name (0.5.24)
unf (>= 0.0.5, < 1.0.0)
excon (0.45.4)
faraday (0.9.1)
multipart-post (>= 1.2, < 3)
faraday-cookie_jar (0.0.6)
faraday (>= 0.7.4)
http-cookie (~> 1.0.0)
faraday_middleware (0.10.0)
faraday (>= 0.7.4, < 0.10)
hashie (3.4.2)
http-cookie (1.0.2)
domain_name (~> 0.5)
httpclient (2.6.0.1)
mediawiki_api (0.4.1)
faraday (~> 0.9, >= 0.9.0)
faraday-cookie_jar (~> 0.0, >= 0.0.6)
method_source (0.8.2)
mime-types (2.6.1)
mini_portile (0.6.2)
multipart-post (2.0.0)
netrc (0.10.3)
nokogiri (1.6.6.2)
mini_portile (~> 0.6.0)
open-uri-cached (0.0.5)
pry (0.10.1)
coderay (~> 1.1.0)
method_source (~> 0.8.1)
slop (~> 3.4)
rest-client (1.8.0)
http-cookie (>= 1.0.2, < 2.0)
mime-types (>= 1.16, < 3.0)
netrc (~> 0.7)
slop (3.6.0)
sqlite3 (1.3.10)
sqlite_magic (0.0.3)
sqlite3
unf (0.1.4)
unf_ext
unf_ext (0.0.7.1)
wikidata-client (0.0.8)
excon (~> 0.40)
faraday (~> 0.9)
faraday_middleware (~> 0.9)
hashie (~> 3.3)

PLATFORMS
ruby

DEPENDENCIES
colorize
mediawiki_api
nokogiri
open-uri-cached
pry
rest-client
scraperwiki!
wikidata-fetcher (>= 0.4.0)!
29 changes: 29 additions & 0 deletions scraper.rb
@@ -0,0 +1,29 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'json'
require 'pry'
require 'rest-client'
require 'scraperwiki'
require 'wikidata/fetcher'
require 'mediawiki_api'

# Query the morph.io data API for the distinct `wikiname` values stored by the
# tmtmtmtm/northern_cyprus_parliament_wikipedia scraper.
#
# The API key is read from the MORPH_API_KEY environment variable.
#
# @return the parsed JSON response with symbolized keys (presumably an Array of
#   row Hashes; callers read `:wikiname` from each element)
# @raise [KeyError] if MORPH_API_KEY is not set
def members
  morph_api_url = 'https://api.morph.io/tmtmtmtm/northern_cyprus_parliament_wikipedia/data.json'
  # Fail fast with a clear error instead of sending a request with a nil key.
  morph_api_key = ENV.fetch('MORPH_API_KEY')
  response = RestClient.get(
    morph_api_url,
    params: {
      key: morph_api_key,
      query: 'select DISTINCT(wikiname) AS wikiname from data'
    }
  )
  JSON.parse(response, symbolize_names: true)
end

# Look up a Wikidata id for every member wikiname (lookup is done against 'en'),
# fetch each item's data and save it to the scraper's SQLite store keyed on :id.
wikinames = members.map { |member| member[:wikiname] }
WikiData.ids_from_pages('en', wikinames).each do |page|
  # `page.last` is passed as the item id; presumably each element is a
  # [wikiname, id] pair -- confirm against WikiData.ids_from_pages.
  begin
    data = WikiData::Fetcher.new(id: page.last).data('tr')
  rescue StandardError => e
    # Best-effort: a single failing item must not abort the whole run, but the
    # reason is reported instead of being silently discarded.
    warn "No data for #{page} (#{e.class}: #{e.message})"
    next
  end

  unless data
    warn "No data for #{page}"
    next
  end

  ScraperWiki.save_sqlite([:id], data)
end

0 comments on commit 2c5e35d

Please sign in to comment.