Skip to content

Commit

Permalink
Add Lookup table for existing members
Browse files Browse the repository at this point in the history
Try to find a suitable match for anyone with no Wikidata ID.
  • Loading branch information
tmtmtmtm committed Jul 13, 2020
1 parent 20b7436 commit fb11fa5
Showing 1 changed file with 36 additions and 1 deletion.
37 changes: 36 additions & 1 deletion scraper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,34 @@
require_relative 'lib/remove_notes'
require_relative 'lib/unspan_all_tables'

# Lookup table of already-known members, used to find an ID for a person
# by exact name match.
#
# The backing file is parsed as headerless CSV. The first column of each row
# is the name that is matched on; the last column is the identifier that is
# returned (presumably a Wikidata item ID -- confirm against the file that
# the caller supplies).
class ExistingMembers
  # @param pathname [#read] object whose +read+ returns the CSV text
  #   (e.g. a Pathname); it is only read on the first lookup
  def initialize(pathname)
    @pathname = pathname
  end

  # Finds the one identifier recorded for +name+.
  #
  # @param name [String] name to look up; must equal the first CSV column exactly
  # @return [String, nil] the identifier when every row for +name+ agrees on
  #   a single one; nil when the name is unknown or is ambiguous (in the
  #   ambiguous case a warning is also written to stderr)
  def single_exact_match_for(name)
    found = by_name[name]
    return unless found

    # Duplicate rows carrying the same identifier still count as one match.
    ids = found.map(&:last).uniq
    return ids.first if ids.size == 1

    warn "More than one match for #{name}"
    nil
  end

  private

  attr_reader :pathname

  # Parsed rows of the backing file, read and parsed once.
  def csv
    @csv ||= CSV.parse(pathname.read)
  end

  # Rows grouped by their first column. Memoized so the index is built once
  # rather than being regrouped on every lookup.
  def by_name
    @by_name ||= csv.group_by(&:first)
  end
end

class MembersPage < Scraped::HTML
decorator RemoveNotes
decorator WikidataIdsDecorator::Links
Expand All @@ -25,7 +53,6 @@ def members_list
end
end


class MemberItem < Scraped::HTML
field :id do
tds[0].css('a/@wikidata').map(&:text).first
Expand Down Expand Up @@ -53,6 +80,14 @@ def tds
# The URL is plain ASCII with nothing that needs percent-escaping, so it is
# used as-is. (The previous URI.encode call is obsolete and no longer exists
# from Ruby 3.0 onwards.)
url = 'https://en.wikipedia.org/wiki/Eleventh_Seimas_of_Lithuania'
data = Scraped::Scraper.new(url => MembersPage).scraper.members

# Generated from:
# wd sparql all-members.sparql | jq -r '.[] | [.item.label, .item.value] | @csv' | sort | uniq
all_members_csv = Pathname.new('all-members.csv')
if all_members_csv.exist?
  lookup = ExistingMembers.new(all_members_csv)
  # Only fill in an ID for members that do not already have one.
  data.each { |mem| mem[:id] ||= lookup.single_exact_match_for(mem[:name]) }
end

# Emit the result as CSV on stdout: one header row, then one row per member.
header = data.first.keys.to_csv
rows = data.map { |row| row.values.to_csv }
puts header + rows.join

0 comments on commit fb11fa5

Please sign in to comment.