Skip to content

Commit

Permalink
Bring the election scraper into line with 'current'
Browse files Browse the repository at this point in the history
  • Loading branch information
wfdd committed Jun 7, 2016
1 parent 892744b commit 70d72f4
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 14 deletions.
Binary file modified data.sqlite
Binary file not shown.
4 changes: 2 additions & 2 deletions scrape_current.py
Expand Up @@ -89,7 +89,7 @@ def parse_pages(session):


def prepare_row(row, url):
area, _, family, party, _ = (i.text_content().strip() for i in row)
area, _, family, group, _ = (i.text_content().strip() for i in row)
birth_date, name = parse_bio_doc(row[-1].xpath('.//a/@href')[0])
given, family = generate_names(family, name)
id_ = create_id(name)
Expand All @@ -100,7 +100,7 @@ def prepare_row(row, url):
family + ', ' + given,
birth_date,
ids_to_gender[id_],
party,
group,
'8',
area,
ids_to_photo.get(id_, None),
Expand Down
31 changes: 19 additions & 12 deletions scrape_elected.py
Expand Up @@ -2,24 +2,18 @@
import itertools as it
import sqlite3

from scrape_current import decap_name, start_session, title_match
from scrape_current import start_session


def prepare_row(row):
first, last, *etc = (i.strip() for i in row)
first, last = decap_name(title_match.sub('', first)), decap_name(last)
return (first + ' ' + last, first, last, *etc)


def parse_table(doc):
return (prepare_row(i.text_content() for i in v.xpath('./td'))
def parse_table(doc, url):
    """Yield one prepared tuple per data row of the results grid.

    ``doc`` is the parsed page document; ``url`` is the page address,
    threaded through to ``prepare_row`` as the source reference.
    """
    # Locate the DevExpress grid's data rows up front; the per-row work
    # stays lazy in the returned generator expression.
    data_rows = doc.xpath('//table[@id="ctl00_ContentPlaceHolder1_ASPxGridView1_DXMainTable"]'
                          '//tr[@class="dxgvDataRow"]')
    return (prepare_row((td.text_content() for td in row.xpath('./td')), url)
            for row in data_rows)


def parse_pages(session):
while True:
yield parse_table(session.document())
yield parse_table(session.document(), session.url())
page = session.at_xpath('//td[@class="dxpPageNumber dxpCurrentPageNumber"]'
'/following-sibling::td[@class="dxpPageNumber"]')
if not page:
Expand All @@ -35,17 +29,30 @@ def parse_pages(session):
...


def prepare_row(row, url):
    """Normalise one scraped table row into a database record tuple.

    ``row`` is an iterable of five cell strings (first name, last name,
    parliamentary group, election year, area); ``url`` is the source page.
    Returns the 8-tuple inserted into the ``elected`` table:
    (name, given_name, family_name, sort_name, group, election_year,
    area, source).
    """
    # Strip surrounding whitespace from every cell before unpacking.
    first, last, group, election_year, area = [cell.strip() for cell in row]
    full_name = ' '.join((first, last))
    sort_name = ', '.join((last, first))
    return (full_name, first, last, sort_name, group, election_year, area, url)


def main():
session = start_session('Secimler.aspx')

with sqlite3.connect('data.sqlite') as c:
c.execute('''\
CREATE TABLE IF NOT EXISTS elected
(name, given_name, family_name, 'group', election_year, area,
(name, given_name, family_name, sort_name,
'group', election_year, area, source,
UNIQUE (name, 'group', election_year, area))''')
while True:
c.executemany('''\
INSERT OR REPLACE INTO elected VALUES (?, ?, ?, ?, ?)''',
INSERT OR REPLACE INTO elected VALUES (?, ?, ?, ?, ?, ?, ?, ?)''',
it.chain.from_iterable(parse_pages(session)))
year = session.at_xpath('//select[@name="ctl00$ContentPlaceHolder1$DropDownList1"]'
'/option[@selected="selected"]/following-sibling::option')
Expand Down

0 comments on commit 70d72f4

Please sign in to comment.