Skip to content

Commit

Permalink
Scrape current MPs separately
Browse files Browse the repository at this point in the history
  • Loading branch information
wfdd committed Jun 6, 2016
1 parent 84579ce commit 4b8ad41
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 5 deletions.
Binary file modified data.sqlite
Binary file not shown.
45 changes: 45 additions & 0 deletions scrape_current.py
@@ -0,0 +1,45 @@

import itertools as it
import sqlite3

from scrape_elected import decap_name, start_session, title_match


def tidy_up_row(row):
    """Turn the cell texts of one table row into a record for the database.

    ``row`` is an iterable of strings.  Every item is stripped of
    surrounding whitespace; the first four are read as area, first name,
    last name and party, and any further cells are ignored.  Whatever
    ``title_match`` matches is removed from the first name, and both names
    are passed through ``decap_name``.

    Returns a tuple ``(first_name, last_name, party, term, area)`` where
    the term is always the string ``'2013'``.
    """
    cells = [cell.strip() for cell in row]
    area, first, last, party = cells[:4]
    untitled_first = title_match.sub('', first)
    return (decap_name(untitled_first), decap_name(last),
            party, '2013', area)


def parse_table(doc):
    """Return a generator of tidied records, one per data row in ``doc``.

    ``doc`` must offer an ``xpath`` method.  The data rows of the grid's
    main table are selected, and the text content of each row's ``td``
    cells is handed to ``tidy_up_row``.
    """
    # The row lookup happens right here; only the per-row tidying is
    # deferred until the returned generator is iterated.
    data_rows = doc.xpath(
        '//table[@id="ctl00_ContentPlaceHolder1_ASPxPageControl1_ASPxGridView3_DXMainTable"]'
        '//tr[@class="dxgvDataRow"]')
    return (tidy_up_row(cell.text_content() for cell in tr.xpath('./td'))
            for tr in data_rows)


def parse_pages(session):
    """Yield the parsed table of every page of the grid, one page at a time.

    For the page currently shown, the result of ``parse_table`` on
    ``session.document()`` is yielded.  The pager is then searched for a
    page-number cell following the current one; if there is none the
    generator finishes, otherwise that cell is clicked and the function
    waits for the loading element to go away before parsing again.

    ``session`` is the browser session returned by ``start_session``.
    """
    import time  # local import: only the polling loop below needs it

    while True:
        yield parse_table(session.document())
        next_page = session.at_xpath(
            '//table[@id="ctl00_ContentPlaceHolder1_ASPxPageControl1_ASPxGridView3_DXPagerBottom"]'
            '//td[@class="dxpPageNumber dxpCurrentPageNumber"]'
            '/following-sibling::td[@class="dxpPageNumber"]')
        if not next_page:
            # No page number after the current one: this was the last page.
            break
        next_page.click()
        # Wait for the table to be updated.  Sleep briefly between polls
        # instead of spinning, so the browser backend is not flooded with
        # queries while the page reloads.
        # NOTE(review): this id names ASPxGridView1 while the table and pager
        # above are ASPxGridView3 -- confirm this is the right loading
        # element for this page.
        while session.at_css('#ctl00_ContentPlaceHolder1_ASPxGridView1_LPV'):
            time.sleep(0.1)


def main():
    """Scrape the current MPs listing and store the rows in ``data.sqlite``.

    Opens a browser session on ``Milletvekillerimiz1.aspx``, makes sure the
    ``data`` table exists and inserts (or replaces) one row per record
    produced by ``parse_pages``.
    """
    from contextlib import closing

    session = start_session('Milletvekillerimiz1.aspx')
    # A sqlite3 connection used as a context manager only commits or rolls
    # back the transaction; it does not close the connection.  ``closing``
    # makes sure the database handle is released as well.
    with closing(sqlite3.connect('data.sqlite')) as conn:
        with conn:
            conn.execute('''\
CREATE TABLE IF NOT EXISTS data
(first_name, last_name, party, term, area,
UNIQUE (first_name, last_name, party, term, area))''')
            # Values are inserted positionally, in the order tidy_up_row
            # returns them.
            conn.executemany(
                'INSERT OR REPLACE INTO data VALUES (?, ?, ?, ?, ?)',
                it.chain.from_iterable(parse_pages(session)))


if __name__ == '__main__':
    main()
14 changes: 9 additions & 5 deletions scrape.py → scrape_elected.py
Expand Up @@ -10,6 +10,12 @@
title_match = re.compile(r'D[Rr]\.\s*')
decap_name = icu.Transliterator.createInstance('tr-title').transliterate

def start_session(page):
    """Open a dryscrape session on the site and load ``page``.

    ``page`` is passed to ``visit`` on a session whose base URL is
    ``http://www.cm.gov.nc.tr/``.  Automatic image loading is switched off
    before the visit.  Returns the session.
    """
    browser = dryscrape.Session(base_url='http://www.cm.gov.nc.tr/')
    # Turn image loading off before the page is requested.
    browser.set_attribute('auto_load_images', False)
    browser.visit(page)
    return browser


def tidy_up_row(row):
first, last, *etc = (i.strip() for i in row)
Expand Down Expand Up @@ -41,17 +47,15 @@ def parse_pages(session):


def main():
session = dryscrape.Session(base_url='http://www.cm.gov.nc.tr/')
session.set_attribute('auto_load_images', False)
session.visit('Secimler.aspx')
session = start_session('Secimler.aspx')

with sqlite3.connect('data.sqlite') as c:
c.execute('''\
CREATE TABLE IF NOT EXISTS data
CREATE TABLE IF NOT EXISTS elected
(first_name, last_name, party, election_year, area,
UNIQUE (first_name, last_name, party, election_year, area))''')
while True:
c.executemany('INSERT OR REPLACE INTO data VALUES (?, ?, ?, ?, ?)',
c.executemany('INSERT OR REPLACE INTO elected VALUES (?, ?, ?, ?, ?)',
it.chain.from_iterable(parse_pages(session)))
year = session.at_xpath('//select[@name="ctl00$ContentPlaceHolder1$DropDownList1"]'
'/option[@selected="selected"]/following-sibling::option')
Expand Down

0 comments on commit 4b8ad41

Please sign in to comment.