Remove first and last names and scrape full names from bio docs
The names in the table are arbitrarily accented.
wfdd committed Jun 7, 2016
1 parent ca200d1 commit 3009a82
Showing 3 changed files with 36 additions and 31 deletions.
Binary file modified data.sqlite
52 changes: 35 additions & 17 deletions scrape_current.py
@@ -2,38 +2,48 @@
 import itertools as it
 import re
 import sqlite3
+import subprocess
+import urllib.request
 
 import dryscrape
 import icu
 
-from scrape_elected import decap_name, start_session, title_match
 
 nonword_match = re.compile(r'[^\w\s-]')
+title_match = re.compile(r'(?:D[RrTt]|Prof)\.\s*')
 whitespace_match = re.compile(r'[\s-]+')
 
-tr2ascii = icu.Transliterator.createInstance('tr-ASCII; lower')\
-    .transliterate
+decap_name = icu.Transliterator.createInstance('tr-title').transliterate
+tr2lcascii = icu.Transliterator.createInstance('tr-ASCII; lower').transliterate
 
 
 def create_id(s):
-    return whitespace_match.sub('-', nonword_match.sub('', tr2ascii(s)))
+    return whitespace_match.sub('-', nonword_match.sub('', tr2lcascii(s)))
 
 
+def extract_name(url):
+    with urllib.request.urlopen(url) as file:
+        doc = file.read()
+    text = subprocess.run(('antiword', '-w 0', '-'),
+                          input=doc, stdout=subprocess.PIPE).stdout.decode()
+    name = text.replace('[pic]', '').strip().partition('\n')[0]
+    name = decap_name(title_match.sub('', ' '.join(name.split())))
+    return name
+
+
 def tidy_up_row(row, url):
-    area, first, last, party, *_ = (i.strip() for i in row)
-    first, last = decap_name(title_match.sub('', first)), decap_name(last)
-    return (create_id(' '.join((first, last))),
-            first + ' ' + last,
-            first,
-            last,
+    area, _, _, party, _ = (i.text_content().strip() for i in row)
+    name = extract_name(row[-1].xpath('.//a/@href')[0])
+    return (create_id(name),
+            name,
             party,
             '2013–',
             '8',
             area,
             None,
             url)
 
 
 def parse_table(doc, url):
-    return (tidy_up_row((i.text_content() for i in v.xpath('./td')), url)
+    return (tidy_up_row(v.xpath('./td'), url)
             for v in doc.xpath('//table[@id="ctl00_ContentPlaceHolder1_ASPxPageControl1_ASPxGridView3_DXMainTable"]'
                                '//tr[@class="dxgvDataRow"]'))

@@ -47,19 +57,27 @@ def parse_pages(session):
         if not page:
             break
         page.click()
-        while session.at_css('#ctl00_ContentPlaceHolder1_ASPxGridView1_LPV'):
+        while session.at_css('#ctl00_ContentPlaceHolder1_ASPxPageControl1_'
+                             'ASPxGridView3_LPV'):
             # Wait for the table to be updated
             ...
 
 
+def start_session(page):
+    session = dryscrape.Session(base_url='http://www.cm.gov.nc.tr/')
+    session.set_attribute('auto_load_images', False)
+    session.visit(page)
+    return session
+
+
 def main():
     session = start_session('Milletvekillerimiz1.aspx')
     with sqlite3.connect('data.sqlite') as c:
         c.execute('''\
 CREATE TABLE IF NOT EXISTS data
-(id, name, given_name, family_name, 'group', term, area, image, source,
- UNIQUE (id, name, given_name, family_name, 'group', term, area, image, source))''')
-        c.executemany('INSERT OR REPLACE INTO data VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)',
+(id, name, 'group', term, area, image, source,
+ UNIQUE (id, name, 'group', term, area, image, source))''')
+        c.executemany('INSERT OR REPLACE INTO data VALUES (?, ?, ?, ?, ?, ?, ?)',
                       it.chain.from_iterable(parse_pages(session)))
 
 if __name__ == '__main__':
15 changes: 1 addition & 14 deletions scrape_elected.py
@@ -1,21 +1,8 @@
-
 import itertools as it
-import re
 import sqlite3
 
 import dryscrape
-import icu
-
-
-title_match = re.compile(r'D[RrTt]\.\s*')
-decap_name = icu.Transliterator.createInstance('tr-title').transliterate
-
-
-def start_session(page):
-    session = dryscrape.Session(base_url='http://www.cm.gov.nc.tr/')
-    session.set_attribute('auto_load_images', False)
-    session.visit(page)
-    return session
+from scrape_current import decap_name, start_session, title_match
 
 
 def tidy_up_row(row):

4 comments on commit 3009a82

@tmtmtmtm
This has broken our import of these into EP. We don't necessarily need individual first and last names, but having a sort_name column would be useful…

@wfdd (Owner, Author) commented on 3009a82 Jun 7, 2016

@tmtmtmtm, I've put them back in @9de4f5c. As a bonus, they're no longer misspelled...

@tmtmtmtm

Excellent, thanks! Updated.

@wfdd (Owner, Author) commented on 3009a82 Jun 7, 2016

@tmtmtmtm, I've also scraped the birth dates now if you wanna add those too.
