First commit
wfdd committed Jul 7, 2016
0 parents commit 5c26435
Showing 5 changed files with 95 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
__pycache__
data.sqlite*
1 change: 1 addition & 0 deletions README.md
@@ -0,0 +1 @@
A [morph.io](https://morph.io) scraper for the members of the Colombian Senate.
3 changes: 3 additions & 0 deletions requirements.txt
@@ -0,0 +1,3 @@
aiohttp==0.21.6
chardet==2.3.0
lxml==3.6.0
1 change: 1 addition & 0 deletions runtime.txt
@@ -0,0 +1 @@
python-3.5.1
88 changes: 88 additions & 0 deletions scraper.py
@@ -0,0 +1,88 @@

import asyncio
import re
import sqlite3
from urllib.parse import urljoin
import sys

import aiohttp
from lxml.html import (document_fromstring as parse_html,
                       tostring as unparse_html)

base_url = 'http://www.secretariasenado.gov.co/'

numeric_escape_match = re.compile(r'&#(\d+);')
email_match = re.compile(r'\n var addy\d+ = (.*);\n addy\d+ = addy\d+ \+ (.*);')


async def scrape_person(session, semaphore, params):
    def extract_photo():
        try:
            photo, = source.xpath('.//img[1]/@src')
        except ValueError:
            return
        return urljoin(base_url, photo)

    def deobfuscate_email():
        email = unparse_html(source).decode()
        try:
            email = next(email_match.finditer(email)).groups()
        except StopIteration:
            return print("Couldn't find email in " + repr(resp.url),
                         file=sys.stderr)
        email = ''.join(eval(m) for m in email)
        email = numeric_escape_match.sub(lambda m: chr(int(m.group(1))), email)
        return email

    def extract_other_item(caption):
        val = ''.join(source.xpath(('.//td[contains(string(.), "{}")]'
                                    '/following-sibling::td/text()'
                                    ).format(caption))).strip()
        if caption == 'TWITTER:':
            val = val.lstrip('@')
        if not val or val.lower() == 'no tiene':
            return
        return val

    async with semaphore, session.get(base_url, params=params) as resp:
        source, = (parse_html(await resp.text())
                   .xpath('//div[@class = "art-article"]'))
        return (source.text_content().strip().splitlines()[0].strip(),
                extract_photo(),
                extract_other_item('FILIACIÓN POLÍTICA:'),
                deobfuscate_email(),
                *map(extract_other_item,
                     ('TELÉFONO:', 'PÁGINA WEB:', 'FACEBOOK:', 'TWITTER:',
                      'LUGAR DE NACIMIENTO:')),
                resp.url)


async def gather_people(session, semaphore):
    async with session.get(base_url + 'index.php/buscar-senador') as resp:
        source = parse_html(await resp.text())
    base_params = dict((*i.xpath('./@name'), *i.xpath('./@value'))
                       for i in source.xpath('//form[@name = "ddaForm"]'
                                             '/input[@type = "hidden"]'))
    people_ids = source.xpath('//form[@name = "ddaForm"]/select[@name = "id"]'
                              '/option[position() > 1]/@value')
    people = await asyncio.gather(*(scrape_person(session, semaphore,
                                                  {**base_params, 'id': i})
                                    for i in people_ids))
    return people


def main():
    loop = asyncio.get_event_loop()
    with aiohttp.ClientSession(loop=loop) as session:
        people = loop.run_until_complete(gather_people(session,
                                                       asyncio.Semaphore(10)))
    with sqlite3.connect('data.sqlite') as cursor:
        cursor.execute('''\
CREATE TABLE IF NOT EXISTS data
(name, image, 'group', email, phone, website, facebook, twitter, place_of_birth,
 source, UNIQUE (source))''')
        cursor.executemany('''\
INSERT OR REPLACE INTO data VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', people)

if __name__ == '__main__':
main()
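
For reference, the deobfuscate_email helper above unpicks Joomla's email-cloaking script, which assembles each address out of JavaScript string concatenations laced with numeric HTML escapes; the two regexes at the top of scraper.py capture those concatenations and decode the escapes. Below is a minimal, self-contained sketch of the same decoding steps run against a made-up cloaked snippet (the markup and the address are hypothetical, not taken from the site):

import re

numeric_escape_match = re.compile(r'&#(\d+);')
email_match = re.compile(r'\n var addy\d+ = (.*);\n addy\d+ = addy\d+ \+ (.*);')

# Hypothetical excerpt of a Joomla-cloaked mailto link.
cloaked = ("<script>\n"
           " var addy97330 = 'juan' + '&#46;' + 'perez';\n"
           " addy97330 = addy97330 + '&#64;' + 'senado' + '&#46;' + 'gov' + '&#46;' + 'co';\n"
           "</script>")

# Each capture group holds a JavaScript string concatenation such as
# "'juan' + '&#46;' + 'perez'"; evaluating it as a Python expression joins
# the pieces, and the numeric HTML escapes are then turned into characters.
parts = next(email_match.finditer(cloaked)).groups()
email = ''.join(eval(part) for part in parts)
email = numeric_escape_match.sub(lambda m: chr(int(m.group(1))), email)
print(email)  # juan.perez@senado.gov.co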
