First commit
wfdd committed Jul 7, 2016
0 parents commit 5c26435
Showing 5 changed files with 95 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
__pycache__
data.sqlite*
1 change: 1 addition & 0 deletions README.md
@@ -0,0 +1 @@
A [morph.io](https://morph.io) scraper for the members of the Colombian Senate.
3 changes: 3 additions & 0 deletions requirements.txt
@@ -0,0 +1,3 @@
aiohttp==0.21.6
chardet==2.3.0
lxml==3.6.0
1 change: 1 addition & 0 deletions runtime.txt
@@ -0,0 +1 @@
python-3.5.1
88 changes: 88 additions & 0 deletions scraper.py
@@ -0,0 +1,88 @@

import asyncio
import re
import sqlite3
from urllib.parse import urljoin
import sys

import aiohttp
from lxml.html import (document_fromstring as parse_html,
                       tostring as unparse_html)

base_url = 'http://www.secretariasenado.gov.co/'

numeric_escape_match = re.compile(r'&#(\d+);')
email_match = re.compile(r'\n var addy\d+ = (.*);\n addy\d+ = addy\d+ \+ (.*);')


async def scrape_person(session, semaphore, params):
    def extract_photo():
        try:
            photo, = source.xpath('.//img[1]/@src')
        except ValueError:
            return
        return urljoin(base_url, photo)

    def deobfuscate_email():
        email = unparse_html(source).decode()
        try:
            email = next(email_match.finditer(email)).groups()
        except StopIteration:
            return print("Couldn't find email in " + repr(resp.url),
                         file=sys.stderr)
        email = ''.join(eval(m) for m in email)
        email = numeric_escape_match.sub(lambda m: chr(int(m.group(1))), email)
        return email

    def extract_other_item(caption):
        val = ''.join(source.xpath(('.//td[contains(string(.), "{}")]'
                                    '/following-sibling::td/text()'
                                    ).format(caption))).strip()
        if caption == 'TWITTER:':
            val = val.lstrip('@')
        if not val or val.lower() == 'no tiene':
            return
        return val

    async with semaphore, session.get(base_url, params=params) as resp:
        source, = (parse_html(await resp.text())
                   .xpath('//div[@class = "art-article"]'))
        return (source.text_content().strip().splitlines()[0].strip(),
                extract_photo(),
                extract_other_item('FILIACIÓN POLÍTICA:'),
                deobfuscate_email(),
                *map(extract_other_item,
                     ('TELÉFONO:', 'PÁGINA WEB:', 'FACEBOOK:', 'TWITTER:',
                      'LUGAR DE NACIMIENTO:')),
                resp.url)


async def gather_people(session, semaphore):
    async with session.get(base_url + 'index.php/buscar-senador') as resp:
        source = parse_html(await resp.text())
    base_params = dict((*i.xpath('./@name'), *i.xpath('./@value'))
                       for i in source.xpath('//form[@name = "ddaForm"]'
                                             '/input[@type = "hidden"]'))
    people_ids = source.xpath('//form[@name = "ddaForm"]/select[@name = "id"]'
                              '/option[position() > 1]/@value')
    people = await asyncio.gather(*(scrape_person(session, semaphore,
                                                  {**base_params, 'id': i})
                                    for i in people_ids))
    return people


def main():
    loop = asyncio.get_event_loop()
    with aiohttp.ClientSession(loop=loop) as session:
        people = loop.run_until_complete(gather_people(session,
                                                       asyncio.Semaphore(10)))
    with sqlite3.connect('data.sqlite') as cursor:
        cursor.execute('''\
CREATE TABLE IF NOT EXISTS data
(name, image, 'group', email, phone, website, facebook, twitter, place_of_birth,
 source, UNIQUE (source))''')
        cursor.executemany('''\
INSERT OR REPLACE INTO data VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', people)

if __name__ == '__main__':
main()
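
For reference, the deobfuscate_email helper above unpicks Joomla's email-cloaking script, which assembles each address out of JavaScript string concatenations laced with numeric HTML escapes; the two regexes at the top of scraper.py capture those concatenations and decode the escapes. Below is a minimal, self-contained sketch of the same decoding steps run against a made-up cloaked snippet (the markup and the address are hypothetical, not taken from the site):

import re

numeric_escape_match = re.compile(r'&#(\d+);')
email_match = re.compile(r'\n var addy\d+ = (.*);\n addy\d+ = addy\d+ \+ (.*);')

# Hypothetical excerpt of a Joomla-cloaked mailto link.
cloaked = ("<script>\n"
           " var addy97330 = 'juan' + '&#46;' + 'perez';\n"
           " addy97330 = addy97330 + '&#64;' + 'senado' + '&#46;' + 'gov' + '&#46;' + 'co';\n"
           "</script>")

# Each capture group holds a JavaScript string concatenation such as
# "'juan' + '&#46;' + 'perez'"; evaluating it as a Python expression joins
# the pieces, and the numeric HTML escapes are then turned into characters.
parts = next(email_match.finditer(cloaked)).groups()
email = ''.join(eval(part) for part in parts)
email = numeric_escape_match.sub(lambda m: chr(int(m.group(1))), email)
print(email)  # juan.perez@senado.gov.co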
