Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Loading…

Update IL scraper for older sessions #105

Merged
merged 4 commits into from

2 participants

@JoeGermuska

I see that the ILGA has made some changes to its URL structure since I last worked on this code. I poked around and worked out how to get URLs for the four previous sessions of the assembly. I haven't exhaustively tested things, but I did run the entire bill scraper and, several days later, there were no fatal errors and the JSON I've looked at seems sound.

It does seem that there are six kinds of documents that are not currently being passed to bill.add_document. I'll see if I can find time to check those out and process them, but I figured I'd offer what works here rather than defer that indefinitely.

@jamesturk jamesturk merged commit 40728ce into from
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
This page is out of date. Refresh to see the latest.
View
18 openstates/il/__init__.py
@@ -13,9 +13,25 @@
'terms': [
{'name': '97th', 'sessions': ['97th'],
'start_year': 2011, 'end_year': 2012},
+ {'name': '96th', 'sessions': ['96th'],
+ 'start_year': 2009, 'end_year': 2010},
+ {'name': '95th', 'sessions': ['95th', 'Special_95th'],
+ 'start_year': 2007, 'end_year': 2008},
+ {'name': '94th', 'sessions': ['94th'],
+ 'start_year': 2005, 'end_year': 2006},
+ {'name': '93rd', 'sessions': ['93rd'],
+ 'start_year': 2003, 'end_year': 2004},
],
'feature_flags': [],
'session_details': {
- '97th': {'display_name': '97th Regular Session', 'session_id': '84'},
+ '97th': {'display_name': '97th Regular Session', 'params': { 'GA': '97', 'SessionId': '84' }},
+ '96th': {'display_name': '96th Regular Session', 'params': { 'GA': '96', 'SessionId': '76' }},
+ 'Special_96th': {'display_name': '96th Special Session', 'params': { 'GA': '96', 'SessionId': '82', 'SpecSess': '1' }},
+ '95th': {'display_name': '95th Regular Session', 'params': { 'GA': '95', 'SessionId': '51' }},
+ 'Special_95th': {'display_name': '95th Special Session', 'params': { 'GA': '95', 'SessionId': '52', 'SpecSess': '1' }},
+ '94th': {'display_name': '94th Regular Session', 'params': { 'GA': '94', 'SessionId': '50' }},
+ '93rd': {'display_name': '93rd Regular Session', 'params': { 'GA': '93', 'SessionId': '3' }},
+ 'Special_93rd': {'display_name': '93rd Special Session', 'params': { 'GA': '93', 'SessionID': '14', 'SpecSess': '1' }},
}
}
+
View
46 openstates/il/bills.py
@@ -3,6 +3,7 @@
import os
import datetime
import lxml.html
+from urllib import urlencode
from billy.scrape.bills import BillScraper, Bill
from billy.scrape.votes import Vote
@@ -16,10 +17,6 @@ def group(lst, n):
yield tuple(val)
-# chamber prefix, doc id, session_id
-LEGISLATION_URL = ('http://ilga.gov/legislation/grplist.asp?num1=1&num2=10000&'
- 'DocTypeID=%s%s&SessionID=%s')
-
TITLE_REMOVING_PATTERN = re.compile(".*(Rep|Sen). (.+)$")
SPONSOR_PATTERN = re.compile("^(Added |Removed )?(.+Sponsor) (Rep|Sen). (.+)$")
@@ -61,29 +58,38 @@ def _categorize_action(action):
return atype
return 'other'
+LEGISLATION_URL = ('http://ilga.gov/legislation/grplist.asp')
-class ILBillScraper(BillScraper):
+def build_url_for_legislation_list(metadata, chamber, session, doc_type):
+ base_params = metadata['session_details'][session].get('params',{})
+ base_params['num1'] = '1'
+ base_params['num2'] = '10000'
+ params = dict(base_params)
+ params['DocTypeID'] = '%s%s' % (chamber_slug(chamber),doc_type)
+ return '?'.join([LEGISLATION_URL,urlencode(params)])
- state = 'il'
+def chamber_slug(chamber):
+ if chamber == 'lower':
+ return 'H'
+ return 'S'
+class ILBillScraper(BillScraper):
+ state = 'il'
+ def get_bill_urls(self, chamber, session, doc_type):
+ url = build_url_for_legislation_list(self.metadata, chamber, session, doc_type)
+ html = self.urlopen(url)
+ doc = lxml.html.fromstring(html)
+ doc.make_links_absolute(url)
+ for bill_url in doc.xpath('//li/a/@href'):
+ yield bill_url
+
def scrape(self, chamber, session):
- session_id = self.metadata['session_details'][session]['session_id']
- chamber_slug = 'H' if chamber == 'lower' else 'S'
-
-
for doc_type in DOC_TYPES:
- url = LEGISLATION_URL % (chamber_slug, doc_type, session_id)
- html = self.urlopen(url)
- doc = lxml.html.fromstring(html)
- doc.make_links_absolute(url)
-
- for bill_url in doc.xpath('//li/a/@href'):
- self.scrape_bill(chamber, session, chamber_slug+doc_type,
- bill_url)
-
-
+ for bill_url in self.get_bill_urls(chamber, session, doc_type):
+ self.scrape_bill(chamber, session, chamber_slug(chamber)+doc_type, bill_url)
+
def scrape_bill(self, chamber, session, doc_type, url):
html = self.urlopen(url)
doc = lxml.html.fromstring(html)
View
33 openstates/il/tests/test_bill_metadata.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+
+import unittest
+from openstates.il import metadata
+from openstates.il.bills import DOC_TYPES, ILBillScraper
+import logging
+
+log = logging.getLogger('openstates.il.tests.test_bill_metadata')
+
+class TestBillMetadata(unittest.TestCase):
+ """Run a basic sanity check to ensure that something would get scraped for each session in the metadata"""
+
+ def setUp(self):
+ self.scraper = ILBillScraper(metadata)
+
+ def test_lists(self):
+ chambers = ['H','S']
+ sessions = []
+ for term in metadata['terms']:
+ sessions.extend(term['sessions'])
+ self.assertTrue(len(sessions) > 0, "Expected non-zero list of sessions")
+
+ for session in sessions:
+ for chamber in chambers:
+ session_chamber_count = 0
+ for doc_type in DOC_TYPES:
+ count = len(list(self.scraper.get_bill_urls(chamber, session, doc_type)))
+ log.info("Session: %s Chamber: %s Doc Type: %s Count: %i" % (session, chamber, doc_type, count))
+ session_chamber_count += count
+ self.assertTrue(session_chamber_count > 0, "Expected non-zero bill count for Session %s, Chamber %s" % (session, chamber))
+if __name__ == '__main__':
+ unittest.main()
+
Something went wrong with that request. Please try again.