Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Update IL scraper for older sessions #105

Merged
merged 4 commits over 2 years ago

2 participants

Joe Germuska James Turk
Joe Germuska

I see that the ILGA has made some changes to its URL structure since I last worked on this code. I poked around and worked out how to get URLs for the four previous sessions of the assembly. I haven't exhaustively tested things, but I did run the entire bill scraper and, several days later, there were no fatal errors, and the JSON I've looked at seems sound.

It does seem that there are six kinds of documents that are not currently being passed to `bill.add_document`. I'll see if I can find time to check those out and process them, but I figured I'd offer what works here rather than defer that indefinitely...

James Turk (jamesturk) merged commit 40728ce on November 07, 2011
James Turk (jamesturk) closed this on November 07, 2011
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
This page is out of date. Refresh to see the latest.
18  openstates/il/__init__.py
@@ -13,9 +13,25 @@
13 13
     'terms': [
14 14
         {'name': '97th', 'sessions': ['97th'],
15 15
          'start_year': 2011, 'end_year': 2012},
  16
+        {'name': '96th', 'sessions': ['96th'],
  17
+         'start_year': 2009, 'end_year': 2010},
  18
+        {'name': '95th', 'sessions': ['95th', 'Special_95th'],
  19
+         'start_year': 2007, 'end_year': 2008},
  20
+        {'name': '94th', 'sessions': ['94th'],
  21
+         'start_year': 2005, 'end_year': 2006},
  22
+        {'name': '93rd', 'sessions': ['93rd'],
  23
+         'start_year': 2003, 'end_year': 2004},
16 24
     ],
17 25
     'feature_flags': [],
18 26
     'session_details': {
19  
-        '97th': {'display_name': '97th Regular Session', 'session_id': '84'},
  27
+        '97th': {'display_name': '97th Regular Session', 'params': { 'GA': '97', 'SessionId': '84' }},
  28
+        '96th': {'display_name': '96th Regular Session', 'params': { 'GA': '96', 'SessionId': '76' }},
  29
+        'Special_96th': {'display_name': '96th Special Session', 'params': { 'GA': '96', 'SessionId': '82', 'SpecSess': '1' }},
  30
+        '95th': {'display_name': '95th Regular Session', 'params': { 'GA': '95', 'SessionId': '51' }},
  31
+        'Special_95th': {'display_name': '95th Special Session', 'params': { 'GA': '95', 'SessionId': '52', 'SpecSess': '1' }},
  32
+        '94th': {'display_name': '94th Regular Session', 'params': { 'GA': '94', 'SessionId': '50' }},
  33
+        '93rd': {'display_name': '93rd Regular Session', 'params': { 'GA': '93', 'SessionId': '3' }},
  34
+        'Special_93rd': {'display_name': '93rd Special Session', 'params': { 'GA': '93', 'SessionID': '14', 'SpecSess': '1' }},
20 35
     }
21 36
 }
  37
+
46  openstates/il/bills.py
@@ -3,6 +3,7 @@
3 3
 import os
4 4
 import datetime
5 5
 import lxml.html
  6
+from urllib import urlencode
6 7
 
7 8
 from billy.scrape.bills import BillScraper, Bill
8 9
 from billy.scrape.votes import Vote
@@ -16,10 +17,6 @@ def group(lst, n):
16 17
             yield tuple(val)
17 18
 
18 19
 
19  
-# chamber prefix, doc id, session_id
20  
-LEGISLATION_URL = ('http://ilga.gov/legislation/grplist.asp?num1=1&num2=10000&'
21  
-                   'DocTypeID=%s%s&SessionID=%s')
22  
-
23 20
 TITLE_REMOVING_PATTERN = re.compile(".*(Rep|Sen). (.+)$")
24 21
 SPONSOR_PATTERN = re.compile("^(Added |Removed )?(.+Sponsor) (Rep|Sen). (.+)$")
25 22
 
@@ -61,29 +58,38 @@ def _categorize_action(action):
61 58
             return atype
62 59
     return 'other'
63 60
 
  61
+LEGISLATION_URL = ('http://ilga.gov/legislation/grplist.asp')
64 62
 
65  
-class ILBillScraper(BillScraper):
  63
+def build_url_for_legislation_list(metadata, chamber, session, doc_type):
  64
+    base_params = metadata['session_details'][session].get('params',{})
  65
+    base_params['num1'] = '1'
  66
+    base_params['num2'] = '10000'
  67
+    params = dict(base_params)
  68
+    params['DocTypeID'] = '%s%s' % (chamber_slug(chamber),doc_type)
  69
+    return '?'.join([LEGISLATION_URL,urlencode(params)])
66 70
 
67  
-    state = 'il'
  71
+def chamber_slug(chamber):
  72
+    if chamber == 'lower':
  73
+        return 'H'
  74
+    return 'S'
68 75
 
  76
+class ILBillScraper(BillScraper):
69 77
 
  78
+    state = 'il'
70 79
 
  80
+    def get_bill_urls(self, chamber, session, doc_type):
  81
+        url = build_url_for_legislation_list(self.metadata, chamber, session, doc_type)
  82
+        html = self.urlopen(url)
  83
+        doc = lxml.html.fromstring(html)
  84
+        doc.make_links_absolute(url)
  85
+        for bill_url in doc.xpath('//li/a/@href'):
  86
+            yield bill_url
  87
+    
71 88
     def scrape(self, chamber, session):
72  
-        session_id = self.metadata['session_details'][session]['session_id']
73  
-        chamber_slug = 'H' if chamber == 'lower' else 'S'
74  
-
75  
-
76 89
         for doc_type in DOC_TYPES:
77  
-            url = LEGISLATION_URL % (chamber_slug, doc_type, session_id)
78  
-            html = self.urlopen(url)
79  
-            doc = lxml.html.fromstring(html)
80  
-            doc.make_links_absolute(url)
81  
-
82  
-            for bill_url in doc.xpath('//li/a/@href'):
83  
-                self.scrape_bill(chamber, session, chamber_slug+doc_type,
84  
-                                 bill_url)
85  
-
86  
-
  90
+            for bill_url in self.get_bill_urls(chamber, session, doc_type):
  91
+                self.scrape_bill(chamber, session, chamber_slug(chamber)+doc_type, bill_url)
  92
+    
87 93
     def scrape_bill(self, chamber, session, doc_type, url):
88 94
         html = self.urlopen(url)
89 95
         doc = lxml.html.fromstring(html)
33  openstates/il/tests/test_bill_metadata.py
... ...
@@ -0,0 +1,33 @@
  1
+#!/usr/bin/env python
  2
+
  3
+import unittest
  4
+from openstates.il import metadata
  5
+from openstates.il.bills import DOC_TYPES, ILBillScraper
  6
+import logging
  7
+
  8
+log = logging.getLogger('openstates.il.tests.test_bill_metadata')
  9
+
  10
+class TestBillMetadata(unittest.TestCase):
  11
+    """Run a basic sanity check to ensure that something would get scraped for each session in the metadata"""
  12
+    
  13
+    def setUp(self):
  14
+        self.scraper = ILBillScraper(metadata)
  15
+
  16
+    def test_lists(self):
  17
+        chambers = ['H','S']
  18
+        sessions = []
  19
+        for term in metadata['terms']:
  20
+            sessions.extend(term['sessions'])
  21
+        self.assertTrue(len(sessions) > 0, "Expected non-zero list of sessions")
  22
+
  23
+        for session in sessions:
  24
+            for chamber in chambers:
  25
+                session_chamber_count = 0
  26
+                for doc_type in DOC_TYPES:
  27
+                    count = len(list(self.scraper.get_bill_urls(chamber, session, doc_type)))
  28
+                    log.info("Session: %s Chamber: %s Doc Type: %s Count: %i" % (session, chamber, doc_type, count))
  29
+                    session_chamber_count += count
  30
+                self.assertTrue(session_chamber_count > 0, "Expected non-zero bill count for Session %s, Chamber %s" % (session, chamber))
  31
+if __name__ == '__main__':
  32
+    unittest.main()
  33
+
Commit_comment_tip

Tip: You can add notes to lines in a file. Hover to the left of a line to make a note

Something went wrong with that request. Please try again.