Skip to content

Commit

Permalink
NY: improve action categorization
Browse files Browse the repository at this point in the history
  • Loading branch information
twneale committed Sep 19, 2012
1 parent 0fbb4b0 commit 60675f2
Show file tree
Hide file tree
Showing 2 changed files with 207 additions and 30 deletions.
200 changes: 200 additions & 0 deletions openstates/ny/actions.py
@@ -0,0 +1,200 @@
'''
NY needs an @after_categorize function to expand committee names
and help the importer figure out which committees are being mentioned.
'''
import re
from functools import partial
from collections import namedtuple, defaultdict
from types import MethodType


class Rule(namedtuple('Rule', 'regexes types stop attrs')):
    '''If any of ``regexes`` matches the action text, the resulting
    action's types should include ``types``.

    If ``stop`` is true, no other rules should be tested after this one;
    in other words, this rule conclusively determines the action's
    types and attrs.

    The resulting action should contain ``attrs``, which basically
    enables overwriting certain attributes, like the chamber if
    the action was listed in the wrong column.

    Fields:
        regexes: set of regex pattern strings.
        types: set of category strings (empty when none were given).
        stop: the ``stop`` flag exactly as passed in.
        attrs: dict of any extra keyword arguments.
    '''
    def __new__(_cls, regexes, types=None, stop=False, **kwargs):
        'Create new instance of Rule(regexes, types, stop, attrs)'

        # ``basestring`` only exists on Python 2; fall back to the
        # Python 3 string types so a lone string is never split into
        # a set of its characters.
        try:
            string_types = basestring
        except NameError:
            string_types = (str, bytes)

        # Regexes can be a string or a sequence.
        if isinstance(regexes, string_types):
            regexes = [regexes]

        # Types can be a string or a sequence.
        if isinstance(types, string_types):
            types = [types]

        fields = (set(regexes or []), set(types or []), stop, kwargs)
        return tuple.__new__(_cls, fields)


class BaseCategorizer(object):
    '''A class that exposes a main categorizer function
    and before and after hooks, in case a state requires specific
    steps that make use of action or category info. The return
    value is a 2-tuple of category types and a dictionary of
    attributes to overwrite on the target action object.

    Subclasses set ``rules`` to an iterable of rule objects exposing
    ``regexes``, ``types``, ``stop`` and ``attrs``. Bound methods that
    carry a truthy ``before`` or ``after`` attribute are collected at
    construction time and run as extra hooks around categorization.
    '''
    rules = []

    def __init__(self):
        '''Collect the hook methods flagged with ``before``/``after``.

        Methods are discovered through ``dir(self)``, so hooks run in
        the order ``dir`` lists their names.
        '''
        before_funcs = []
        after_funcs = []
        for name in dir(self):
            attr = getattr(self, name)
            if not isinstance(attr, MethodType):
                continue
            if getattr(attr, 'before', None):
                before_funcs.append(attr)
            if getattr(attr, 'after', None):
                after_funcs.append(attr)
        self._before_funcs = before_funcs
        self._after_funcs = after_funcs

    def categorize(self, text):
        '''Categorize the action ``text``.

        Runs the before hooks, tests ``text`` against every rule in
        ``self.rules`` (stopping early once a matching rule has
        ``stop`` set to True), then runs the after hooks and
        ``finalize``.

        Returns a ``(types, attrs)`` 2-tuple as produced by
        ``finalize``.
        '''
        # Each run of 1-4 whitespace characters in a pattern is relaxed
        # to ``\s{,4}``. The backslash in the replacement is doubled
        # because a bare ``\s`` is not a valid re.sub template escape
        # on newer Pythons; the doubled form yields the same pattern
        # everywhere.
        loosen = partial(re.sub, r'\s{1,4}', r'\\s{,4}')

        # Run the before hooks. Each takes and returns the text.
        text = self.before_categorize(text)
        for func in self._before_funcs:
            text = func(text)

        types = set()
        attrs = defaultdict(set)
        for rule in self.rules:
            if self._apply_rule(rule, text, loosen, types, attrs):
                # A matching ``stop`` rule is conclusive.
                break

        # Run the after hooks. Decorated hooks receive types and attrs
        # as two positional arguments and must return a 2-tuple.
        return_val = (list(types), attrs)
        return_val = self.after_categorize(return_val)
        for func in self._after_funcs:
            return_val = func(*return_val)
        return self.finalize(return_val)

    def _apply_rule(self, rule, text, loosen, types, attrs):
        '''Test one rule against ``text``, updating ``types`` and
        ``attrs`` in place. Returns True if a regex matched and the
        rule asks to stop, else False.
        '''
        for regex in rule.regexes:
            m = re.search(loosen(regex), text)

            # A rule also applies when its pattern appears verbatim
            # in the text even though the regex search failed.
            if m is None and regex not in text:
                continue

            types.update(rule.types)

            # Named groups are only available on a real regex match;
            # the verbatim fallback has no match object.
            if m is not None:
                for k, v in m.groupdict().items():
                    attrs[k].add(v)

            for k, v in rule.attrs.items():
                attrs[k].add(v)

            if rule.stop is True:
                return True
        return False

    def before_categorize(self, text):
        '''A precategorization hook. Takes/returns text.
        '''
        return text

    def after_categorize(self, return_val):
        '''A post-categorization hook. Takes, returns
        a tuple like (types, attrs), where types is a sequence
        of categories (e.g., bill:passed), and attrs is a
        dictionary of additional attributes that can be used to
        augment the action (or whatever).
        '''
        return return_val

    def finalize(self, return_val):
        '''Before the types and attrs get passed to the
        importer they need to be altered by converting sets to
        lists, etc.

        Takes a ``(types, attrs)`` tuple and returns ``(types, _attrs)``
        where ``_attrs`` is a plain dict. Falsy values are dropped from
        each collection, collections left empty are omitted, and a
        single-valued ``actor`` is unwrapped to the bare value. Values
        that are not a set, frozenset, list or tuple are passed
        through untouched.
        '''
        types, attrs = return_val
        _attrs = {}

        # Get rid of defaultdict.
        for k, v in attrs.items():

            # Skip empties.
            if not v:
                continue

            if isinstance(v, (set, frozenset, list, tuple)):
                # Get rid of sets and of falsy members (e.g. the None
                # that an unmatched named group contributes). A list
                # is built explicitly because ``filter`` returns a
                # lazy iterator on Python 3.
                v = [item for item in v if item]
                if not v:
                    continue

                # Some vals should be strings, not seqs.
                if k == 'actor' and len(v) == 1:
                    v = v[0]

            _attrs[k] = v

        return types, _attrs


def after_categorize(f):
    '''A decorator to mark a function to be run
    after categorization has happened.

    Sets ``f.after = True`` and returns ``f`` itself, unwrapped.
    '''
    f.after = True
    return f


def before_categorize(f):
    '''Decorator flagging ``f`` as a hook that runs before
    categorization happens.

    The function is tagged with a ``before`` attribute set to True
    and handed back as-is.
    '''
    setattr(f, 'before', True)
    return f


# These are regex patterns that map to action categories.
_categorizer_rules = (

    # Passage or adoption; both patterns are anchored to the start
    # of the action text.
    Rule(r'(?i)^(RE)?PASSED', types='bill:passed'),
    Rule(r'(?i)^ADOPTED', types='bill:passed'),

    # Amendments. Named groups capture the bill id or the committee
    # text mentioned in the action.
    Rule(r'(?i)AMENDED (?P<bill_id>\d+)', types='amendment:passed'),
    Rule(r'(?i)AMEND AND RECOMMIT TO (?P<committees>.+)',
         types=('amendment:passed', 'committee:referred')),
    Rule(r'(?i)amend .+? and recommit to (?P<committees>.+)',
         types=('amendment:passed', 'committee:referred')),
    Rule(r'(?i)AMENDED ON THIRD READING (\(T\) )?(?P<bill_id>.+)',
         types='amendment:passed'),
    Rule(r'(?i)print number (?P<bill_id>\d+)', types='amendment:passed'),
    Rule(r'(?i)tabled', types='amendment:tabled'),

    # Committee activity.
    Rule(r'(?i)held .+? in (?P<committees>.+)', types='bill:failed'),
    Rule(r'(?i)REFERRED TO (?P<committees>.+)', types='committee:referred'),
    Rule(r'(?i)reference changed to (?P<committees>.+)',
         types='committee:referred'),
    Rule(r'(?i) committed to (?P<committees>.+)',
         types='committee:referred'),
    # Matched, but assigns no category types.
    Rule(r'(?i)^reported$'),

    # Governor actions.
    Rule(r'(?i)signed chap.(?P<session_laws>\d+)', types='governor:signed'),
    Rule(r'(?i)vetoed memo.(?P<veto_memo>.+)', types='governor:vetoed'),
    Rule(r'(?i)DELIVERED TO GOVERNOR', types='governor:received'),

    # Miscellaneous: captures the substituting bill's id without
    # assigning any category types.
    Rule(r'(?i)substituted by (?P<bill_id>\w\d+)'),
)


class Categorizer(BaseCategorizer):
    '''Action categorizer whose rules are the module-level
    ``_categorizer_rules`` table; all behaviour is inherited
    from ``BaseCategorizer``.
    '''
    rules = _categorizer_rules
37 changes: 7 additions & 30 deletions openstates/ny/bills.py
Expand Up @@ -9,11 +9,15 @@
import lxml.html
import lxml.etree

from .actions import Categorizer


class NYBillScraper(BillScraper):

state = 'ny'

categorizer = Categorizer()

def scrape(self, chamber, session):

errors = 0
Expand Down Expand Up @@ -163,40 +167,13 @@ def scrape_bill(self, bill, url):

actions.append((date, action))

first = True
categorizer = self.categorizer
for date, action in reversed(actions):
act_chamber = ('upper' if action.isupper() else 'lower')
atype = []
if first:
atype.append('bill:introduced')
first = False

if 'REFERRED TO' in action:
atype.append('committee:referred')
elif action == 'ADOPTED':
atype.append('bill:passed')
elif action in ('PASSED SENATE', 'PASSED ASSEMBLY'):
atype.append('bill:passed')
elif action in ('DELIVERED TO SENATE',
'DELIVERED TO ASSEMBLY'):
first = True
act_chamber = {'upper': 'lower',
'lower': 'upper'}[act_chamber]
elif (action.startswith('AMENDED') or
action.startswith('AMEND (T) AND') or
action.startswith('AMEND AND')):
atype.append('amendment:passed')
elif action.startswith('RECOMMIT,'):
atype.append('committee:referred')

if 'RECOMMIT TO' in action:
atype.append('committee:referred')

if not atype:
atype = ['other']

types, attrs = categorizer.categorize(action)
bill.add_action(act_chamber, action, date,
type=atype)
type=types)

self.scrape_versions(bill, page, url)

Expand Down

0 comments on commit 60675f2

Please sign in to comment.