-
Notifications
You must be signed in to change notification settings - Fork 462
/
bills.py
497 lines (464 loc) · 24.5 KB
/
bills.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
import re
from billy.scrape import NoDataForPeriod
from billy.scrape.bills import BillScraper, Bill
from billy.scrape.votes import Vote
from openstates.az import utils
from openstates.az.action_utils import get_action_type
from lxml import html
BASE_URL = 'http://www.azleg.gov/'
class AZBillScraper(BillScraper):
"""
Arizona Bill Scraper.
"""
state = 'az'
def get_session_id(self, session):
    """
    Return the ``session_id`` stored in the metadata for ``session``.

    The value is read from ``self.metadata['session_details']``; a session
    that has no entry there raises ``KeyError``.
    """
    session_details = self.metadata['session_details']
    this_session = session_details[session]
    return this_session['session_id']
def scrape_bill(self, chamber, session, bill_id):
    """
    Build a ``Bill`` from the "documents for bill" page of ``bill_id``.

    Versions, fact sheets/summaries, agendas, house and senate calendars,
    amendments and videos listed on the page are attached to the bill,
    which is then passed to ``scrape_actions``.
    """
    url = BASE_URL + 'DocumentsForBill.asp?Bill_Number=%s&Session_ID=%s' % (
        bill_id.replace(' ', ''), self.get_session_id(session))
    with self.urlopen(url) as docs_for_bill:
        root = html.fromstring(docs_for_bill)
        # the second "ContentPageTitle" div is the one read as the title
        title_divs = root.xpath('//div[@class="ContentPageTitle"]')
        bill = Bill(session, chamber, bill_id, title_divs[1].text.strip(),
                    type=utils.get_bill_type(bill_id))
        bill.add_source(url)

        # row selectors: by visible text, by a direct link, by a link
        # wrapped in a <font> tag
        text_rows = '//tr[contains(td/font/text(), "%s")]'
        link_rows = '//tr[contains(td/a/@href, "%s")]'
        font_link_rows = '//tr[contains(td/font/a/@href, "%s")]'

        def cells_of(row_xpath, marker):
            # yield the <td> cells of every row selected by the xpath
            for matched_row in root.xpath(row_xpath % marker):
                yield matched_row.cssselect('td')

        # versions
        for cells in cells_of(text_rows, 'd Version'):
            version_name = cells[1].text_content().strip()
            version_url = cells[2].xpath('string(font/a/@href)')
            bill.add_version(version_name, version_url)

        # fact sheets and summary
        for cells in cells_of(font_link_rows, '/summary/'):
            sheet_name = cells[1].text_content().strip()
            sheet_url = cells[1].xpath('string(font/a/@href)')
            bill.add_document(sheet_name, sheet_url, type="summary")

        # agendas
        # revised, cancelled, date, time and room are skipped; the agenda
        # type (house or senate) is not captured
        for cells in cells_of(link_rows, '/agendas'):
            committee_name = cells[0].text_content().strip()
            agenda_url = cells[7].xpath('string(a/@href)').strip()
            if not agenda_url:
                # fall back to the link in the neighbouring cell
                agenda_url = cells[6].xpath('string(a/@href)').strip()
            bill.add_document(committee_name, agenda_url)

        # house calendars first, then senate calendars
        # calendar number, modified and date are skipped
        calendar_kinds = (('/calendar/h', 'house calendar'),
                          ('/calendar/s', 'senate calendar'))
        for marker, doc_type in calendar_kinds:
            for cells in cells_of(link_rows, marker):
                calendar_name = cells[0].text_content().strip()
                calendar_url = cells[5].xpath('string(a/@href)')
                bill.add_document(calendar_name, calendar_url,
                                  type=doc_type)

        # amendments
        for cells in cells_of(text_rows, 'AMENDMENT:'):
            amendment_name = cells[1].text_content().strip()
            amendment_url = cells[2].xpath('string(font/a/@href)')
            bill.add_document(amendment_name, amendment_url,
                              type='amendment')

        # videos
        # http://azleg.granicus.com/MediaPlayer.php?view_id=13&clip_id=7684
        for cells in cells_of(link_rows, '&clip_id'):
            clip_title = cells[1].text_content().strip()
            clip_url = cells[2].xpath('string(a/@href)')
            clip_date = cells[0].text_content().strip()
            bill.add_document(clip_title, clip_url, date=clip_date,
                              type='video')

        self.scrape_actions(chamber, session, bill)
def scrape_actions(self, chamber, session, bill):
    """
    Scrape the actions for a given bill.

    Fetches the bill's overview page, records its sponsors and any
    alternate titles, then walks every action table on the page and adds
    the matching actions (and, through ``scrape_votes``, the votes) to
    ``bill``.  The actions are finally ordered with ``sort_bill_actions``
    and the bill is saved with ``save_bill``.

    chamber -- 'upper' or 'lower'
    session -- session name, converted with ``utils.legislature_to_number``
    bill -- the ``Bill`` being populated

    NOTE: ``actor`` and ``date`` deliberately carry over from one table
    to the next; the AMENDMENTS and MOTION TO RECONSIDER branches reuse
    the values left behind by the table handled before them.
    """
    ses_num = utils.legislature_to_number(session)
    bill_id = bill['bill_id'].replace(' ', '')
    action_url = BASE_URL + 'FormatDocument.asp?inDoc=/legtext/%s/bills/%so.asp' % (ses_num, bill_id.lower())
    with self.urlopen(action_url) as action_page:
        bill.add_source(action_url)
        root = html.fromstring(action_page)
        base_table = root.xpath('//table[@class="ContentAreaBackground"]')[0]
        # common xpaths
        table_path = '//table[contains(tr/td/b/text(), "%s")]'

        # sponsors
        sponsors = base_table.xpath('//sponsor')
        for sponsor in sponsors:
            name = sponsor.text.strip()
            # the sponsor type is the text of the element that follows the
            # <sponsor> tag's grandparent
            s_type = sponsor.getparent().getparent().getnext().text_content().strip()
            bill.add_sponsor(s_type, name)

        # titles
        table = base_table.xpath(table_path % 'TITLE')
        if table:
            for row in table[0].iterchildren('tr'):
                title = row[1].text_content().strip()
                if title != bill['title']:
                    bill.add_title(title)

        for table in base_table.xpath('tr/td/table'):
            # the first cell (or, failing that, the first row) names the
            # kind of action the table describes
            action = table.xpath('string(tr[1]/td[1])').strip()
            if action == '':
                action = table.xpath('string(tr[1])').strip()

            if (action.endswith('FIRST READ:') or
                    action.endswith('SECOND READ:') or 'WAIVED' in action):
                rows = table.xpath('tr')
                for row in rows:
                    # drop the trailing ':' from the row label
                    action = row[0].text_content().strip()[:-1]
                    actor = 'lower' if action.startswith('H') else 'upper'
                    date = utils.get_date(row[1])
                    # bill:introduced
                    if (action.endswith('FIRST READ') or
                            action.endswith('FIRST WAIVED')):
                        # a first read in the chamber being scraped also
                        # counts as the introduction
                        if actor == chamber:
                            a_type = ['bill:introduced', 'bill:reading:1']
                        else:
                            a_type = 'bill:reading:1'
                        bill.add_action(actor, action, date, type=a_type)
                    else:
                        a_type = 'bill:reading:2'
                        bill.add_action(actor, action, date, type=a_type)
                continue

            elif action == 'COMMITTEES:':
                # committee assignments
                rows = table.xpath('tr')[1:]
                for row in rows:
                    # First add the committee assigned action
                    meta_tag = row.cssselect('meta')[0]
                    # @name is HCOMMITTEE or SCOMMITTEE
                    h_or_s = meta_tag.get('name')[0]
                    # @content is the committee abbreviation
                    committee = meta_tag.get('content')
                    # actor is the chamber referring the bill to committee
                    actor = 'lower' if h_or_s.lower() == 'h' else 'upper'
                    act = 'assigned to committee: ' + committee
                    date = utils.get_date(row[1])
                    bill.add_action(actor, act, date, type='committee:referred')
                    # now let's see if there is a vote
                    vote_url = row[0].xpath('string(a/@href)')
                    if vote_url:
                        date = utils.get_date(row[3])
                        act = row[5].text_content().strip()
                        a_type = get_action_type(act, 'COMMITTEES:')
                        bill.add_action(actor, committee + ":" + act, date,
                                        type=a_type)
                        self.scrape_votes(actor, vote_url, bill, date,
                                          motion='committee: ' + act,
                                          committee=committee,
                                          type='other')
                    elif len(row) == 5:
                        # probably senate rules committee
                        date = utils.get_date(row[3])
                        if date == '':
                            date = utils.get_date(row[1])
                        act = row[4].text_content().strip()
                        a_type = get_action_type(act, 'COMMITTEES:')
                        bill.add_action(actor, committee + ":" + act, date,
                                        type=a_type)
                continue

            elif 'CAUCUS' in action:
                rows = table.xpath('tr')[0:2]
                for row in rows:
                    actor = utils.get_actor(row, chamber)
                    action = row[0].text_content().strip()
                    if action.endswith(':'):
                        action = action[:-1]
                    result = row[2].text_content().strip()
                    # majority caucus Y|N
                    action = action + " concur: " + result
                    date = utils.get_date(row[1])
                    bill.add_action(actor, action, date, concur=result,
                                    type='other')
                continue

            # transmit to house or senate
            elif 'TRANSMIT TO' in action:
                rows = table.xpath('tr')
                for row in rows:
                    action = row[0].text_content().strip()[:-1]
                    # the chamber sending the bill is the other one
                    actor = 'upper' if action.endswith('HOUSE') else 'lower'
                    date = utils.get_date(row[1])
                    bill.add_action(actor, action, date, type='other')
                continue

            # Committee of the whole actions
            elif 'COW ACTION' in action:
                rows = table.xpath('tr')
                actor = utils.get_actor(rows[0], chamber)
                if 'SIT COW ACTION' in action:
                    act = rows[0][-1].text_content().strip()
                    date = utils.get_date(rows[0][1])
                else:
                    act = rows[1][2].text_content().strip()
                    date = utils.get_date(rows[1][1])
                action = action + " " + act  # COW ACTION 1 DPA
                bill.add_action(actor, action, date, type='other')
                if rows[1][0].text_content().strip() == 'Vote Detail':
                    vote_url = rows[1][0].xpath('string(a/@href)')
                    self.scrape_votes(actor, vote_url, bill, date,
                                      motion=action, type='other',
                                      extra=act)
                continue

            # AMENDMENTS
            elif 'AMENDMENTS' in action:
                rows = table.xpath('tr')[1:]
                for row in rows:
                    act = row.text_content().strip()
                    if act == '':
                        continue
                    if 'passed' in act or 'adopted' in act:
                        a_type = 'amendment:passed'
                    elif 'failed' in act:
                        a_type = 'amendment:failed'
                    elif 'withdrawn' in act:
                        a_type = 'amendment:withdrawn'
                    else:
                        a_type = 'other'
                    # actor and date are the same as for the previous action
                    bill.add_action(actor, act, date, type=a_type)
                continue

            # CONFERENCE COMMITTEE
            # http://www.azleg.gov/FormatDocument.asp?inDoc=/legtext/49Leg/2r/bills/hb2083o.asp
            # MISCELLANEOUS MOTION
            # MOTION TO RECONSIDER
            elif action == 'MOTION TO RECONSIDER:':
                # actor is the one left by the previous table
                date = utils.get_date(table[1][1])
                if date:
                    if table[1][0].text_content().strip() == 'Vote Detail':
                        vote_url = table[1][0].xpath('string(a/@href)')
                        # Use this table's own date and a fixed type:
                        # 'vote_date' and 'a_type' are only bound by other
                        # branches, so reading them here would either raise
                        # NameError or reuse another action's values.
                        bill.add_action(actor, action, date, type='other')
                        self.scrape_votes(actor, vote_url, bill, date,
                                          motion='motion to reconsider',
                                          type='other')
                    else:
                        action = table[-1][1].text_content().strip()
                        bill.add_action(actor, action, date, type='other')
                continue

            elif (action.endswith('FINAL READ:') or
                    action.endswith('THIRD READ:')):
                # house|senate final and third read
                rows = table.xpath('tr')
                # need to find out if third read took place in house or
                # senate: if an ancestor table contains 'TRANSMIT TO' then
                # the action is taking place in that chamber, else it is
                # in chamber
                actor = utils.get_actor(rows[0], chamber)
                # get a dict of keys from the header and values from the row
                k_rows = utils.get_rows(rows[1:], rows[0])
                action = rows[0][0].text_content().strip()
                for row in k_rows:
                    a_type = [get_action_type(action, 'Generic')]
                    if row[action].text_content().strip() == 'Vote Detail':
                        vote_url = row.pop(action).xpath('string(a/@href)')
                        vote_date = utils.get_date(row.pop('DATE'))
                        passed = row.pop('RESULT').text_content().strip()
                        # What is left in the row (vote counts, amended,
                        # emergency, two-thirds and possibly rfe) is handed
                        # to scrape_votes as keyword arguments; the counts
                        # themselves are read from the vote page there.
                        pass_fail = {'PASSED': 'bill:passed',
                                     'FAILED': 'bill:failed'}[passed]
                        a_type.append(pass_fail)
                        bill.add_action(actor, action, vote_date,
                                        type=a_type)
                        row['type'] = 'passage'
                        self.scrape_votes(actor, vote_url, bill, vote_date,
                                          passed=passed, motion=action,
                                          **row)
                    else:
                        date = utils.get_date(row.pop('DATE'))
                        if date:
                            bill.add_action(actor, action, date, type=a_type)
                continue

            elif 'TRANSMITTED TO' in action:
                # transmitted to Governor or secretary of the state
                # SoS if it goes to voters as a proposition and memorials, etc
                rows = table.xpath('tr')
                actor = utils.get_actor(rows[0], chamber)
                # actor is the actor from the previous statement because it
                # is never transmitted to G or S without third or final read
                sent_to = rows[0][1].text_content().strip()
                date = utils.get_date(rows[0][2])
                a_type = 'governor:received' if sent_to[0] == 'G' else 'other'
                bill.add_action(actor, "TRANSMITTED TO " + sent_to, date,
                                type=a_type)
                # See if the actor is the governor and whether he signed
                # the bill or vetoed it
                act, date, chapter, version = '', '', '', ''
                for row in rows[1:]:
                    label = row[0].text_content().strip()
                    if label == 'ACTION:':
                        act = row[1].text_content().strip()
                        date = utils.get_date(row[2])
                    elif label == 'CHAPTER:':
                        chapter = row[1].text_content().strip()
                    elif label == 'CHAPTERED VERSION:':
                        version = row[1].text_content().strip()
                    elif label == 'TRANSMITTED VERSION:':
                        version = row[1].text_content().strip()
                if act and sent_to == 'GOVERNOR':
                    a_type = 'governor:signed' if act == 'SIGNED' else 'governor:vetoed'
                    if chapter:
                        bill.add_action(sent_to.lower(), act, date,
                                        type=a_type, chapter=chapter,
                                        chaptered_version=version)
                    else:
                        bill.add_action(sent_to.lower(), act, date,
                                        type=a_type)
                continue

            # this is probably only important for historical legislation
            elif 'FINAL DISPOSITION' in action:
                rows = table.xpath('tr')
                if rows:
                    disposition = rows[0][1].text_content().strip()
                    bill['final_disposition'] = disposition

        bill = self.sort_bill_actions(bill)
        self.save_bill(bill)
def scrape(self, chamber, session):
    """
    Scrape every bill linked from the bill index of ``chamber``.

    Raises ``NoDataForPeriod`` when ``session`` has no entry in the
    metadata.  Each linked bill is handed to ``scrape_bill``.
    """
    try:
        session_id = self.get_session_id(session)
    except KeyError:
        raise NoDataForPeriod(session)

    index_views = {'lower':'allhouse', 'upper':'allsenate'}
    url = BASE_URL + 'Bills.asp?view=%s&Session_ID=%s' % (
        index_views[chamber], session_id)
    links_xpath = ('//div/table/tr[3]/td[4]/table/tr/td/'
                   'table[2]/tr[2]/td/table/tr/td[2]/table/tr/td//a')
    with self.urlopen(url) as bills_index:
        root = html.fromstring(bills_index)
        for link in root.xpath(links_xpath):
            # "HB2001" -> "HB 2001": split into letters and digits and
            # rejoin the pieces with spaces
            pieces = re.split('([A-Z]*)([0-9]*)', link.text.strip())
            spaced_id = " ".join(pieces).strip()
            self.scrape_bill(chamber, session, spaced_id)
def scrape_votes(self, chamber, url, bill, date, **kwargs):
    """
    Scrape a vote detail page and add the resulting ``Vote`` to ``bill``.

    chamber -- 'upper' or 'lower'; the chamber that voted
    url -- address of the vote detail page
    bill -- the ``Bill`` the vote is attached to
    date -- date of the vote

    Keyword arguments:
    type -- required; vote type passed on to ``Vote``
    motion -- required; the motion that was voted on
    passed -- 'PASSED' or 'FAILED'; when absent the outcome is worked
              out from the tallies on the page
    committee -- committee abbreviation, expanded with
                 ``utils.get_committee_name``
    AMEND, EMER, 2/3 VOTE -- table cells whose text (when non-empty for
                 the latter two) is stored on the vote
    Any other keyword argument is ignored.

    Raises ``KeyError`` when ``type`` or ``motion`` is missing, or when
    ``passed`` is neither 'PASSED' nor 'FAILED'.
    """
    o_args = {}
    v_type = kwargs.pop('type')
    # fail before fetching the page rather than with an unbound name later
    motion = kwargs.pop('motion')
    # None means "not supplied": decide from the vote counts further down
    passed = None
    if 'passed' in kwargs:
        passed = {'PASSED': True, 'FAILED': False}[kwargs.pop('passed')]
    if 'AMEND' in kwargs:
        o_args['amended'] = kwargs.pop('AMEND').text_content().strip()
    for flag in ('EMER', '2/3 VOTE'):
        if flag in kwargs:
            flag_text = kwargs.pop(flag).text_content().strip()
            if flag_text:
                o_args[flag] = flag_text
    if 'committee' in kwargs:
        o_args['committee'] = utils.get_committee_name(kwargs.pop('committee'),
                                                       chamber)

    with self.urlopen(url) as vote_page:
        root = html.fromstring(vote_page)
        vote_table = root.xpath('/html/body/div/table/tr[3]/td[4]/table/tr/td/table/tr/td/table')[0]
        vote_count = vote_table.xpath('following-sibling::p/following-sibling::text()')
        vote_string = vote_count[0].replace(u'\xa0', '').strip()

        # "LABEL: 12" pairs such as "AYES: 5" or "NOT VOTING: 1".
        # [A-Za-z] rather than [A-z] (which also matches [ \ ] ^ _ `), and
        # \d+ so a label without a number is skipped instead of reaching
        # int('').
        tally_re = re.compile(r'\b[A-Z]*\s*[A-Za-z]*:\s\d+')
        yes_count = no_count = o_count = 0
        labels_seen = set()
        for tally in tally_re.findall(vote_string):
            label, number = tally.split(':')
            # make NOT VOTING not_voting
            label = label.strip().replace(' ', '_').lower()
            number = int(number.strip())
            labels_seen.add(label)
            if label == 'ayes':
                yes_count = number
            elif label == 'nays':
                no_count = number
            else:
                o_args[str(label)] = number
                o_count += number
        if 'ayes' not in labels_seen or 'nays' not in labels_seen:
            self.log("Missing AYES/NAYS tally on " + url)

        if passed is None:
            passed = yes_count > no_count
            if 'committee' not in o_args:
                # floor votes must also clear a fixed number of ayes,
                # higher when the emergency / two-thirds flag is set
                if chamber == 'upper' and passed:
                    if 'EMER' in o_args or '2/3 VOTE' in o_args:
                        passed = yes_count > 20
                    else:
                        passed = yes_count > 16
                elif chamber == 'lower' and passed:
                    if 'EMER' in o_args or '2/3 VOTE' in o_args:
                        passed = yes_count > 40
                    else:
                        passed = yes_count > 31

        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    o_count, type=v_type, **o_args)
        vote.add_source(url)

        # The table's cells alternate member name / vote; pair them up.
        # zip() drops a trailing unpaired cell instead of failing to unpack.
        cells = vote_table.xpath('descendant::td')
        for name_cell, vote_cell in zip(cells[::2], cells[1::2]):
            cast = vote_cell.text_content().strip()
            name = name_cell.text_content().strip()
            if name == 'Member Name':
                # header row
                continue
            if cast == 'Y':
                vote.yes(name)
            elif cast == 'N':
                vote.no(name)
            else:
                vote.other(name)
        bill.add_vote(vote)
def sort_bill_actions(self, bill):
    """
    Put ``bill['actions']`` into chronological order and return ``bill``.

    The actions are sorted by their ``'date'`` value with a stable sort,
    so actions sharing a date keep the order in which they were added.
    A bill without actions is returned untouched.

    If the dates cannot be compared with each other (``TypeError``), the
    failure is logged and the bill is returned with its actions in their
    existing order.
    """
    actions = bill['actions']
    if not actions:
        return bill
    try:
        # sorted() builds a new list, so nothing is mutated while it is
        # being iterated and no action can be skipped or misplaced
        ordered = sorted(actions, key=lambda action: action['date'])
    except TypeError:
        self.log("Unable to sort " + bill['bill_id'])
        return bill
    bill['actions'] = ordered
    return bill