bills.py

from datetime import datetime
import re

import lxml.html

from billy.scrape.bills import Bill, BillScraper

from .actions import NDCategorizer

# Subject index page for a given assembly (term, start year).
base_url = "http://www.legis.nd.gov/assembly/%s-%s/subject-index/major-topic.html"


class NDBillScraper(BillScraper):
    """
    Scrapes available legislative information from the website of the North
    Dakota legislature and stores it in the openstates backend.
    """
    jurisdiction = 'nd'
    categorizer = NDCategorizer()

    def scrape_actions(self, session, subject, href, bid):
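        """
        Scrape a single bill-actions page: build the Bill with its title,
        sponsors, and categorized actions, pull in versions when a
        'Versions' link is present, and save it.
        """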
        page = self.urlopen(href)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(href)

        table = page.xpath("//table[@summary='Bill Number Breakdown']")

        if len(table) > 1:  # Pre-2013 pages: two summary tables.
            ttable, table = table
            ttrows = ttable.xpath(".//tr")
            descr = ttrows[-1]
        else:  # 2013+ pages: walk siblings from the fastpath div to the table.
            table = table[0]
            curnode = page.xpath("//div[@id='fastpath']")[0].getnext()
            ret = []
            while curnode.tag != "table":
                curnode = curnode.getnext()
                ret.append(curnode)
            ttrows = ret
            descr = page.xpath("//div[@class='section']//p")[-2]

        title = re.sub(r"\s+", " ", descr.text_content()).strip()
        # The last entry is the description row (pre-2013) or the action
        # table itself (2013+), not a sponsor row.
        ttrows = ttrows[:-1]

        # Bill IDs start with "H" or "S".
        chamber = {
            "H": "lower",
            "S": "upper",
        }[bid[0]]
        bill = Bill(session, chamber, bid, title, subject=subject)
        bill.add_source(href)

        # Sponsor rows read "Introduced by Rep./Sen. X, Y, Z".
        for row in ttrows:
            sponsors = row.text_content().strip()
            sinf = re.match(
                r"(?i)introduced by( (rep\.|sen\.))? (?P<sponsors>.*)",
                sponsors
            )
            if sinf:
                sponsors = sinf.groupdict()
                for sponsor in [
                    x.strip() for x in sponsors['sponsors'].split(",")
                ]:
                    bill.add_sponsor('primary', sponsor)
        # Action rows are (date, chamber, description). A blank date cell
        # means the action happened on the previous row's date, so the
        # parsed date carries over.
        dt = None
        for row in table.xpath(".//tr"):
            if row.text_content().strip() == '':
                continue
            if "Meeting Description" in [
                x.strip() for x in row.xpath(".//th/text()")
            ]:
                continue

            row = [x.text_content().strip() for x in row.xpath("./*")]
            if len(row) > 3:
                row = row[:3]
            date, chamber, action = row

            try:
                chamber = {
                    "House": "lower",
                    "Senate": "upper",
                }[chamber]
            except KeyError:
                chamber = "other"

            if date != '':
                # Dates are given as MM/DD; assume the session's start year.
                dt = datetime.strptime("%s %s" % (date, self.year), "%m/%d %Y")

            kwargs = self.categorizer.categorize(action)
            bill.add_action(chamber, action, dt, **kwargs)
        version_url = page.xpath("//a[contains(text(), 'Versions')]")
        if len(version_url) == 1:
            href = version_url[0].attrib['href']
            bill = self.scrape_versions(bill, href)

        self.save_bill(bill)

    def scrape_versions(self, bill, href):
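        """
        Add every document linked from the bill's versions page as a PDF
        version of the bill, then return the bill.
        """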
        page = self.urlopen(href)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(href)
        versions = page.xpath("//a[contains(@href, '/documents/')]")
        for version in versions:
            name, href = version.text, version.attrib['href']
            bill.add_version(name, href, mimetype='application/pdf')
        return bill

    def scrape_subject(self, session, href, subject):
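        """
        Scrape one subject-index page, following each bill-actions link
        it lists.
        """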
        page = self.urlopen(href)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(href)
        bills = page.xpath("//a[contains(@href, 'bill-actions')]")
        for bill in bills:
            # Link text is three tokens, e.g. "HB 1001 ..."; keep "HB 1001".
            bt = bill.text_content()
            typ, idd, _ = bt.split()
            bid = "%s %s" % (typ, idd)
            self.scrape_actions(session, subject, bill.attrib['href'], bid)

    def scrape(self, term, chambers):
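        """
        Entry point: look up the term's start year in the metadata, then
        walk the subject index and scrape every subject page.
        """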
        # Figure out the starting year from the metadata.
        for t in self.metadata['terms']:
            if t['name'] == term:
                start_year = t['start_year']
                self.year = start_year
                break

        url = base_url % (term, start_year)
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        subjects = page.xpath(
            "//div[@id='application']"
            "//a[not(contains(@href, 'major-topic'))]"
        )
        for subject in subjects:
            subject_name = subject.xpath("text()")
            if subject_name == [] \
                    or subject_name[0].strip() == '' \
                    or 'href' not in subject.attrib:
                continue
            href = subject.attrib['href']
            self.scrape_subject(term, href, subject.text.strip())
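

# Usage sketch (an assumption, not part of the original file): billy
# scrapers like this one are normally driven by the framework rather than
# run directly, e.g.
#
#   billy-update nd --bills
#
# which loads the nd jurisdiction metadata, instantiates NDBillScraper,
# and calls scrape(term, chambers) for the requested term.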