/
utils.py
239 lines (191 loc) · 7.18 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
# Helpful functions for finding data about members and committees
CURRENT_CONGRESS = 112
import os, errno, sys, traceback
import re, htmlentitydefs
import pprint
from datetime import datetime
def parse_date(date):
return datetime.strptime(date, "%Y-%m-%d").date()
def log(object):
if isinstance(object, (str, unicode)):
print object
else:
pprint(object)
def uniq(seq):
seen = set()
seen_add = seen.add
return [ x for x in seq if x not in seen and not seen_add(x)]
def flags():
options = {}
for arg in sys.argv[1:]:
if arg.startswith("--"):
if "=" in arg:
key, value = arg.split('=')
else:
key, value = arg, True
key = key.split("--")[1]
if value == 'True': value = True
elif value == 'False': value = False
options[key.lower()] = value
return options
##### Data management
def data_dir():
return ".."
def load_data(path):
return yaml_load(os.path.join(data_dir(), path))
def save_data(data, path):
return yaml_dump(data, os.path.join(data_dir(), path))
##### Downloading
import scrapelib
scraper = scrapelib.Scraper(requests_per_minute=120, follow_robots=False, retry_attempts=3)
def cache_dir():
return "cache"
def download(url, destination, force=False, options=None):
if not options:
options = {}
cache = os.path.join(cache_dir(), destination)
if not force and os.path.exists(cache):
if options.get('debug', False):
log("Cached: (%s, %s)" % (cache, url))
with open(cache, 'r') as f:
body = f.read()
else:
try:
if options.get('debug', False):
log("Downloading: %s" % url)
response = scraper.urlopen(url)
body = response.encode('utf-8')
except scrapelib.HTTPError as e:
log("Error downloading %s" % url)
return None
# don't allow 0-byte files
if (not body) or (not body.strip()):
return None
# cache content to disk
write(body, cache)
return body
def write(content, destination):
mkdir_p(os.path.dirname(destination))
f = open(destination, 'w')
f.write(content)
f.close()
# mkdir -p in python, from:
# http://stackoverflow.com/questions/600268/mkdir-p-functionality-in-python
def mkdir_p(path):
try:
os.makedirs(path)
except OSError as exc: # Python >2.5
if exc.errno == errno.EEXIST:
pass
else:
raise
def format_exception(exception):
exc_type, exc_value, exc_traceback = sys.exc_info()
return "\n".join(traceback.format_exception(exc_type, exc_value, exc_traceback))
# taken from http://effbot.org/zone/re-sub.htm#unescape-html
def unescape(text):
def remove_unicode_control(str):
remove_re = re.compile(u'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]')
return remove_re.sub('', str)
def fixup(m):
text = m.group(0)
if text[:2] == "&#":
# character reference
try:
if text[:3] == "&#x":
return unichr(int(text[3:-1], 16))
else:
return unichr(int(text[2:-1]))
except ValueError:
pass
else:
# named entity
try:
text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
except KeyError:
pass
return text # leave as is
text = re.sub("&#?\w+;", fixup, text)
text = remove_unicode_control(text)
return text
##### YAML serialization ######
# In order to preserve the order of attributes, YAML must be
# hooked to load mappings as OrderedDicts. Adapted from:
# https://gist.github.com/317164
import yaml
try:
from yaml import CSafeLoader as Loader, CDumper as Dumper
except ImportError:
from yaml import SafeLoader as Loader, Dumper
from collections import OrderedDict
def construct_odict(load, node):
omap = OrderedDict()
yield omap
if not isinstance(node, yaml.MappingNode):
raise yaml.constructor.ConstructorError(
"while constructing an ordered map",
node.start_mark,
"expected a map, but found %s" % node.id, node.start_mark
)
for key, value in node.value:
key = load.construct_object(key)
value = load.construct_object(value)
omap[key] = value
Loader.add_constructor(u'tag:yaml.org,2002:map', construct_odict)
def ordered_dict_serializer(self, data):
return self.represent_mapping('tag:yaml.org,2002:map', data.items())
Dumper.add_representer(OrderedDict, ordered_dict_serializer)
# Likewise, when we store unicode objects make sure we don't write
# them with weird YAML tags indicating the Python data type. The
# standard string type is fine. We should do this:
# Dumper.add_representer(unicode, lambda dumper, value: dumper.represent_scalar(u'tag:yaml.org,2002:str', value))
#
# However, the standard PyYAML representer for strings does something
# weird: if a value cannot be parsed as an integer quotes are omitted.
#
# This is incredibly odd when the value is an integer with a leading
# zero. These values are typically parsed as octal integers, meaning
# quotes would normally be required (that's good). But when the value
# has an '8' or '9' in it, this would make it an invalid octal number
# and so quotes would no longer be required (that's confusing).
# We will override str and unicode output to choose the quotation
# style with our own logic. (According to PyYAML, style can be one of
# the empty string, ', ", |, or >, or None to, presumably, choose
# automatically.
def our_string_representer(dumper, value):
# If it looks like an octal number, force '-quote style.
style = None
if re.match(r"^0\d*$", value): style = "'"
return dumper.represent_scalar(u'tag:yaml.org,2002:str', value, style=style)
Dumper.add_representer(str, our_string_representer)
Dumper.add_representer(unicode, our_string_representer)
# Add a representer for nulls too. YAML accepts "~" for None, but the
# default output converts that to "null".
Dumper.add_representer(type(None), lambda dumper, value : \
dumper.represent_scalar(u'tag:yaml.org,2002:null', u"~"))
# Apply some common settings for loading/dumping YAML and cache the
# data in pickled format which is a LOT faster than YAML.
def yaml_load(path, use_cache=True):
# Loading YAML is ridiculously slow, so cache the YAML data
# in a pickled file which loads much faster.
# Check if the .pickle file exists and a hash stored inside it
# matches the hash of the YAML file, and if so unpickle it.
import cPickle as pickle, os.path, hashlib
h = hashlib.sha1(open(path).read()).hexdigest()
if use_cache and os.path.exists(path + ".pickle"):
store = pickle.load(open(path + ".pickle"))
if store["hash"] == h:
return store["data"]
# No cached pickled data exists, so load the YAML file.
data = yaml.load(open(path), Loader=Loader)
# Store in a pickled file for fast access later.
pickle.dump({ "hash": h, "data": data }, open(path+".pickle", "w"))
return data
def yaml_dump(data, path):
yaml.dump(data, open(path, "w"), default_flow_style=False, allow_unicode=True, Dumper=Dumper)
# Store in a pickled file for fast access later.
import cPickle as pickle, hashlib
h = hashlib.sha1(open(path).read()).hexdigest()
pickle.dump({ "hash": h, "data": data }, open(path+".pickle", "w"))
def pprint(data):
yaml.dump(data, sys.stdout, default_flow_style=False, allow_unicode=True)