/
grabber.py
166 lines (140 loc) · 4.95 KB
/
grabber.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import re, sys, subprocess, os
import posixpath, shutil, requests
if len(sys.argv) < 2:
print "no host specified!"
sys.exit(1)
url = sys.argv[1]
proto, host = re.match(r'(\w+://)?([\w\.]+)', url).groups()
if not proto:
proto = "http"
url = "http://" + url
##
## Stage One - Grab the base site
##
subprocess.check_call(["wget", "-mk", url])
show_dir = os.path.join(host, "neatline", "show")
for name in os.listdir(show_dir):
path = posixpath.join(url, 'neatline', 'fullscreen', name)
subprocess.check_call(["wget", "-mk", path])
shutil.rmtree(show_dir)
# move fullscreen to show
os.rename(os.path.join(host, "neatline", "fullscreen"), show_dir)
##
## Stage Two - Make the site work
##
url_cache = set()
def wget_resource(url):
if url not in url_cache:
url_cache.add(url)
subprocess.check_call(["wget", "-x", url])
def to_relative(path, root):
relative = "\/".join([".."] * root.count("/"))
if not relative:
relative = "."
return relative + path
js_escape = url.replace("/","\/")
escaped_url = re.compile(re.escape(js_escape)+r"((?:\\\/|[\w\-\.#])+)")
def replace_urls(page, root):
def replace(match):
relative_url = to_relative(match.group(1), root)
absolute = url + match.group(1).replace('\\','')
# print replace_text, absolute
# raw_input("??")
if "#" not in absolute:
wget_resource(absolute)
return relative_url
page, count = re.subn(escaped_url, replace, page)
return (page, count != 0)
image = re.compile(r'"openlayers_theme":"((?:\\\/|[\w\-])+)"')
small_images = ["layer-switcher-maximize.png",
"layer-switcher-minimize.png",
"blank.gif",
"slider.png",
"zoombar.png",
"marker.png",
"blank.gif",
"marker.png",
"cloud-popup-relative.png",
"layer-switcher-maximize.png",
"layer-switcher-minimize.png",
"north-mini.png",
"west-mini.png",
"east-mini.png",
"south-mini.png",
"zoom-plus-mini.png",
"zoom-world-mini.png",
"zoom-minus-mini.png",
"north-mini.png",
"west-mini.png",
"zoom-world-mini.png",
"east-mini.png",
"south-mini.png",
"zoom-plus-mini.png",
"zoom-minus-mini.png",
"zoom-plus-mini.png",
"zoom-minus-mini.png",
"zoom-world-mini.png"]
def get_small_images(page, root):
def replace(match):
relative_url = to_relative(match.group(1), root)
absolute = url + match.group(1).replace('\\','')
# print relative_url, absolute
# raw_input("??")
for i in small_images:
wget_resource(posixpath.join(absolute, i))
return '"openlayers_theme":"{}"'.format(relative_url)
page, count = re.subn(image, replace, page)
return (page, count != 0)
records_api = re.compile(r'"record_api":"((?:\\\/|[\w\-])+)"')
exhibit_id = re.compile(r'"id":(\d+)')
neatline_func = """
Neatline.Shared.Record.Collection.prototype.fetch = function(params) {
var out = this.set(this.parse(neatline_static_record_data));
if (out) {
params.success && params.success(this, neatline_static_record_data);
this.trigger("sync", this, neatline_static_record_data);
}
}
Neatline.Shared.Record.Collection.prototype.model.prototype.fetch = function(params) {
var model = _.find(neatline_static_record_data.records, {id: +this.id})
var out = this.set(this.parse(model));
if (out) {
params.success && params.success(this, model);
this.trigger("sync", this, model);
}
}"""
href_re = re.compile(r'(?<=href=\\")(\\\/(?:[-\w#]|\\\/)+)')
def get_records(page, root):
records_url = re.search(records_api, page)
eid = re.search(exhibit_id, page)
if records_url and eid:
absolute = url + records_url.group(1).replace('\\','')
param_data = {"exhibit_id": eid.group(1)}
print "making request", absolute, param_data
data = requests.get(absolute, params=param_data).text
data, _ = replace_urls(data, root)
data = re.sub(r"\\\/fullscreen\\\/", r"\/show\/", data)
def repl(m):
return to_relative(m.group(1), root)
data = re.sub(href_re, repl, data)
template = "<script>\n var neatline_static_record_data = {}; {}\n</script>"
data = template.format(data, neatline_func)
page = re.sub("</head>", lambda m: data + "\n</head>", page, count = 1)
return page, True
return page, False
login_page = re.compile('<h1>Log In</h1>')
for root, dirs, files in os.walk(host):
for i in files:
with open(os.path.join(root, i), 'r+') as file:
page = file.read()
if re.search(login_page, page):
file.close()
os.remove(os.path.join(root, i))
continue
page, modified0 = replace_urls(page, root)
page, modified1 = get_small_images(page, root)
page, modified2 = get_records(page, root)
if modified0 or modified1 or modified2:
file.seek(0)
file.write(page)
file.truncate()