Skip to content

Commit

Permalink
Misc Fixes for RC5 (#534)
Browse files Browse the repository at this point in the history
* misc fixes (rc 5):
- banner: only auto-init the banner in the content frame, i.e. when not in framed mode and the replay url is set
- index: 'cdx+' fix for use as internal index: if cdx has a warc filename and offset, don't attempt default live web load
- improved self-redirect: avoid www2 -> www redirect altogether, not just for second redirect
- tests: update tests for improved self-redirect checking
- bump version to pywb-2.4.0-rc5
  • Loading branch information
ikreymer committed Jan 18, 2020
1 parent 93ce4f6 commit fa021ee
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 12 deletions.
4 changes: 2 additions & 2 deletions pywb/static/default_banner.js
Original file line number Diff line number Diff line change
Expand Up @@ -306,8 +306,8 @@ This file is part of pywb, https://github.com/webrecorder/pywb
// all banners will expose themselves by adding themselves as WBBanner on window
window.WBBanner = new DefaultBanner();

// if in replay frame, init immediately
if (window.wbinfo) {
// if wbinfo.url is set and not-framed, init banner in content frame
if (window.wbinfo && window.wbinfo.url && !window.wbinfo.is_framed) {
if (document.readyState === "loading") {
document.addEventListener("DOMContentLoaded", function() {
window.WBBanner.init();
Expand Down
2 changes: 1 addition & 1 deletion pywb/version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '2.4.0rc4'
__version__ = '2.4.0-rc5'

if __name__ == '__main__':
print(__version__)
14 changes: 9 additions & 5 deletions pywb/warcserver/resource/responseloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,18 +139,19 @@ def raise_on_self_redirect(self, params, cdx, status_code, location_url):
request_url = request_url.split('://', 1)[-1].rstrip('/')

self_redir = False
orig_key = params.get('sr-urlkey') or cdx['urlkey']

if request_url == location_url:
self_redir = True
elif params.get('sr-urlkey'):
# if new location canonicalized matches old key, also self-redirect
if canonicalize(location_url) == params.get('sr-urlkey'):
self_redir = True

# if new location canonicalized matches old key, also self-redirect
elif canonicalize(location_url) == orig_key:
self_redir = True

if self_redir:
msg = 'Self Redirect {0} -> {1}'
msg = msg.format(request_url, location_url)
params['sr-urlkey'] = cdx['urlkey']
params['sr-urlkey'] = orig_key
raise LiveResourceException(msg)

@staticmethod
Expand Down Expand Up @@ -267,6 +268,9 @@ def __init__(self, forward_proxy_prefix=None, adapter=None):
self.socks_proxy = None

def load_resource(self, cdx, params):
if cdx.get('filename') and cdx.get('offset') is not None:
return None

load_url = cdx.get('load_url')
if not load_url:
return None
Expand Down
4 changes: 2 additions & 2 deletions pywb/warcserver/test/test_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,8 +220,8 @@ def test_agg_select_mem_unrewrite_headers(self):
buff = BytesIO(resp.body)
record = ArcWarcRecordLoader().parse_record_stream(buff, no_record_parse=False)
print(record.http_headers)
assert record.http_headers.get_statuscode() == '302'
assert record.http_headers.get_header('Location') == 'https://www.iana.org/'
assert record.http_headers.get_statuscode() == '200'
#assert record.http_headers.get_header('Location') == 'https://www.iana.org/'

@patch('pywb.warcserver.index.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_live'))
def test_agg_select_live(self):
Expand Down
26 changes: 24 additions & 2 deletions tests/test_redirects.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class TestRedirects(CollsDirMixin, BaseConfigTest):
def setup_class(cls):
super(TestRedirects, cls).setup_class('config_test.yaml')

def create_redirect_record(self, url, redirect_url, timestamp):
def create_redirect_record(self, url, redirect_url, timestamp, status='301'):
warc_headers = {}
warc_headers['WARC-Date'] = timestamp_to_iso_date(timestamp)

Expand All @@ -26,7 +26,7 @@ def create_redirect_record(self, url, redirect_url, timestamp):
('Location', redirect_url)
]

http_headers = StatusAndHeaders('301 Permanent Redirect', headers_list, protocol='HTTP/1.0')
http_headers = StatusAndHeaders(status + ' Redirect', headers_list, protocol='HTTP/1.0')

rec = self.writer.create_warc_record(url, 'response',
payload=BytesIO(payload),
Expand Down Expand Up @@ -140,4 +140,26 @@ def test_revisit_redirect_skip_self_redir(self, fmod):
res = self.get('/redir/20190626101112{0}/http://www.example.com/', fmod, status=200)
assert res.text == 'Some Text'

def test_init_2(self):
    """Write a redirect-chain WARC and load it into a new ``redir2`` collection.

    The WARC holds, in order: a redirect from http://www.example.com/path to
    https://www.example.com/path/ (default status), a 302 redirect from there
    to https://www2.example.com/path, a response record for the www2 url, and
    a revisit record for the same www2 url that points at that response's
    WARC-Date. The test passes once the collection's ``index.cdxj`` exists.
    """
    warc_path = os.path.join(self.root_dir, 'redir2.warc.gz')

    # All records must be written while the output file is still open.
    with open(warc_path, 'wb') as out:
        self.writer = WARCWriter(out, gzip=True)

        self.create_redirect_record('http://www.example.com/path', 'https://www.example.com/path/', '20191003115920')
        self.create_redirect_record('https://www.example.com/path/', 'https://www2.example.com/path', '20191003115927', status='302')

        # Only the response record is needed afterwards: the revisit refers to its date.
        response = self.create_response_record('https://www2.example.com/path', '20191024125646', 'Some Text')
        self.create_revisit_record('https://www2.example.com/path', '20191024125648', 'https://www2.example.com/path', response.rec_headers['WARC-Date'])

    # Create the collection, then add the finished WARC to it.
    wb_manager(['init', 'redir2'])
    wb_manager(['add', 'redir2', warc_path])

    index_path = os.path.join(self.root_dir, self.COLLS_DIR, 'redir2', 'indexes', 'index.cdxj')
    assert os.path.isfile(index_path)

def test_revisit_redirect_skip_self_redir_2(self, fmod):
    """Replay two urls from the ``redir2`` collection and expect the captured text.

    Both the www2 url and the https www url are requested at timestamp
    20191024125648; each request must return status 200 with body 'Some Text'.
    """
    replay_paths = (
        '/redir2/20191024125648{0}/http://www2.example.com/path',
        '/redir2/20191024125648{0}/https://www.example.com/path',
    )

    for replay_path in replay_paths:
        res = self.get(replay_path, fmod, status=200)
        assert res.text == 'Some Text'

0 comments on commit fa021ee

Please sign in to comment.