Skip to content

Commit

Permalink
Edge-case and HTML Rewrite Fixes (#441)
Browse files Browse the repository at this point in the history
* recorder fix: ensure the Transfer-Encoding header is not passed through by RecorderApp,
as it may result in a duplicate Transfer-Encoding header in py2.7, fixes #432

* html rewriter fixes:
- html detection: allow for UTF-8 BOM when detecting if text is html
- html decl parsing: modify base parser regex to allow an IE conditional declaration to also
end with -->, e.g. support '<![endif]-->' in addition to '<![endif]>', fixes #425

* travis: add allow failure for integration tests (for now)
  • Loading branch information
ikreymer committed Feb 18, 2019
1 parent 100c7f5 commit 38c1b1c
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 2 deletions.
3 changes: 3 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ after_success:
- codecov

matrix:
allow_failures:
- env: WR_TEST=yes

exclude:
- env: WR_TEST=yes
python: "2.7"
Expand Down
2 changes: 1 addition & 1 deletion pywb/rewrite/content_rewriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ def rewrite_text_stream_to_gen(self, stream, rwinfo):

# ============================================================================
class RewriteInfo(object):
TAG_REGEX = re.compile(b'^\s*\<')
TAG_REGEX = re.compile(b'^(\xef\xbb\xbf)?\s*\<')
TAG_REGEX2 = re.compile(b'^.*<\w+[\s>]')
JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML

Expand Down
12 changes: 11 additions & 1 deletion pywb/rewrite/html_rewriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

from pywb.rewrite.content_rewriter import StreamingRewriter

from six import text_type

import six.moves.html_parser

try:
Expand All @@ -21,7 +23,15 @@
except:
orig_unescape = None

from six import text_type

try:
import _markupbase as markupbase
except:
import markupbase as markupbase

# ensure that an invalid conditional ending ']-->' closing a declaration
# is treated the same as ']>'
markupbase._msmarkedsectionclose = re.compile(r']\s*-{0,2}>')


#=================================================================
Expand Down
11 changes: 11 additions & 0 deletions pywb/rewrite/test/test_html_rewriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,10 @@
>>> parse('<!DOCTYPE html>Some Text without any tags <!-- comments -->', head_insert = '<script>load_stuff();</script>')
<!DOCTYPE html>Some Text without any tags <!-- comments --><script>load_stuff();</script>
# UTF-8 BOM
>>> parse('\ufeff<!DOCTYPE html>Some Text without any tags <!-- comments -->', head_insert = '<script>load_stuff();</script>')
\ufeff<!DOCTYPE html>Some Text without any tags <!-- comments --><script>load_stuff();</script>
# no parse comments
>>> parse('<html><!-- <a href="/foo.html"> --></html>')
<html><!-- <a href="/foo.html"> --></html>
Expand Down Expand Up @@ -395,6 +399,13 @@
>>> parse('<html><a href="javascript:alert()"></a></html>', js_proxy=True)
<html><a href="javascript:alert()"></a></html>
# IE conditional
>>> parse('<!--[if !IE]><html><![endif]--><a href="http://example.com/"><!--[if IE]><![endif]--><a href="http://example.com/"></html>')
<!--[if !IE]><html><![endif]--><a href="/web/20131226101010/http://example.com/"><!--[if IE]><![endif]--><a href="/web/20131226101010/http://example.com/"></html>
# IE conditional with invalid ']-->' ending, rewritten as ']>'
>>> parse('<!--[if !IE]> --><html><![endif]--><a href="http://example.com/"><!--[if IE]><![endif]--><a href="http://example.com/"></html>')
<!--[if !IE]> --><html><![endif]><a href="/web/20131226101010/http://example.com/"><!--[if IE]><![endif]--><a href="/web/20131226101010/http://example.com/"></html>
# Test blank
Expand Down

0 comments on commit 38c1b1c

Please sign in to comment.