Skip to content

Commit 6725dad

Browse files
serhiy-storchaka, ezio-melotti, waylan
authored and committed
[3.12] pythongh-135661: Fix parsing start and end tags in HTMLParser according to the HTML5 standard (pythonGH-135930) (pythonGH-136268)
* Whitespaces no longer accepted between `</` and the tag name.
  E.g. `</ script>` does not end the script section.
* Vertical tabulation (`\v`) and non-ASCII whitespaces no longer recognized as whitespaces.
  The only whitespaces are `\t\n\r\f `.
* Null character (U+0000) no longer ends the tag name.
* Attributes and slashes after the tag name in end tags are now ignored, instead of terminating after the first `>` in quoted attribute value.
  E.g. `</script/foo=">"/>`.
* Multiple slashes and whitespaces between the last attribute and closing `>` are now ignored in both start and end tags.
  E.g. `<a foo=bar/ //>`.
* Multiple `=` between attribute name and value are no longer collapsed.
  E.g. `<a foo==bar>` produces attribute "foo" with value "=bar".
* Whitespaces between the `=` separator and attribute name or value are no longer ignored.
  E.g. `<a foo =bar>` produces two attributes "foo" and "=bar", both with value None; `<a foo= bar>` produces two attributes: "foo" with value "" and "bar" with value None.
* Fix data loss after unclosed script or style tag (pythongh-86155).

Also backport test.support.subTests() (pythongh-135120).

---------

(cherry picked from commit 0243f97)
(cherry picked from commit c555f88)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>
Co-authored-by: Waylan Limberg <waylan.limberg@icloud.com>
1 parent 8d1b3df commit 6725dad

File tree

5 files changed

+222
-120
lines changed

5 files changed

+222
-120
lines changed

Lib/html/parser.py

Lines changed: 70 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,43 @@
2929
piclose = re.compile('>')
3030
commentclose = re.compile(r'--\s*>')
3131
# Note:
32-
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
33-
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
32+
# 1) if you change tagfind/attrfind remember to update locatetagend too;
33+
# 2) if you change tagfind/attrfind and/or locatetagend the parser will
3434
# explode, so don't do it.
35-
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
36-
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
37-
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
38-
attrfind_tolerant = re.compile(
39-
r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
40-
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
35+
# see the HTML5 specs section "13.2.5.6 Tag open state",
36+
# "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
37+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
38+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
39+
# https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
40+
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*')
41+
attrfind_tolerant = re.compile(r"""
42+
(
43+
(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
44+
)
45+
(= # value indicator
46+
('[^']*' # LITA-enclosed value
47+
|"[^"]*" # LIT-enclosed value
48+
|(?!['"])[^>\t\n\r\f ]* # bare value
49+
)
50+
)?
51+
(?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space
52+
""", re.VERBOSE)
53+
locatetagend = re.compile(r"""
54+
[a-zA-Z][^\t\n\r\f />]* # tag name
55+
[\t\n\r\f /]* # optional whitespace before attribute name
56+
(?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
57+
(?:= # value indicator
58+
(?:'[^']*' # LITA-enclosed value
59+
|"[^"]*" # LIT-enclosed value
60+
|(?!['"])[^>\t\n\r\f ]* # bare value
61+
)
62+
)?
63+
[\t\n\r\f /]* # possibly followed by a space
64+
)*
65+
>?
66+
""", re.VERBOSE)
67+
# The following variables are not used, but are temporarily left for
68+
# backward compatibility.
4169
locatestarttagend_tolerant = re.compile(r"""
4270
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
4371
(?:[\s/]* # optional whitespace before attribute name
@@ -54,8 +82,6 @@
5482
\s* # trailing whitespace
5583
""", re.VERBOSE)
5684
endendtag = re.compile('>')
57-
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
58-
# </ and the tag name, so maybe this should be fixed
5985
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
6086

6187

@@ -122,7 +148,8 @@ def get_starttag_text(self):
122148

123149
def set_cdata_mode(self, elem):
124150
self.cdata_elem = elem.lower()
125-
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
151+
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
152+
re.IGNORECASE|re.ASCII)
126153

127154
def clear_cdata_mode(self):
128155
self.interesting = interesting_normal
@@ -147,7 +174,7 @@ def goahead(self, end):
147174
# & near the end and see if it's followed by a space or ;.
148175
amppos = rawdata.rfind('&', max(i, n-34))
149176
if (amppos >= 0 and
150-
not re.compile(r'[\s;]').search(rawdata, amppos)):
177+
not re.compile(r'[\t\n\r\f ;]').search(rawdata, amppos)):
151178
break # wait till we get all the text
152179
j = n
153180
else:
@@ -260,7 +287,7 @@ def goahead(self, end):
260287
else:
261288
assert 0, "interesting.search() lied"
262289
# end while
263-
if end and i < n and not self.cdata_elem:
290+
if end and i < n:
264291
if self.convert_charrefs and not self.cdata_elem:
265292
self.handle_data(unescape(rawdata[i:n]))
266293
else:
@@ -291,7 +318,7 @@ def parse_html_declaration(self, i):
291318
return self.parse_bogus_comment(i)
292319

293320
# Internal -- parse bogus comment, return length or -1 if not terminated
294-
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
321+
# see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
295322
def parse_bogus_comment(self, i, report=1):
296323
rawdata = self.rawdata
297324
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
@@ -317,6 +344,8 @@ def parse_pi(self, i):
317344

318345
# Internal -- handle starttag, return end or -1 if not terminated
319346
def parse_starttag(self, i):
347+
# See the HTML5 specs section "13.2.5.8 Tag name state"
348+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
320349
self.__starttag_text = None
321350
endpos = self.check_for_whole_start_tag(i)
322351
if endpos < 0:
@@ -369,76 +398,42 @@ def parse_starttag(self, i):
369398
# or -1 if incomplete.
370399
def check_for_whole_start_tag(self, i):
371400
rawdata = self.rawdata
372-
m = locatestarttagend_tolerant.match(rawdata, i)
373-
if m:
374-
j = m.end()
375-
next = rawdata[j:j+1]
376-
if next == ">":
377-
return j + 1
378-
if next == "/":
379-
if rawdata.startswith("/>", j):
380-
return j + 2
381-
if rawdata.startswith("/", j):
382-
# buffer boundary
383-
return -1
384-
# else bogus input
385-
if j > i:
386-
return j
387-
else:
388-
return i + 1
389-
if next == "":
390-
# end of input
391-
return -1
392-
if next in ("abcdefghijklmnopqrstuvwxyz=/"
393-
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
394-
# end of input in or before attribute value, or we have the
395-
# '/' from a '/>' ending
396-
return -1
397-
if j > i:
398-
return j
399-
else:
400-
return i + 1
401-
raise AssertionError("we should not get here!")
401+
match = locatetagend.match(rawdata, i+1)
402+
assert match
403+
j = match.end()
404+
if rawdata[j-1] != ">":
405+
return -1
406+
return j
402407

403408
# Internal -- parse endtag, return end or -1 if incomplete
404409
def parse_endtag(self, i):
410+
# See the HTML5 specs section "13.2.5.7 End tag open state"
411+
# https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
405412
rawdata = self.rawdata
406413
assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
407-
match = endendtag.search(rawdata, i+1) # >
408-
if not match:
414+
if rawdata.find('>', i+2) < 0: # fast check
409415
return -1
410-
gtpos = match.end()
411-
match = endtagfind.match(rawdata, i) # </ + tag + >
412-
if not match:
413-
if self.cdata_elem is not None:
414-
self.handle_data(rawdata[i:gtpos])
415-
return gtpos
416-
# find the name: w3.org/TR/html5/tokenization.html#tag-name-state
417-
namematch = tagfind_tolerant.match(rawdata, i+2)
418-
if not namematch:
419-
# w3.org/TR/html5/tokenization.html#end-tag-open-state
420-
if rawdata[i:i+3] == '</>':
421-
return i+3
422-
else:
423-
return self.parse_bogus_comment(i)
424-
tagname = namematch.group(1).lower()
425-
# consume and ignore other stuff between the name and the >
426-
# Note: this is not 100% correct, since we might have things like
427-
# </tag attr=">">, but looking for > after the name should cover
428-
# most of the cases and is much simpler
429-
gtpos = rawdata.find('>', namematch.end())
430-
self.handle_endtag(tagname)
431-
return gtpos+1
416+
if not endtagopen.match(rawdata, i): # </ + letter
417+
if rawdata[i+2:i+3] == '>': # </> is ignored
418+
# "missing-end-tag-name" parser error
419+
return i+3
420+
else:
421+
return self.parse_bogus_comment(i)
432422

433-
elem = match.group(1).lower() # script or style
434-
if self.cdata_elem is not None:
435-
if elem != self.cdata_elem:
436-
self.handle_data(rawdata[i:gtpos])
437-
return gtpos
423+
match = locatetagend.match(rawdata, i+2)
424+
assert match
425+
j = match.end()
426+
if rawdata[j-1] != ">":
427+
return -1
438428

439-
self.handle_endtag(elem)
429+
# find the name: "13.2.5.8 Tag name state"
430+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
431+
match = tagfind_tolerant.match(rawdata, i+2)
432+
assert match
433+
tag = match.group(1).lower()
434+
self.handle_endtag(tag)
440435
self.clear_cdata_mode()
441-
return gtpos
436+
return j
442437

443438
# Overridable -- finish processing of start+end tag: <tag.../>
444439
def handle_startendtag(self, tag, attrs):

Lib/test/support/__init__.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1551,6 +1551,31 @@ def check_sizeof(test, o, size):
15511551
% (type(o), result, size)
15521552
test.assertEqual(result, size, msg)
15531553

1554+
def subTests(arg_names, arg_values, /, *, _do_cleanups=False):
1555+
"""Run multiple subtests with different parameters.
1556+
"""
1557+
single_param = False
1558+
if isinstance(arg_names, str):
1559+
arg_names = arg_names.replace(',',' ').split()
1560+
if len(arg_names) == 1:
1561+
single_param = True
1562+
arg_values = tuple(arg_values)
1563+
def decorator(func):
1564+
if isinstance(func, type):
1565+
raise TypeError('subTests() can only decorate methods, not classes')
1566+
@functools.wraps(func)
1567+
def wrapper(self, /, *args, **kwargs):
1568+
for values in arg_values:
1569+
if single_param:
1570+
values = (values,)
1571+
subtest_kwargs = dict(zip(arg_names, values))
1572+
with self.subTest(**subtest_kwargs):
1573+
func(self, *args, **kwargs, **subtest_kwargs)
1574+
if _do_cleanups:
1575+
self.doCleanups()
1576+
return wrapper
1577+
return decorator
1578+
15541579
#=======================================================================
15551580
# Decorator for running a function in a different locale, correctly resetting
15561581
# it afterwards.

0 commit comments

Comments
 (0)