29
29
piclose = re .compile ('>' )
30
30
commentclose = re .compile (r'--\s*>' )
31
31
# Note:
32
- # 1) if you change tagfind/attrfind remember to update locatestarttagend too;
33
- # 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
32
+ # 1) if you change tagfind/attrfind remember to update locatetagend too;
33
+ # 2) if you change tagfind/attrfind and/or locatetagend the parser will
34
34
# explode, so don't do it.
35
- # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
36
- # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
37
- tagfind_tolerant = re .compile (r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*' )
38
- attrfind_tolerant = re .compile (
39
- r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
40
- r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*' )
35
+ # see the HTML5 specs section "13.2.5.6 Tag open state",
36
+ # "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
37
+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
38
+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
39
+ # https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
40
+ tagfind_tolerant = re .compile (r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*' )
41
+ attrfind_tolerant = re .compile (r"""
42
+ (
43
+ (?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
44
+ )
45
+ (= # value indicator
46
+ ('[^']*' # LITA-enclosed value
47
+ |"[^"]*" # LIT-enclosed value
48
+ |(?!['"])[^>\t\n\r\f ]* # bare value
49
+ )
50
+ )?
51
+ (?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space
52
+ """ , re .VERBOSE )
53
+ locatetagend = re .compile (r"""
54
+ [a-zA-Z][^\t\n\r\f />]* # tag name
55
+ [\t\n\r\f /]* # optional whitespace before attribute name
56
+ (?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
57
+ (?:= # value indicator
58
+ (?:'[^']*' # LITA-enclosed value
59
+ |"[^"]*" # LIT-enclosed value
60
+ |(?!['"])[^>\t\n\r\f ]* # bare value
61
+ )
62
+ )?
63
+ [\t\n\r\f /]* # possibly followed by a space
64
+ )*
65
+ >?
66
+ """ , re .VERBOSE )
67
+ # The following variables are not used, but are temporarily left for
68
+ # backward compatibility.
41
69
locatestarttagend_tolerant = re .compile (r"""
42
70
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
43
71
(?:[\s/]* # optional whitespace before attribute name
54
82
\s* # trailing whitespace
55
83
""" , re .VERBOSE )
56
84
endendtag = re .compile ('>' )
57
- # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
58
- # </ and the tag name, so maybe this should be fixed
59
85
endtagfind = re .compile (r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>' )
60
86
61
87
@@ -122,7 +148,8 @@ def get_starttag_text(self):
122
148
123
149
def set_cdata_mode (self , elem ):
124
150
self .cdata_elem = elem .lower ()
125
- self .interesting = re .compile (r'</\s*%s\s*>' % self .cdata_elem , re .I )
151
+ self .interesting = re .compile (r'</%s(?=[\t\n\r\f />])' % self .cdata_elem ,
152
+ re .IGNORECASE | re .ASCII )
126
153
127
154
def clear_cdata_mode (self ):
128
155
self .interesting = interesting_normal
@@ -147,7 +174,7 @@ def goahead(self, end):
147
174
# & near the end and see if it's followed by a space or ;.
148
175
amppos = rawdata .rfind ('&' , max (i , n - 34 ))
149
176
if (amppos >= 0 and
150
- not re .compile (r'[\s ;]' ).search (rawdata , amppos )):
177
+ not re .compile (r'[\t\n\r\f ;]' ).search (rawdata , amppos )):
151
178
break # wait till we get all the text
152
179
j = n
153
180
else :
@@ -260,7 +287,7 @@ def goahead(self, end):
260
287
else :
261
288
assert 0 , "interesting.search() lied"
262
289
# end while
263
- if end and i < n and not self . cdata_elem :
290
+ if end and i < n :
264
291
if self .convert_charrefs and not self .cdata_elem :
265
292
self .handle_data (unescape (rawdata [i :n ]))
266
293
else :
@@ -291,7 +318,7 @@ def parse_html_declaration(self, i):
291
318
return self .parse_bogus_comment (i )
292
319
293
320
# Internal -- parse bogus comment, return length or -1 if not terminated
294
- # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
321
+ # see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
295
322
def parse_bogus_comment (self , i , report = 1 ):
296
323
rawdata = self .rawdata
297
324
assert rawdata [i :i + 2 ] in ('<!' , '</' ), ('unexpected call to '
@@ -317,6 +344,8 @@ def parse_pi(self, i):
317
344
318
345
# Internal -- handle starttag, return end or -1 if not terminated
319
346
def parse_starttag (self , i ):
347
+ # See the HTML5 specs section "13.2.5.8 Tag name state"
348
+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
320
349
self .__starttag_text = None
321
350
endpos = self .check_for_whole_start_tag (i )
322
351
if endpos < 0 :
@@ -369,76 +398,42 @@ def parse_starttag(self, i):
369
398
# or -1 if incomplete.
370
399
def check_for_whole_start_tag (self , i ):
371
400
rawdata = self .rawdata
372
- m = locatestarttagend_tolerant .match (rawdata , i )
373
- if m :
374
- j = m .end ()
375
- next = rawdata [j :j + 1 ]
376
- if next == ">" :
377
- return j + 1
378
- if next == "/" :
379
- if rawdata .startswith ("/>" , j ):
380
- return j + 2
381
- if rawdata .startswith ("/" , j ):
382
- # buffer boundary
383
- return - 1
384
- # else bogus input
385
- if j > i :
386
- return j
387
- else :
388
- return i + 1
389
- if next == "" :
390
- # end of input
391
- return - 1
392
- if next in ("abcdefghijklmnopqrstuvwxyz=/"
393
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ" ):
394
- # end of input in or before attribute value, or we have the
395
- # '/' from a '/>' ending
396
- return - 1
397
- if j > i :
398
- return j
399
- else :
400
- return i + 1
401
- raise AssertionError ("we should not get here!" )
401
+ match = locatetagend .match (rawdata , i + 1 )
402
+ assert match
403
+ j = match .end ()
404
+ if rawdata [j - 1 ] != ">" :
405
+ return - 1
406
+ return j
402
407
403
408
# Internal -- parse endtag, return end or -1 if incomplete
404
409
def parse_endtag (self , i ):
410
+ # See the HTML5 specs section "13.2.5.7 End tag open state"
411
+ # https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
405
412
rawdata = self .rawdata
406
413
assert rawdata [i :i + 2 ] == "</" , "unexpected call to parse_endtag"
407
- match = endendtag .search (rawdata , i + 1 ) # >
408
- if not match :
414
+ if rawdata .find ('>' , i + 2 ) < 0 : # fast check
409
415
return - 1
410
- gtpos = match .end ()
411
- match = endtagfind .match (rawdata , i ) # </ + tag + >
412
- if not match :
413
- if self .cdata_elem is not None :
414
- self .handle_data (rawdata [i :gtpos ])
415
- return gtpos
416
- # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
417
- namematch = tagfind_tolerant .match (rawdata , i + 2 )
418
- if not namematch :
419
- # w3.org/TR/html5/tokenization.html#end-tag-open-state
420
- if rawdata [i :i + 3 ] == '</>' :
421
- return i + 3
422
- else :
423
- return self .parse_bogus_comment (i )
424
- tagname = namematch .group (1 ).lower ()
425
- # consume and ignore other stuff between the name and the >
426
- # Note: this is not 100% correct, since we might have things like
427
- # </tag attr=">">, but looking for > after the name should cover
428
- # most of the cases and is much simpler
429
- gtpos = rawdata .find ('>' , namematch .end ())
430
- self .handle_endtag (tagname )
431
- return gtpos + 1
416
+ if not endtagopen .match (rawdata , i ): # </ + letter
417
+ if rawdata [i + 2 :i + 3 ] == '>' : # </> is ignored
418
+ # "missing-end-tag-name" parser error
419
+ return i + 3
420
+ else :
421
+ return self .parse_bogus_comment (i )
432
422
433
- elem = match . group ( 1 ). lower () # script or style
434
- if self . cdata_elem is not None :
435
- if elem != self . cdata_elem :
436
- self . handle_data ( rawdata [i : gtpos ])
437
- return gtpos
423
+ match = locatetagend . match ( rawdata , i + 2 )
424
+ assert match
425
+ j = match . end ()
426
+ if rawdata [j - 1 ] != ">" :
427
+ return - 1
438
428
439
- self .handle_endtag (elem )
429
+ # find the name: "13.2.5.8 Tag name state"
430
+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
431
+ match = tagfind_tolerant .match (rawdata , i + 2 )
432
+ assert match
433
+ tag = match .group (1 ).lower ()
434
+ self .handle_endtag (tag )
440
435
self .clear_cdata_mode ()
441
- return gtpos
436
+ return j
442
437
443
438
# Overridable -- finish processing of start+end tag: <tag.../>
444
439
def handle_startendtag (self , tag , attrs ):
0 commit comments