5
5
import unittest
6
6
7
7
from unittest .mock import patch
8
+ from test import support
8
9
9
10
10
11
class EventCollector (html .parser .HTMLParser ):
@@ -430,28 +431,34 @@ def test_tolerant_parsing(self):
430
431
('data' , '<' ),
431
432
('starttag' , 'bc<' , [('a' , None )]),
432
433
('endtag' , 'html' ),
433
- ('data' , '\n <img src="URL>' ),
434
- ('comment' , '/img' ),
435
- ('endtag' , 'html<' )])
434
+ ('data' , '\n ' )])
436
435
437
436
def test_starttag_junk_chars(self):
    # Each call feeds one malformed tag-like input through
    # self._run_check together with the exact event list expected for
    # it.  An empty list means no events are expected at all.

    # A '<' that is not followed by something tag-like is expected
    # back as plain data.
    self._run_check("<", [('data', '<')])
    self._run_check("<>", [('data', '<>')])
    self._run_check("< >", [('data', '< >')])
    self._run_check("< ", [('data', '< ')])
    self._run_check("</>", [])
    self._run_check("<$>", [('data', '<$>')])
    self._run_check("</$>", [('comment', '$')])

    # End tags that are cut short or start with a space.
    self._run_check("</", [('data', '</')])
    self._run_check("</a", [])
    self._run_check("</ a>", [('endtag', 'a')])
    self._run_check("</ a", [('comment', ' a')])

    # A second '<' inside a tag name is expected to be kept as part
    # of the name.
    self._run_check("<a<a>", [('starttag', 'a<a', [])])
    self._run_check("</a<a>", [('endtag', 'a<a')])

    # Input that ends before the construct is closed.  The event name
    # is 'comment' with no trailing space, matching every other
    # comment expectation in these tests.
    self._run_check("<!", [('comment', '')])
    self._run_check("<a", [])
    self._run_check("<a foo='bar'", [])
    self._run_check("<a foo='bar", [])
    self._run_check("<a foo='>'", [])
    self._run_check("<a foo='>", [])

    # '$' inside a tag name is expected to be kept as part of the name.
    self._run_check("<a$>", [('starttag', 'a$', [])])
    self._run_check("<a$b>", [('starttag', 'a$b', [])])
    self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
    self._run_check("<a$b >", [('starttag', 'a$b', [])])
    self._run_check("<a$b />", [('startendtag', 'a$b', [])])
    self._run_check("</a$b>", [('endtag', 'a$b')])
455
462
456
463
def test_slashes_in_starttag (self ):
457
464
self ._run_check ('<a foo="var"/>' , [('startendtag' , 'a' , [('foo' , 'var' )])])
@@ -576,21 +583,50 @@ def test_EOF_in_charref(self):
576
583
for html , expected in data :
577
584
self ._run_check (html , expected )
578
585
579
def test_eof_in_comments(self):
    # Inputs that open a comment with '<!--' and then run out of input
    # before a closing '-->'.  Each entry pairs the source text with
    # the single 'comment' event expected for it.
    cases = [
        ('<!--', [('comment', '')]),
        ('<!---', [('comment', '')]),
        ('<!----', [('comment', '')]),
        ('<!-----', [('comment', '-')]),
        ('<!------', [('comment', '--')]),
        ('<!----!', [('comment', '')]),
        ('<!---!', [('comment', '-!')]),
        ('<!---!>', [('comment', '-!>')]),
        ('<!--foo', [('comment', 'foo')]),
        ('<!--foo-', [('comment', 'foo')]),
        ('<!--foo--', [('comment', 'foo')]),
        ('<!--foo--!', [('comment', 'foo')]),
        ('<!--<!--', [('comment', '<!')]),
        ('<!--<!--!', [('comment', '<!')]),
    ]
    # The loop variable is called 'source' rather than 'html' so it
    # does not shadow the html module used elsewhere in this file.
    for source, expected in cases:
        self._run_check(source, expected)
605
+
606
def test_eof_in_declarations(self):
    # Inputs that start with '<!' (declaration-like or CDATA-like
    # markup) and end before a closing '>'.  Each entry pairs the
    # source text with the single event expected for it.
    cases = [
        ('<!', [('comment', '')]),
        ('<!-', [('comment', '-')]),
        ('<![', [('comment', '[')]),
        ('<![CDATA[', [('unknown decl', 'CDATA[')]),
        ('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
        ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
        ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
        ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
        ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
        ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
        ('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]),
        ('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]),
        ('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]),
        ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo',
         [('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]),
    ]
    # The loop variable is called 'source' rather than 'html' so it
    # does not shadow the html module used elsewhere in this file.
    for source, expected in cases:
        self._run_check(source, expected)
626
+
592
627
def test_bogus_comments (self ):
593
- html = ('<! not really a comment >'
628
+ html = ('<!ELEMENT br EMPTY>'
629
+ '<! not really a comment >'
594
630
'<! not a comment either -->'
595
631
'<! -- close enough -->'
596
632
'<!><!<-- this was an empty comment>'
@@ -604,6 +640,7 @@ def test_bogus_comments(self):
604
640
'<![CDATA]]>' # required '[' after CDATA
605
641
)
606
642
expected = [
643
+ ('comment' , 'ELEMENT br EMPTY' ),
607
644
('comment' , ' not really a comment ' ),
608
645
('comment' , ' not a comment either --' ),
609
646
('comment' , ' -- close enough --' ),
@@ -684,6 +721,26 @@ def test_convert_charrefs_dropped_text(self):
684
721
('endtag' , 'a' ), ('data' , ' bar & baz' )]
685
722
)
686
723
724
@support.requires_resource('cpu')
def test_eof_no_quadratic_complexity(self):
    # Each of these inputs used to take about an hour to parse.
    # Now they take a fraction of a second.
    n = 120_000
    # (fragment, factor): the fragment is repeated factor * n times,
    # then fed to a fresh parser, which is closed afterwards.
    workloads = (
        ("<a ", 1),
        ("<a a=", 1),
        ("</a ", 14),
        ("</a a=", 11),
        ("<!--", 4),
        ("<!", 60),
        ("<?", 19),
        ("</$", 15),
        ("<![CDATA[", 9),
        ("<!doctype", 35),
    )
    for fragment, factor in workloads:
        parser = html.parser.HTMLParser()
        parser.feed(fragment * factor * n)
        parser.close()
743
+
687
744
688
745
class AttributesTestCase (TestCaseBase ):
689
746
0 commit comments