# Html5lib tests for XML injection issue

| kind                                   | sax      | etree        | minidom   | pulldom  | xmlrpc   | lxml           | genshi    |
| -------------------------------------- | -------- | ------------ | --------- | -------- | -------- | -------------- | --------- |
| billion laughs                         | **True** | **True**     | **True**  | **True** | **True** | False (1)      | False (5) |
| quadratic blowup                       | **True** | **True**     | **True**  | **True** | **True** | **True**       | False (5) |
| external entity expansion (remote)     | **True** | False (3)    | False (4) | **True** | False    | False (1)      | False (5) |
| external entity expansion (local file) | **True** | False (3)    | False (4) | **True** | False    | **True**       | False (5) |
| DTD retrieval                          | **True** | False        | False     | **True** | False    | False (1)      | False     |
| gzip bomb                              | False    | False        | False     | False    | **True** | **partly** (2) | False     |
| xpath support (7)                      | False    | False        | False     | False    | False    | **True**       | False     |
| xsl(t) support (7)                     | False    | False        | False     | False    | False    | **True**       | False     |
| xinclude support (7)                   | False    | **True** (6) | False     | False    | False    | **True** (6)   | **True**  |
| C library                              | expat    | expat        | expat     | expat    | expat    | libxml2        | expat     |

In [4]:
# html5lib load example
import html5lib

dname = "mydocument.html"

# Parse the file with html5lib.parse, no treebuilder argument given.
with open(dname, "rb") as f:
    document = html5lib.parse(f)

# Parse the same file again, this time requesting the "lxml" treebuilder.
with open(dname, "rb") as f:
    lxml_etree_document = html5lib.parse(f, treebuilder="lxml")

# Parse through an explicit HTMLParser instance created with strict=True.
with open(dname, "rb") as f:
    parser = html5lib.HTMLParser(strict=True)
    document = parser.parse(f)

FileNotFoundError: [Errno 2] No such file or directory: 'mydocument.html'

In [45]:
# html5lib load example
import html5lib

# Replace here the file to test
dname = "xmltestdata/xmlbomb2.xml"

from xml.etree import ElementTree

def indent(elem, level=0):
    """Pretty-print an element tree in place by rewriting its whitespace.

    Missing or whitespace-only ``text`` and ``tail`` values are replaced by
    a newline followed by two spaces per nesting level, so that children sit
    one level deeper than their parent and every closing tag lines up with
    its opening tag. ``text`` and ``tail`` values that contain
    non-whitespace characters are left untouched.

    Args:
        elem: Element to indent. It must support ``len()``, iteration over
            its children and the ``text``/``tail`` attributes.
        level: Nesting depth of ``elem``; top-level callers leave it at 0.

    Returns:
        The same ``elem`` object, modified in place.
    """
    pad = "\n" + level * "  "

    def is_blank(value):
        # None, "" and whitespace-only strings may all be overwritten.
        return not value or not value.strip()

    if len(elem):
        if is_blank(elem.text):
            # The first child starts one level deeper than elem itself.
            elem.text = pad + "  "
        if is_blank(elem.tail):
            elem.tail = pad
        child = None
        for child in elem:
            indent(child, level + 1)
        # Every child left its tail at its own depth so the next sibling
        # lines up with it; only the last child has to step back one level
        # so that elem's closing tag aligns with elem's opening tag.
        if is_blank(child.tail):
            child.tail = pad
    elif level and is_blank(elem.tail):
        # Leaf element below the root: whatever follows it sits at its depth
        # (the parent corrects this afterwards if the leaf is the last child).
        elem.tail = pad
    return elem

print("Original:")

# Show the raw bytes of the test file and how many there are.
with open(dname, "rb") as f:
    body = f.read()
print(body)
print(len(body))

print("Basic Test: ")

# Parse with html5lib.parse (no treebuilder argument), then indent the
# resulting tree and dump it.
with open(dname, "rb") as f:
    document = html5lib.parse(f)
indent(document)
ElementTree.dump(document)

print("LXML Test: ")

from lxml import etree

# Parse with the "lxml" treebuilder; serialise once and report both the
# serialised bytes and their length.
with open(dname, "rb") as f:
    lxml_etree_document = html5lib.parse(f, treebuilder="lxml")
serialized = etree.tostring(lxml_etree_document, pretty_print=True, xml_declaration=True)
print(serialized)
print(len(serialized))

Original:
b'<!DOCTYPE xmlbomb [\n<!ENTITY a "1234567890">\n]>\n<root>text<bomb>&a;</bomb><tag/></root>\n'
88
Basic Test: 
<html:html xmlns:html="http://www.w3.org/1999/xhtml">
  <html:head />
<html:body>]&gt;
<html:root>text<html:bomb>&amp;a;</html:bomb>
    <html:tag />
    </html:root>
  </html:body>
</html:html>
LXML Test: 
b'<?xml version=\'1.0\' encoding=\'ASCII\'?>\n<html:html xmlns:html="http://www.w3.org/1999/xhtml">\n  <html:head/>\n  <html:body>]&gt;\n<html:root>text<html:bomb>&amp;a;</html:bomb><html:tag/></html:root>\n</html:body>\n</html:html>\n'
222


In [66]:
# Test the serialize method too
# html5lib load example
import html5lib

# Path of the file under test; change it to try another sample.
dname = "xmltestdata/xmlbomb2.xml"

from xml.etree import ElementTree

print("Original:")

# Show the raw bytes of the test file and how many there are.
with open(dname, "rb") as f:
    body = f.read()
print(body)
print(len(body))

print("Basic Test: ")

# Parse from a text-mode handle, then print the html5lib serialisation
# requested with resolve_entities=True.
with open(dname, "r") as f:
    document = html5lib.parse(f)
print(html5lib.serialize(document, resolve_entities=True))

print("LXML Test: ")

from lxml import etree

# Same round trip, using the "lxml" treebuilder and the "lxml" tree walker.
with open(dname, "r") as f:
    lxml_etree_document = html5lib.parse(f, treebuilder="lxml")
print(html5lib.serialize(lxml_etree_document, tree="lxml", resolve_entities=True))

Original:
b'<!DOCTYPE xmlbomb [\n<!ENTITY a "1234567890">\n]>\n<root>text<bomb>&a;</bomb><tag/></root>\n'
88
Basic Test: 
]&gt;
<root>text<bomb>&amp;a;</bomb><tag></tag></root>

LXML Test: 
<!DOCTYPE html>]&gt;
<root>text<bomb>&amp;a;</bomb><tag></tag></root>



In [17]:
# Repeat the test with BeautifulSoup
# (once with the 'html.parser' backend, once with the 'lxml' backend)
from bs4 import BeautifulSoup

# Path of the file under test; change it to try another sample.
dname = "xmltestdata/xhtml_bomb.xml"

from xml.etree import ElementTree

print("Original:")

# Show the file content as text, followed by its length.
with open(dname, "r") as f:
    body = f.read()
print(body)
print(len(body))

print("\nhtml.parser Test: ")

# Build the soup with the 'html.parser' backend and print it back.
with open(dname, "r") as f:
    document = BeautifulSoup(f, 'html.parser')
print(document)

print("LXML Test: ")

# Build the soup again with the 'lxml' backend and print it back.
with open(dname, "r") as f:
    document = BeautifulSoup(f, 'lxml')
print(document)

Original:
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://pompel.me:9000/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
    <head/>
    <body>text</body>
</html>
215

html.parser Test: 
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://pompel.me:9000/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html>
<head></head>
<body>text</body>
</html>
LXML Test: 
<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://pompel.me:9000/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head></head>
<body>text</body>
</html>
