This repository has been archived by the owner on Dec 18, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 47
/
XMLParser.py
118 lines (102 loc) · 3.99 KB
/
XMLParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from os.path import dirname, join
from collections import OrderedDict
from xml.parsers import expat
import xml.etree.ElementTree as etree
_catalog = join(dirname(__file__), "catalog")
def _wrap_error(e):
err = etree.ParseError(e)
err.code = e.code
err.position = e.lineno, e.offset
raise err
_names = {}
def _fixname(key):
try:
name = _names[key]
except KeyError:
name = key
if "}" in name:
name = "{" + name
_names[key] = name
return name
class XMLParser(object):
def __init__(self, encoding=None):
self._parser = expat.ParserCreate(encoding, "}")
self._target = etree.TreeBuilder()
# parser settings
self._parser.buffer_text = 1
self._parser.ordered_attributes = 1
self._parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)
# parser callbacks
self._parser.XmlDeclHandler = self._xml_decl
self._parser.StartElementHandler = self._start
self._parser.EndElementHandler = self._end
self._parser.CharacterDataHandler = self._data
self._parser.ExternalEntityRefHandler = self._external
self._parser.SkippedEntityHandler = self._skipped
# used for our horrible re-encoding hack
self._fed_data = []
self._read_encoding = None
def _xml_decl(self, version, encoding, standalone):
self._read_encoding = encoding
def _start(self, tag, attrib_in):
self._fed_data = None
tag = _fixname(tag)
attrib = OrderedDict()
if attrib_in:
for i in range(0, len(attrib_in), 2):
attrib[_fixname(attrib_in[i])] = attrib_in[i+1]
return self._target.start(tag, attrib)
def _data(self, text):
return self._target.data(text)
def _end(self, tag):
return self._target.end(_fixname(tag))
def _external(self, context, base, systemId, publicId):
if publicId in {
"-//W3C//DTD XHTML 1.0 Transitional//EN",
"-//W3C//DTD XHTML 1.1//EN",
"-//W3C//DTD XHTML 1.0 Strict//EN",
"-//W3C//DTD XHTML 1.0 Frameset//EN",
"-//W3C//DTD XHTML Basic 1.0//EN",
"-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN",
"-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN",
"-//W3C//DTD MathML 2.0//EN",
"-//WAPFORUM//DTD XHTML Mobile 1.0//EN"
}:
parser = self._parser.ExternalEntityParserCreate(context)
with open(join(_catalog, "xhtml.dtd"), "rb") as fp:
try:
parser.ParseFile(fp)
except expat.error:
return False
return True
def _skipped(self, name, is_parameter_entity):
err = expat.error("undefined entity %s: line %d, column %d" %
(name, self._parser.ErrorLineNumber,
self._parser.ErrorColumnNumber))
err.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY
err.lineno = self._parser.ErrorLineNumber
err.offset = self._parser.ErrorColumnNumber
raise err
def feed(self, data):
if self._fed_data is not None:
self._fed_data.append(data)
try:
self._parser.Parse(data, False)
except expat.error as v:
_wrap_error(v)
except ValueError as e:
if e.args[0] == 'multi-byte encodings are not supported':
assert self._read_encoding is not None
xml = b"".join(self._fed_data).decode(self._read_encoding).encode("utf-8")
new_parser = XMLParser("utf-8")
self._parser = new_parser._parser
self._target = new_parser._target
self._fed_data = None
self.feed(xml)
def close(self):
try:
self._parser.Parse("", True)
except expat.error as v:
_wrap_error(v)
tree = self._target.close()
return tree