From bb70a0347f563154528a62cf4b8d0b80a2b29ce0 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon
Date: Wed, 24 Aug 2016 03:44:14 +0100
Subject: [PATCH] Add a really ugly hack to support more encodings for XML

---
 manifest/XMLParser.py            | 31 ++++++++++++++++++++++++++++---
 manifest/tests/test_XMLParser.py |  8 ++++++++
 2 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/manifest/XMLParser.py b/manifest/XMLParser.py
index 45dcb0d..c2aecc8 100644
--- a/manifest/XMLParser.py
+++ b/manifest/XMLParser.py
@@ -26,21 +26,29 @@ def _fixname(key):
 
 
 class XMLParser(object):
-    def __init__(self):
-        self._parser = expat.ParserCreate(None, "}")
+    def __init__(self, encoding=None):
+        self._parser = expat.ParserCreate(encoding, "}")
         self._target = etree.TreeBuilder()
         # parser settings
         self._parser.buffer_text = 1
         self._parser.ordered_attributes = 1
         self._parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)
         # parser callbacks
+        self._parser.XmlDeclHandler = self._xml_decl
         self._parser.StartElementHandler = self._start
         self._parser.EndElementHandler = self._end
         self._parser.CharacterDataHandler = self._data
         self._parser.ExternalEntityRefHandler = self._external
         self._parser.SkippedEntityHandler = self._skipped
+        # used for our horrible re-encoding hack
+        self._fed_data = []
+        self._read_encoding = None
+
+    def _xml_decl(self, version, encoding, standalone):
+        self._read_encoding = encoding
 
     def _start(self, tag, attrib_in):
+        self._fed_data = None
         tag = _fixname(tag)
         attrib = OrderedDict()
         if attrib_in:
@@ -84,12 +92,29 @@ def _skipped(self, name, is_parameter_entity):
         err.offset = self._parser.ErrorColumnNumber
         raise err
 
-
     def feed(self, data):
+        # Buffer raw input until the first start tag (see _start) so it can be
+        # replayed if expat rejects the declared encoding.
+        if self._fed_data is not None:
+            self._fed_data.append(data)
         try:
             self._parser.Parse(data, False)
         except expat.error as v:
             _wrap_error(v)
+        except ValueError as e:
+            if not e.args or e.args[0] != 'multi-byte encodings are not supported':
+                # Not the failure this hack handles; don't swallow it.
+                raise
+            # expat refused the declared multi-byte encoding, so transcode
+            # everything fed so far to UTF-8 and replay it through a fresh
+            # parser created with an explicit UTF-8 encoding.
+            assert self._read_encoding is not None
+            xml = b"".join(self._fed_data).decode(self._read_encoding).encode("utf-8")
+            new_parser = XMLParser("utf-8")
+            self._parser = new_parser._parser
+            self._target = new_parser._target
+            self._fed_data = None
+            self.feed(xml)
 
     def close(self):
         try:
diff --git a/manifest/tests/test_XMLParser.py b/manifest/tests/test_XMLParser.py
index 9e02f88..fc130b7 100644
--- a/manifest/tests/test_XMLParser.py
+++ b/manifest/tests/test_XMLParser.py
@@ -44,3 +44,11 @@ def test_comment():
     d = p.close()
     assert d.tag == u"foo"
     assert len(d) == 0
+
+
+def test_unsupported_encoding():
+    p = XMLParser()
+    p.feed(u"<?xml version='1.0' encoding='shift-jis'?><foo>\u3044</foo>".encode("shift-jis"))
+    d = p.close()
+    assert d.tag == u"foo"
+    assert d.text == u"\u3044"