Skip to content
This repository has been archived by the owner on Dec 18, 2018. It is now read-only.

Commit

Permalink
Add a really ugly hack to support more encodings for XML
Browse files Browse the repository at this point in the history
  • Loading branch information
gsnedders committed Aug 31, 2016
1 parent 1b22054 commit bb70a03
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 3 deletions.
24 changes: 21 additions & 3 deletions manifest/XMLParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,29 @@ def _fixname(key):


class XMLParser(object):
def __init__(self):
self._parser = expat.ParserCreate(None, "}")
def __init__(self, encoding=None):
    """Set up the expat parser, its callbacks and the tree builder.

    :param encoding: forwarded to ``expat.ParserCreate`` as its encoding
        argument; ``None`` (the default) passes no explicit encoding.
    """
    # "}" is handed to expat as the namespace separator.
    parser = expat.ParserCreate(encoding, "}")

    # parser settings
    parser.buffer_text = 1
    parser.ordered_attributes = 1
    parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)

    # parser callbacks, installed in the same order as before
    callbacks = (
        ("XmlDeclHandler", self._xml_decl),
        ("StartElementHandler", self._start),
        ("EndElementHandler", self._end),
        ("CharacterDataHandler", self._data),
        ("ExternalEntityRefHandler", self._external),
        ("SkippedEntityHandler", self._skipped),
    )
    for handler_name, callback in callbacks:
        setattr(parser, handler_name, callback)

    self._parser = parser
    self._target = etree.TreeBuilder()

    # used for our horrible re-encoding hack: raw chunks are buffered in
    # _fed_data and the declared encoding is recorded in _read_encoding
    self._fed_data = []
    self._read_encoding = None

def _xml_decl(self, version, encoding, standalone):
self._read_encoding = encoding

def _start(self, tag, attrib_in):
self._fed_data = None
tag = _fixname(tag)
attrib = OrderedDict()
if attrib_in:
Expand Down Expand Up @@ -84,12 +92,22 @@ def _skipped(self, name, is_parameter_entity):
err.offset = self._parser.ErrorColumnNumber
raise err


def feed(self, data):
if self._fed_data is not None:
self._fed_data.append(data)
try:
self._parser.Parse(data, False)
except expat.error as v:
_wrap_error(v)
except ValueError as e:
if e.args[0] == 'multi-byte encodings are not supported':
assert self._read_encoding is not None
xml = b"".join(self._fed_data).decode(self._read_encoding).encode("utf-8")
new_parser = XMLParser("utf-8")
self._parser = new_parser._parser
self._target = new_parser._target
self._fed_data = None
self.feed(xml)

def close(self):
try:
Expand Down
8 changes: 8 additions & 0 deletions manifest/tests/test_XMLParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,11 @@ def test_comment():
d = p.close()
assert d.tag == u"foo"
assert len(d) == 0


def test_unsupported_encoding():
    """A Shift-JIS encoded document is parsed and its text preserved."""
    parser = XMLParser()
    document = u"<?xml version='1.0' encoding='Shift-JIS'?><foo>\u3044</foo>"
    payload = document.encode("shift-jis")
    parser.feed(payload)
    root = parser.close()
    assert root.tag == u"foo"
    assert root.text == u"\u3044"

0 comments on commit bb70a03

Please sign in to comment.