Skip to content

Commit

Permalink
lisa: strip characters which lxml will refuse to serialize
Browse files Browse the repository at this point in the history
This consolidates behavior for all lxml based formats to strip
characters which would be rejected by lxml.

The previous behavior was:

- TMX stripped some of these since #4219 (issue #4183)
- XLIFF double escaped these as XML entities what is non-standard and
  not handled by any other XLIFF tools (introduced by #3551 and #3555)
- all other formats just crashed with ValueError: All strings must be
  XML compatible: Unicode or ASCII, no NULL bytes or control characters

This can be a breaking change in case somebody relied on the present
handling of control characters in XLIFF in translate-toolkit.

Fixes #3561
  • Loading branch information
nijel committed Mar 11, 2024
1 parent 78c6ac9 commit 6b015ba
Show file tree
Hide file tree
Showing 9 changed files with 97 additions and 103 deletions.
16 changes: 2 additions & 14 deletions tests/translate/storage/test_xliff.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,20 +71,8 @@ def test_unaccepted_control_chars(self):
Source: :wp:`Valid_characters_in_XML#XML_1.0`
"""
for code in xliff.ASCII_CONTROL_CODES:
self.unit.target = "Een&#x%s;" % code.lstrip("0") or "0"
assert self.unit.target == "Een%s" % chr(int(code, 16))
self.unit.target = "Een%s" % chr(int(code, 16))
assert self.unit.target == "Een%s" % chr(int(code, 16))

def test_unaccepted_control_chars_escapes_roundtrip(self):
"""Test control characters go ok on escaping roundtrip."""
for code in xliff.ASCII_CONTROL_CODES:
special = "Een%s" % chr(int(code, 16))
self.unit.source = special
print("unit.source:", repr(self.unit.source))
print("special:", repr(special))
assert self.unit.source == special
self.unit.target = "Een\x00"
assert self.unit.target == "Een"


class TestXLIFFfile(test_base.TestTranslationStore):
Expand Down
66 changes: 58 additions & 8 deletions translate/misc/xml_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,18 +226,68 @@ def expand_closing_tags(elem):
elements.extend(elem)


def validate_char(char: str) -> bool:
"""
identify valid chars for XML, based on xmlIsChar_ch from
https://github.com/GNOME/libxml2/blob/master/include/libxml/chvalid.h.
"""
ord_ch = ord(char)
return (0x9 <= ord_ch <= 0xA) or (ord_ch == 0xD) or (ord_ch >= 0x20)
"""
Characters which will get rejected by lxml, based on
https://github.com/lxml/lxml/blob/3ccc7d583e325ceb0ebdf8fc295bbb7fc8cd404d/src/lxml/apihelpers.pxi#L1474-L1503
and
https://github.com/GNOME/libxml2/blob/723b4de04015c5acccd3cda5dd60db7d00702064/include/libxml/chvalid.h#L108-L110
"""
XML_INVALID_CHARS_TRANS = str.maketrans(
dict.fromkeys(
(
"\x00", # Unicode Character 'NULL' (U+0000)
"\x01", # Unicode Character 'START OF HEADING' (U+0001)
"\x02", # Unicode Character 'START OF TEXT' (U+0002)
"\x03", # Unicode Character 'END OF TEXT' (U+0003)
"\x04", # Unicode Character 'END OF TRANSMISSION' (U+0004)
"\x05", # Unicode Character 'ENQUIRY' (U+0005)
"\x06", # Unicode Character 'ACKNOWLEDGE' (U+0006)
"\x07", # Unicode Character 'BELL' (U+0007), "\a" in Python
"\x08", # Unicode Character 'BACKSPACE' (U+0008), "\b" in Python
"\x0b", # Unicode Character 'LINE TABULATION' (U+000B), "\v" in Python
"\x0c", # Unicode Character 'FORM FEED (FF)' (U+000C), "\f" in Python
"\x0e", # Unicode Character 'SHIFT OUT' (U+000E)
"\x0f", # Unicode Character 'SHIFT IN' (U+000F)
"\x10", # Unicode Character 'DATA LINK ESCAPE' (U+0010)
"\x11", # Unicode Character 'DEVICE CONTROL ONE' (U+0011)
"\x12", # Unicode Character 'DEVICE CONTROL TWO' (U+0012)
"\x13", # Unicode Character 'DEVICE CONTROL THREE' (U+0013)
"\x14", # Unicode Character 'DEVICE CONTROL FOUR' (U+0014)
"\x15", # Unicode Character 'NEGATIVE ACKNOWLEDGE' (U+0015)
"\x16", # Unicode Character 'SYNCHRONOUS IDLE' (U+0016)
"\x17", # Unicode Character 'END OF TRANSMISSION BLOCK' (U+0017)
"\x18", # Unicode Character 'CANCEL' (U+0018)
"\x19", # Unicode Character 'END OF MEDIUM' (U+0019)
"\x1a", # Unicode Character 'SUBSTITUTE' (U+001A)
"\x1b", # Unicode Character 'ESCAPE' (U+001B)
"\x1c", # Unicode Character 'INFORMATION SEPARATOR FOUR' (U+001C)
"\x1d", # Unicode Character 'INFORMATION SEPARATOR THREE' (U+001D)
"\x1e", # Unicode Character 'INFORMATION SEPARATOR TWO' (U+001E)
"\x1f", # Unicode Character 'INFORMATION SEPARATOR ONE' (U+001F)
"\ufffe", # Invalid character
"\uffff", # Invalid character
*(chr(x) for x in range(0xD800, 0xDFFF + 1)),
)
)
)


def valid_chars_only(text: str) -> str:
"""Prevent to crash libxml with unexpected chars."""
return "".join(char for char in text if validate_char(char))
return text.translate(XML_INVALID_CHARS_TRANS)


def safely_set_text(node, text: str) -> None:
"""
Safe updating of ElementTree text of a node.
In case of ValueError it strips any characters refused by lxml.
"""
try:
node.text = text
except ValueError:
# Prevents "All strings must be XML compatible" when string contains a control characters
node.text = valid_chars_only(text)


def clear_content(node):
Expand Down
4 changes: 2 additions & 2 deletions translate/storage/flatxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

from lxml import etree

from translate.misc.xml_helpers import getText, namespaced, reindent
from translate.misc.xml_helpers import getText, namespaced, reindent, safely_set_text
from translate.storage import base


Expand Down Expand Up @@ -69,7 +69,7 @@ def target(self, target):
"""Updates the translated string of this unit."""
if self.target == target:
return
self.xmlelement.text = target
safely_set_text(self.xmlelement, target)

def namespaced(self, name):
"""Returns name in Clark notation."""
Expand Down
5 changes: 3 additions & 2 deletions translate/storage/qph.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from lxml import etree

from translate.lang import data
from translate.misc.xml_helpers import safely_set_text
from translate.storage import lisa


Expand All @@ -49,7 +50,7 @@ def createlanguageNode(self, lang, text, purpose):
"""Returns an xml Element setup with given parameters."""
assert purpose
langset = etree.Element(self.namespaced(purpose))
langset.text = text
safely_set_text(langset, text)
return langset

def _getsourcenode(self):
Expand All @@ -69,7 +70,7 @@ def addnote(self, text, origin=None, position="append"):
current_notes = self.getnotes(origin)
self.removenotes(origin)
note = etree.SubElement(self.xmlelement, self.namespaced("definition"))
note.text = "\n".join(filter(None, [current_notes, text.strip()]))
safely_set_text(note, "\n".join(filter(None, [current_notes, text.strip()])))

def getnotes(self, origin=None):
# TODO: consider only responding when origin has certain values
Expand Down
17 changes: 11 additions & 6 deletions translate/storage/resx.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@

from lxml import etree

from translate.misc.xml_helpers import setXMLspace
from translate.misc.xml_helpers import (
safely_set_text,
setXMLspace,
)
from translate.storage import lisa
from translate.storage.placeables import general

Expand All @@ -39,7 +42,7 @@ def createlanguageNode(self, lang, text, purpose):
"""Returns an xml Element setup with given parameters."""
langset = etree.Element(self.namespaced(self.languageNode))

langset.text = text
safely_set_text(langset, text)
return langset

def _gettargetnode(self):
Expand All @@ -66,7 +69,7 @@ def target(self, target):
return
targetnode = self._gettargetnode()
targetnode.clear()
targetnode.text = target or ""
safely_set_text(targetnode, target or "")

def addnote(self, text, origin=None, position="append"):
"""Add a note specifically in the appropriate "comment" tag."""
Expand All @@ -77,11 +80,13 @@ def addnote(self, text, origin=None, position="append"):
if position == "append":
if current_notes.strip() in text_stripped:
# Don't add duplicate comments
note.text = text_stripped
safely_set_text(note, text_stripped)
else:
note.text = "\n".join(filter(None, [current_notes, text_stripped]))
safely_set_text(
note, "\n".join(filter(None, [current_notes, text_stripped]))
)
else:
note.text = text_stripped
safely_set_text(note, text_stripped)
if note.text:
# Correct the indent of <comment> by updating the tail of
# the preceding <value> element
Expand Down
10 changes: 7 additions & 3 deletions translate/storage/tbx.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,11 @@

from lxml import etree

from translate.misc.xml_helpers import getXMLspace, setXMLlang
from translate.misc.xml_helpers import (
getXMLspace,
safely_set_text,
setXMLlang,
)
from translate.storage import lisa


Expand All @@ -42,7 +46,7 @@ def createlanguageNode(self, lang, text, purpose):
term = etree.SubElement(tig, self.textNode)
# probably not what we want:
# lisa.setXMLspace(term, "preserve")
term.text = text
safely_set_text(term, text)
return langset

def getid(self):
Expand Down Expand Up @@ -75,7 +79,7 @@ def addnote(self, text, origin=None, position="append"):
if not text:
return
note = etree.SubElement(self.xmlelement, self._get_origin_element(origin))
note.text = text
safely_set_text(note, text)
if origin and origin not in ("pos", "definition"):
note.set("from", origin)

Expand Down
10 changes: 3 additions & 7 deletions translate/storage/tmx.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from lxml import etree

from translate import __version__
from translate.misc.xml_helpers import setXMLlang, valid_chars_only
from translate.misc.xml_helpers import safely_set_text, setXMLlang
from translate.storage import lisa


Expand All @@ -39,11 +39,7 @@ def createlanguageNode(self, lang, text, purpose):
seg = etree.SubElement(langset, self.textNode)
# implied by the standard:
# setXMLspace(seg, "preserve")
try:
seg.text = text
except ValueError:
# Prevents "All strings must be XML compatible" when string contains a control characters
seg.text = valid_chars_only(text)
safely_set_text(seg, text)

return langset

Expand All @@ -66,7 +62,7 @@ def addnote(self, text, origin=None, position="append"):
The origin parameter is ignored
"""
note = etree.SubElement(self.xmlelement, self.namespaced("note"))
note.text = text.strip()
safely_set_text(note, text.strip())

def _getnotelist(self, origin=None):
"""
Expand Down
17 changes: 9 additions & 8 deletions translate/storage/ts2.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

from translate.lang import data
from translate.misc.multistring import multistring
from translate.misc.xml_helpers import safely_set_text
from translate.storage import lisa
from translate.storage.placeables import general
from translate.storage.workflow import StateEnum as state
Expand Down Expand Up @@ -121,7 +122,7 @@ def createlanguageNode(self, lang, text, purpose):
# TODO: check language
# lisa.setXMLlang(langset, lang)

langset.text = text
safely_set_text(langset, text)
return langset

def _getsourcenode(self):
Expand Down Expand Up @@ -185,9 +186,9 @@ def target(self, target):
self.xmlelement.set("numerus", "yes")
for string in strings:
numerus = etree.SubElement(targetnode, self.namespaced("numerusform"))
numerus.text = string or ""
safely_set_text(numerus, string or "")
else:
targetnode.text = target or ""
safely_set_text(targetnode, target or "")

def hasplural(self):
return self.xmlelement.get("numerus") == "yes"
Expand All @@ -203,11 +204,11 @@ def addnote(self, text, origin=None, position="append"):
self.xmlelement, self.namespaced("translatorcomment")
)
if position == "append":
note.text = "\n".join(
item for item in [current_notes, text.strip()] if item
safely_set_text(
note, "\n".join(item for item in [current_notes, text.strip()] if item)
)
else:
note.text = text.strip()
safely_set_text(note, text.strip())

def getnotes(self, origin=None):
# TODO: consider only responding when origin has certain values
Expand Down Expand Up @@ -494,10 +495,10 @@ def _createcontext(self, contextname, comment=None):
self.document.getroot(), self.namespaced(self.bodyNode)
)
name = etree.SubElement(context, self.namespaced("name"))
name.text = contextname
safely_set_text(name, contextname)
if comment:
comment_node = etree.SubElement(context, "comment")
comment_node.text = comment
safely_set_text(comment_node, comment)
return context

def _getcontextname(self, contextnode):
Expand Down
55 changes: 2 additions & 53 deletions translate/storage/xliff.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from translate.misc.xml_helpers import (
clear_content,
getXMLspace,
safely_set_text,
setXMLlang,
setXMLspace,
)
Expand All @@ -48,46 +49,6 @@
ID_SEPARATOR_SAFE = "__%04__"


# List of ASCII control codes which are unaccepted in XML.
ASCII_CONTROL_CODES = [
"0000", # Unicode Character 'NULL' (U+0000)
"0001", # Unicode Character 'START OF HEADING' (U+0001)
"0002", # Unicode Character 'START OF TEXT' (U+0002)
"0003", # Unicode Character 'END OF TEXT' (U+0003)
"0004", # Unicode Character 'END OF TRANSMISSION' (U+0004)
"0005", # Unicode Character 'ENQUIRY' (U+0005)
"0006", # Unicode Character 'ACKNOWLEDGE' (U+0006)
"0007", # Unicode Character 'BELL' (U+0007), "\a" in Python
"0008", # Unicode Character 'BACKSPACE' (U+0008), "\b" in Python
"000b", # Unicode Character 'LINE TABULATION' (U+000B), "\v" in Python
"000c", # Unicode Character 'FORM FEED (FF)' (U+000C), "\f" in Python
"000e", # Unicode Character 'SHIFT OUT' (U+000E)
"000f", # Unicode Character 'SHIFT IN' (U+000F)
"0010", # Unicode Character 'DATA LINK ESCAPE' (U+0010)
"0011", # Unicode Character 'DEVICE CONTROL ONE' (U+0011)
"0012", # Unicode Character 'DEVICE CONTROL TWO' (U+0012)
"0013", # Unicode Character 'DEVICE CONTROL THREE' (U+0013)
"0014", # Unicode Character 'DEVICE CONTROL FOUR' (U+0014)
"0015", # Unicode Character 'NEGATIVE ACKNOWLEDGE' (U+0015)
"0016", # Unicode Character 'SYNCHRONOUS IDLE' (U+0016)
"0017", # Unicode Character 'END OF TRANSMISSION BLOCK' (U+0017)
"0018", # Unicode Character 'CANCEL' (U+0018)
"0019", # Unicode Character 'END OF MEDIUM' (U+0019)
"001a", # Unicode Character 'SUBSTITUTE' (U+001A)
"001b", # Unicode Character 'ESCAPE' (U+001B)
"001c", # Unicode Character 'INFORMATION SEPARATOR FOUR' (U+001C)
"001d", # Unicode Character 'INFORMATION SEPARATOR THREE' (U+001D)
"001e", # Unicode Character 'INFORMATION SEPARATOR TWO' (U+001E)
"001f", # Unicode Character 'INFORMATION SEPARATOR ONE' (U+001F)
]

ASCII_CONTROL_CHARACTERS = {code: chr(int(code, 16)) for code in ASCII_CONTROL_CODES}

ASCII_CONTROL_CHARACTERS_ESCAPES = {
code: "&#x%s;" % code.lstrip("0") or "0" for code in ASCII_CONTROL_CODES
}


class xliffunit(lisa.LISAunit):
"""A single term in the xliff file."""

Expand Down Expand Up @@ -136,15 +97,6 @@ def __init__(self, source, empty=False, **kwargs):
return
setXMLspace(self.xmlelement, "preserve")

def getNodeText(self, languageNode, xml_space="preserve"):
"""Retrieves the term from the given :attr:`languageNode`."""
text = super().getNodeText(languageNode, xml_space=xml_space)
if text is not None:
# Unescape the unaccepted ASCII control characters.
for code, character in ASCII_CONTROL_CHARACTERS.items():
text = text.replace(ASCII_CONTROL_CHARACTERS_ESCAPES[code], character)
return text

def createlanguageNode(self, lang, text, purpose):
"""Returns an xml Element setup with given parameters."""
# TODO: for now we do source, but we have to test if it is target,
Expand All @@ -156,11 +108,8 @@ def createlanguageNode(self, lang, text, purpose):
# TODO: check language
# setXMLlang(langset, lang)

# Escape the unaccepted ASCII control characters.
for code in ASCII_CONTROL_CODES:
text = text.replace(chr(int(code, 16)), "&#x%s;" % code.lstrip("0") or "0")
safely_set_text(langset, text)

langset.text = text
return langset

def getlanguageNodes(self):
Expand Down

0 comments on commit 6b015ba

Please sign in to comment.