lisa: strip characters which lxml will refuse to serialize

This consolidates behavior for all lxml based formats to strip characters which would be rejected by lxml. The previous behavior was: - TMX stripped some of these since #4219 (issue #4183) - XLIFF double escaped these as XML entities what is non-standard and not handled by any other XLIFF tools (introduced by #3551 and #3555) - all other formats just crashed with ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters This can be a breaking change in case somebody relied on the present handling of control characters in XLIFF in translate-toolkit. Fixes #3561
translate · Mar 11, 2024 · 6b015ba · 6b015ba
1 parent 78c6ac9
commit 6b015ba
Show file tree

Hide file tree

Showing 9 changed files with 97 additions and 103 deletions.
diff --git a/tests/translate/storage/test_xliff.py b/tests/translate/storage/test_xliff.py
@@ -71,20 +71,8 @@ def test_unaccepted_control_chars(self):
 
         Source: :wp:`Valid_characters_in_XML#XML_1.0`
         """
-        for code in xliff.ASCII_CONTROL_CODES:
-            self.unit.target = "Een&#x%s;" % code.lstrip("0") or "0"
-            assert self.unit.target == "Een%s" % chr(int(code, 16))
-            self.unit.target = "Een%s" % chr(int(code, 16))
-            assert self.unit.target == "Een%s" % chr(int(code, 16))
-
-    def test_unaccepted_control_chars_escapes_roundtrip(self):
-        """Test control characters go ok on escaping roundtrip."""
-        for code in xliff.ASCII_CONTROL_CODES:
-            special = "Een%s" % chr(int(code, 16))
-            self.unit.source = special
-            print("unit.source:", repr(self.unit.source))
-            print("special:", repr(special))
-            assert self.unit.source == special
+        self.unit.target = "Een\x00"
+        assert self.unit.target == "Een"
 
 
 class TestXLIFFfile(test_base.TestTranslationStore):

diff --git a/translate/misc/xml_helpers.py b/translate/misc/xml_helpers.py
@@ -226,18 +226,68 @@ def expand_closing_tags(elem):
         elements.extend(elem)
 
 
-def validate_char(char: str) -> bool:
-    """
-    identify valid chars for XML, based on xmlIsChar_ch from
-    https://github.com/GNOME/libxml2/blob/master/include/libxml/chvalid.h.
-    """
-    ord_ch = ord(char)
-    return (0x9 <= ord_ch <= 0xA) or (ord_ch == 0xD) or (ord_ch >= 0x20)
+"""
+Characters which will get rejected by lxml, based on
+https://github.com/lxml/lxml/blob/3ccc7d583e325ceb0ebdf8fc295bbb7fc8cd404d/src/lxml/apihelpers.pxi#L1474-L1503
+and
+https://github.com/GNOME/libxml2/blob/723b4de04015c5acccd3cda5dd60db7d00702064/include/libxml/chvalid.h#L108-L110
+"""
+XML_INVALID_CHARS_TRANS = str.maketrans(
+    dict.fromkeys(
+        (
+            "\x00",  # Unicode Character 'NULL' (U+0000)
+            "\x01",  # Unicode Character 'START OF HEADING' (U+0001)
+            "\x02",  # Unicode Character 'START OF TEXT' (U+0002)
+            "\x03",  # Unicode Character 'END OF TEXT' (U+0003)
+            "\x04",  # Unicode Character 'END OF TRANSMISSION' (U+0004)
+            "\x05",  # Unicode Character 'ENQUIRY' (U+0005)
+            "\x06",  # Unicode Character 'ACKNOWLEDGE' (U+0006)
+            "\x07",  # Unicode Character 'BELL' (U+0007), "\a" in Python
+            "\x08",  # Unicode Character 'BACKSPACE' (U+0008), "\b" in Python
+            "\x0b",  # Unicode Character 'LINE TABULATION' (U+000B), "\v" in Python
+            "\x0c",  # Unicode Character 'FORM FEED (FF)' (U+000C), "\f" in Python
+            "\x0e",  # Unicode Character 'SHIFT OUT' (U+000E)
+            "\x0f",  # Unicode Character 'SHIFT IN' (U+000F)
+            "\x10",  # Unicode Character 'DATA LINK ESCAPE' (U+0010)
+            "\x11",  # Unicode Character 'DEVICE CONTROL ONE' (U+0011)
+            "\x12",  # Unicode Character 'DEVICE CONTROL TWO' (U+0012)
+            "\x13",  # Unicode Character 'DEVICE CONTROL THREE' (U+0013)
+            "\x14",  # Unicode Character 'DEVICE CONTROL FOUR' (U+0014)
+            "\x15",  # Unicode Character 'NEGATIVE ACKNOWLEDGE' (U+0015)
+            "\x16",  # Unicode Character 'SYNCHRONOUS IDLE' (U+0016)
+            "\x17",  # Unicode Character 'END OF TRANSMISSION BLOCK' (U+0017)
+            "\x18",  # Unicode Character 'CANCEL' (U+0018)
+            "\x19",  # Unicode Character 'END OF MEDIUM' (U+0019)
+            "\x1a",  # Unicode Character 'SUBSTITUTE' (U+001A)
+            "\x1b",  # Unicode Character 'ESCAPE' (U+001B)
+            "\x1c",  # Unicode Character 'INFORMATION SEPARATOR FOUR' (U+001C)
+            "\x1d",  # Unicode Character 'INFORMATION SEPARATOR THREE' (U+001D)
+            "\x1e",  # Unicode Character 'INFORMATION SEPARATOR TWO' (U+001E)
+            "\x1f",  # Unicode Character 'INFORMATION SEPARATOR ONE' (U+001F)
+            "\ufffe",  # Invalid character
+            "\uffff",  # Invalid character
+            *(chr(x) for x in range(0xD800, 0xDFFF + 1)),
+        )
+    )
+)
 
 
 def valid_chars_only(text: str) -> str:
     """Prevent to crash libxml with unexpected chars."""
-    return "".join(char for char in text if validate_char(char))
+    return text.translate(XML_INVALID_CHARS_TRANS)
+
+
+def safely_set_text(node, text: str) -> None:
+    """
+    Safe updating of ElementTree text of a node.
+
+    In case of ValueError it strips any characters refused by lxml.
+    """
+    try:
+        node.text = text
+    except ValueError:
+        # Prevents "All strings must be XML compatible" when string contains a control characters
+        node.text = valid_chars_only(text)
 
 
 def clear_content(node):

diff --git a/translate/storage/flatxml.py b/translate/storage/flatxml.py
@@ -20,7 +20,7 @@
 
 from lxml import etree
 
-from translate.misc.xml_helpers import getText, namespaced, reindent
+from translate.misc.xml_helpers import getText, namespaced, reindent, safely_set_text
 from translate.storage import base
 
 
@@ -69,7 +69,7 @@ def target(self, target):
         """Updates the translated string of this unit."""
         if self.target == target:
             return
-        self.xmlelement.text = target
+        safely_set_text(self.xmlelement, target)
 
     def namespaced(self, name):
         """Returns name in Clark notation."""

diff --git a/translate/storage/qph.py b/translate/storage/qph.py
@@ -34,6 +34,7 @@
 from lxml import etree
 
 from translate.lang import data
+from translate.misc.xml_helpers import safely_set_text
 from translate.storage import lisa
 
 
@@ -49,7 +50,7 @@ def createlanguageNode(self, lang, text, purpose):
         """Returns an xml Element setup with given parameters."""
         assert purpose
         langset = etree.Element(self.namespaced(purpose))
-        langset.text = text
+        safely_set_text(langset, text)
         return langset
 
     def _getsourcenode(self):
@@ -69,7 +70,7 @@ def addnote(self, text, origin=None, position="append"):
         current_notes = self.getnotes(origin)
         self.removenotes(origin)
         note = etree.SubElement(self.xmlelement, self.namespaced("definition"))
-        note.text = "\n".join(filter(None, [current_notes, text.strip()]))
+        safely_set_text(note, "\n".join(filter(None, [current_notes, text.strip()])))
 
     def getnotes(self, origin=None):
         # TODO: consider only responding when origin has certain values

diff --git a/translate/storage/resx.py b/translate/storage/resx.py
@@ -21,7 +21,10 @@
 
 from lxml import etree
 
-from translate.misc.xml_helpers import setXMLspace
+from translate.misc.xml_helpers import (
+    safely_set_text,
+    setXMLspace,
+)
 from translate.storage import lisa
 from translate.storage.placeables import general
 
@@ -39,7 +42,7 @@ def createlanguageNode(self, lang, text, purpose):
         """Returns an xml Element setup with given parameters."""
         langset = etree.Element(self.namespaced(self.languageNode))
 
-        langset.text = text
+        safely_set_text(langset, text)
         return langset
 
     def _gettargetnode(self):
@@ -66,7 +69,7 @@ def target(self, target):
             return
         targetnode = self._gettargetnode()
         targetnode.clear()
-        targetnode.text = target or ""
+        safely_set_text(targetnode, target or "")
 
     def addnote(self, text, origin=None, position="append"):
         """Add a note specifically in the appropriate "comment" tag."""
@@ -77,11 +80,13 @@ def addnote(self, text, origin=None, position="append"):
         if position == "append":
             if current_notes.strip() in text_stripped:
                 # Don't add duplicate comments
-                note.text = text_stripped
+                safely_set_text(note, text_stripped)
             else:
-                note.text = "\n".join(filter(None, [current_notes, text_stripped]))
+                safely_set_text(
+                    note, "\n".join(filter(None, [current_notes, text_stripped]))
+                )
         else:
-            note.text = text_stripped
+            safely_set_text(note, text_stripped)
         if note.text:
             # Correct the indent of <comment> by updating the tail of
             # the preceding <value> element

diff --git a/translate/storage/tbx.py b/translate/storage/tbx.py
@@ -20,7 +20,11 @@
 
 from lxml import etree
 
-from translate.misc.xml_helpers import getXMLspace, setXMLlang
+from translate.misc.xml_helpers import (
+    getXMLspace,
+    safely_set_text,
+    setXMLlang,
+)
 from translate.storage import lisa
 
 
@@ -42,7 +46,7 @@ def createlanguageNode(self, lang, text, purpose):
         term = etree.SubElement(tig, self.textNode)
         # probably not what we want:
         # lisa.setXMLspace(term, "preserve")
-        term.text = text
+        safely_set_text(term, text)
         return langset
 
     def getid(self):
@@ -75,7 +79,7 @@ def addnote(self, text, origin=None, position="append"):
         if not text:
             return
         note = etree.SubElement(self.xmlelement, self._get_origin_element(origin))
-        note.text = text
+        safely_set_text(note, text)
         if origin and origin not in ("pos", "definition"):
             note.set("from", origin)
 

diff --git a/translate/storage/tmx.py b/translate/storage/tmx.py
@@ -21,7 +21,7 @@
 from lxml import etree
 
 from translate import __version__
-from translate.misc.xml_helpers import setXMLlang, valid_chars_only
+from translate.misc.xml_helpers import safely_set_text, setXMLlang
 from translate.storage import lisa
 
 
@@ -39,11 +39,7 @@ def createlanguageNode(self, lang, text, purpose):
         seg = etree.SubElement(langset, self.textNode)
         # implied by the standard:
         # setXMLspace(seg, "preserve")
-        try:
-            seg.text = text
-        except ValueError:
-            # Prevents "All strings must be XML compatible" when string contains a control characters
-            seg.text = valid_chars_only(text)
+        safely_set_text(seg, text)
 
         return langset
 
@@ -66,7 +62,7 @@ def addnote(self, text, origin=None, position="append"):
         The origin parameter is ignored
         """
         note = etree.SubElement(self.xmlelement, self.namespaced("note"))
-        note.text = text.strip()
+        safely_set_text(note, text.strip())
 
     def _getnotelist(self, origin=None):
         """

diff --git a/translate/storage/ts2.py b/translate/storage/ts2.py
@@ -36,6 +36,7 @@
 
 from translate.lang import data
 from translate.misc.multistring import multistring
+from translate.misc.xml_helpers import safely_set_text
 from translate.storage import lisa
 from translate.storage.placeables import general
 from translate.storage.workflow import StateEnum as state
@@ -121,7 +122,7 @@ def createlanguageNode(self, lang, text, purpose):
         # TODO: check language
         # lisa.setXMLlang(langset, lang)
 
-        langset.text = text
+        safely_set_text(langset, text)
         return langset
 
     def _getsourcenode(self):
@@ -185,9 +186,9 @@ def target(self, target):
             self.xmlelement.set("numerus", "yes")
             for string in strings:
                 numerus = etree.SubElement(targetnode, self.namespaced("numerusform"))
-                numerus.text = string or ""
+                safely_set_text(numerus, string or "")
         else:
-            targetnode.text = target or ""
+            safely_set_text(targetnode, target or "")
 
     def hasplural(self):
         return self.xmlelement.get("numerus") == "yes"
@@ -203,11 +204,11 @@ def addnote(self, text, origin=None, position="append"):
                 self.xmlelement, self.namespaced("translatorcomment")
             )
         if position == "append":
-            note.text = "\n".join(
-                item for item in [current_notes, text.strip()] if item
+            safely_set_text(
+                note, "\n".join(item for item in [current_notes, text.strip()] if item)
             )
         else:
-            note.text = text.strip()
+            safely_set_text(note, text.strip())
 
     def getnotes(self, origin=None):
         # TODO: consider only responding when origin has certain values
@@ -494,10 +495,10 @@ def _createcontext(self, contextname, comment=None):
             self.document.getroot(), self.namespaced(self.bodyNode)
         )
         name = etree.SubElement(context, self.namespaced("name"))
-        name.text = contextname
+        safely_set_text(name, contextname)
         if comment:
             comment_node = etree.SubElement(context, "comment")
-            comment_node.text = comment
+            safely_set_text(comment_node, comment)
         return context
 
     def _getcontextname(self, contextnode):

diff --git a/translate/storage/xliff.py b/translate/storage/xliff.py
@@ -30,6 +30,7 @@
 from translate.misc.xml_helpers import (
     clear_content,
     getXMLspace,
+    safely_set_text,
     setXMLlang,
     setXMLspace,
 )
@@ -48,46 +49,6 @@
 ID_SEPARATOR_SAFE = "__%04__"
 
 
-# List of ASCII control codes which are unaccepted in XML.
-ASCII_CONTROL_CODES = [
-    "0000",  # Unicode Character 'NULL' (U+0000)
-    "0001",  # Unicode Character 'START OF HEADING' (U+0001)
-    "0002",  # Unicode Character 'START OF TEXT' (U+0002)
-    "0003",  # Unicode Character 'END OF TEXT' (U+0003)
-    "0004",  # Unicode Character 'END OF TRANSMISSION' (U+0004)
-    "0005",  # Unicode Character 'ENQUIRY' (U+0005)
-    "0006",  # Unicode Character 'ACKNOWLEDGE' (U+0006)
-    "0007",  # Unicode Character 'BELL' (U+0007), "\a" in Python
-    "0008",  # Unicode Character 'BACKSPACE' (U+0008), "\b" in Python
-    "000b",  # Unicode Character 'LINE TABULATION' (U+000B), "\v" in Python
-    "000c",  # Unicode Character 'FORM FEED (FF)' (U+000C), "\f" in Python
-    "000e",  # Unicode Character 'SHIFT OUT' (U+000E)
-    "000f",  # Unicode Character 'SHIFT IN' (U+000F)
-    "0010",  # Unicode Character 'DATA LINK ESCAPE' (U+0010)
-    "0011",  # Unicode Character 'DEVICE CONTROL ONE' (U+0011)
-    "0012",  # Unicode Character 'DEVICE CONTROL TWO' (U+0012)
-    "0013",  # Unicode Character 'DEVICE CONTROL THREE' (U+0013)
-    "0014",  # Unicode Character 'DEVICE CONTROL FOUR' (U+0014)
-    "0015",  # Unicode Character 'NEGATIVE ACKNOWLEDGE' (U+0015)
-    "0016",  # Unicode Character 'SYNCHRONOUS IDLE' (U+0016)
-    "0017",  # Unicode Character 'END OF TRANSMISSION BLOCK' (U+0017)
-    "0018",  # Unicode Character 'CANCEL' (U+0018)
-    "0019",  # Unicode Character 'END OF MEDIUM' (U+0019)
-    "001a",  # Unicode Character 'SUBSTITUTE' (U+001A)
-    "001b",  # Unicode Character 'ESCAPE' (U+001B)
-    "001c",  # Unicode Character 'INFORMATION SEPARATOR FOUR' (U+001C)
-    "001d",  # Unicode Character 'INFORMATION SEPARATOR THREE' (U+001D)
-    "001e",  # Unicode Character 'INFORMATION SEPARATOR TWO' (U+001E)
-    "001f",  # Unicode Character 'INFORMATION SEPARATOR ONE' (U+001F)
-]
-
-ASCII_CONTROL_CHARACTERS = {code: chr(int(code, 16)) for code in ASCII_CONTROL_CODES}
-
-ASCII_CONTROL_CHARACTERS_ESCAPES = {
-    code: "&#x%s;" % code.lstrip("0") or "0" for code in ASCII_CONTROL_CODES
-}
-
-
 class xliffunit(lisa.LISAunit):
     """A single term in the xliff file."""
 
@@ -136,15 +97,6 @@ def __init__(self, source, empty=False, **kwargs):
             return
         setXMLspace(self.xmlelement, "preserve")
 
-    def getNodeText(self, languageNode, xml_space="preserve"):
-        """Retrieves the term from the given :attr:`languageNode`."""
-        text = super().getNodeText(languageNode, xml_space=xml_space)
-        if text is not None:
-            # Unescape the unaccepted ASCII control characters.
-            for code, character in ASCII_CONTROL_CHARACTERS.items():
-                text = text.replace(ASCII_CONTROL_CHARACTERS_ESCAPES[code], character)
-        return text
-
     def createlanguageNode(self, lang, text, purpose):
         """Returns an xml Element setup with given parameters."""
         # TODO: for now we do source, but we have to test if it is target,
@@ -156,11 +108,8 @@ def createlanguageNode(self, lang, text, purpose):
         # TODO: check language
         # setXMLlang(langset, lang)
 
-        # Escape the unaccepted ASCII control characters.
-        for code in ASCII_CONTROL_CODES:
-            text = text.replace(chr(int(code, 16)), "&#x%s;" % code.lstrip("0") or "0")
+        safely_set_text(langset, text)
 
-        langset.text = text
         return langset
 
     def getlanguageNodes(self):