
Commit

Merge pull request #44 from zopefoundation/issue15
Handle Unicode token values with non-ascii chars.
jamadden committed Aug 14, 2018
2 parents ec25d71 + 62d9970 commit c61de98
Showing 5 changed files with 53 additions and 11 deletions.
CHANGES.rst: 7 additions & 0 deletions
@@ -20,6 +20,13 @@ Changes
   ``max`` values (they must still specify a ``default`` value). See
   `issue 9 <https://github.com/zopefoundation/zope.schema/issues/9>`_.

+- ``Choice``, ``SimpleVocabulary`` and ``SimpleTerm`` all gracefully
+  handle using Unicode token values with non-ASCII characters by encoding
+  them with the ``backslashreplace`` error handler. See `issue 15
+  <https://github.com/zopefoundation/zope.schema/issues/15>`_ and `PR
+  6 <https://github.com/zopefoundation/zope.schema/pull/6>`_.


4.5.0 (2017-07-10)
------------------

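The changelog entry above can be illustrated with a short snippet (a minimal sketch; it assumes a zope.schema release that contains this change):

    from zope.schema.vocabulary import SimpleTerm, SimpleVocabulary

    # A Unicode value with non-ASCII characters keeps its value as-is, while
    # the generated token is escaped to a pure-ASCII native string.
    term = SimpleTerm(u'K\xf6ln')
    print(term.value)    # Köln
    print(term.token)    # K\xf6ln

    # The same escaping shows up in a vocabulary's token index.
    vocab = SimpleVocabulary.fromValues([u'K\xf6ln', u'D\xfcsseldorf', 'Bonn'])
    print(sorted(vocab.by_token))    # ['Bonn', 'D\\xfcsseldorf', 'K\\xf6ln']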
src/zope/schema/interfaces.py: 4 additions & 3 deletions
@@ -592,13 +592,14 @@ class ITokenizedTerm(ITerm):
     """

     # Should be a ``zope.schema.ASCIILine``, but `ASCIILine` is not a bootstrap
-    # field.
+    # field. `ASCIILine` is a type of NativeString.
     token = Attribute(
         "token",
         """Token which can be used to represent the value on a stream.

-        The value of this attribute must be a non-empty 7-bit string.
-        Control characters are not allowed.
+        The value of this attribute must be a non-empty 7-bit native string
+        (i.e., the ``str`` type on both Python 2 and 3).
+        Control characters, including newline, are not allowed.
         """)


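A term's token can be checked against the contract spelled out in this docstring; a rough sketch (the helper below is illustrative and not part of the package):

    from zope.schema.vocabulary import SimpleTerm

    def looks_like_valid_token(token):
        # Non-empty native str made only of printable 7-bit characters,
        # i.e. no control characters such as newline.
        return (isinstance(token, str)
                and bool(token)
                and all(' ' <= ch <= '~' for ch in token))

    assert looks_like_valid_token(SimpleTerm(u'Snowman \u2603').token)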
src/zope/schema/tests/test__field.py: 12 additions & 0 deletions
@@ -802,6 +802,18 @@ def test_ctor_w_values(self):
         self.assertEqual(sorted(choose.vocabulary.by_value.keys()), [1, 2])
         self.assertEqual(sorted(choose.source.by_value.keys()), [1, 2])

+    def test_ctor_w_unicode_non_ascii_values(self):
+        values = [u'K\xf6ln', u'D\xfcsseldorf', 'Bonn']
+        choose = self._makeOne(values=values)
+        self.assertEqual(sorted(choose.vocabulary.by_value.keys()),
+                         sorted(values))
+        self.assertEqual(sorted(choose.source.by_value.keys()),
+                         sorted(values))
+        self.assertEqual(
+            sorted(choose.vocabulary.by_token.keys()),
+            sorted([x.encode('ascii', 'backslashreplace').decode('ascii')
+                    for x in values]))
+
     def test_ctor_w_named_vocabulary(self):
         choose = self._makeOne(vocabulary="vocab")
         self.assertEqual(choose.vocabularyName, 'vocab')
src/zope/schema/tests/test_vocabulary.py: 7 additions & 0 deletions
@@ -58,6 +58,13 @@ def test_bytes_value(self):
         self.assertEqual(term.token, 'term')
         self.assertFalse(ITitledTokenizedTerm.providedBy(term))

+    def test_unicode_non_ascii_value(self):
+        from zope.schema.interfaces import ITitledTokenizedTerm
+        term = self._makeOne(u'Snowman \u2603')
+        self.assertEqual(term.value, u'Snowman \u2603')
+        self.assertEqual(term.token, 'Snowman \\u2603')
+        self.assertFalse(ITitledTokenizedTerm.providedBy(term))
+

 class SimpleVocabularyTests(unittest.TestCase):

src/zope/schema/vocabulary.py: 23 additions & 8 deletions
@@ -13,15 +13,18 @@
 ##############################################################################
 """Vocabulary support for schema.
 """
+from collections import OrderedDict
+
 from zope.interface import directlyProvides
 from zope.interface import implementer

+from zope.schema._compat import text_type
 from zope.schema.interfaces import ITitledTokenizedTerm
 from zope.schema.interfaces import ITokenizedTerm
 from zope.schema.interfaces import ITreeVocabulary
 from zope.schema.interfaces import IVocabularyRegistry
 from zope.schema.interfaces import IVocabularyTokenized
-from collections import OrderedDict


 # simple vocabularies performing enumerated-like tasks
 _marker = object()
@@ -32,19 +35,31 @@ class SimpleTerm(object):
     """Simple tokenized term used by SimpleVocabulary."""

     def __init__(self, value, token=None, title=None):
-        """Create a term for value and token. If token is omitted,
-        str(value) is used for the token. If title is provided,
-        term implements ITitledTokenizedTerm.
+        """Create a term for *value* and *token*. If *token* is
+        omitted, str(value) is used for the token, escaping any
+        non-ASCII characters.
+
+        If *title* is provided, term implements `ITitledTokenizedTerm`.
         """
         self.value = value
         if token is None:
             token = value
         # In Python 3 str(bytes) returns str(repr(bytes)), which is not what
         # we want here. On the other hand, we want to try to keep the token as
-        # readable as possible.
-        self.token = str(token) \
-            if not isinstance(token, bytes) \
-            else str(token.decode('ascii', 'ignore'))
+        # readable as possible. On both 2 and 3, self.token should be a native
+        # string (ASCIILine).
+        if not isinstance(token, (str, bytes, text_type)):
+            # Nothing we recognize as intended to be textual data.
+            # Get its str() as promised
+            token = str(token)
+
+        if isinstance(token, text_type):
+            token = token.encode('ascii', 'backslashreplace')
+        # Token should be bytes at this point. Now back to native string,
+        # if needed.
+        if not isinstance(token, str):
+            token = token.decode('ascii')
+        self.token = token
         self.title = title
         if title is not None:
             directlyProvides(self, ITitledTokenizedTerm)
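Taken together, the branches in __init__ above map every kind of token input onto an ASCII native string; a small sketch of the resulting tokens (illustrative, assuming this version of the code):

    from zope.schema.vocabulary import SimpleTerm

    SimpleTerm(1).token                   # '1' -- non-text input falls back to str()
    SimpleTerm(b'term').token             # 'term' -- ASCII bytes decoded to a native str
    SimpleTerm(u'Snowman \u2603').token   # 'Snowman \\u2603' -- escaped via backslashreplace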
