zopefoundation · jinty · Nov 5, 2016 · mgedmin · Jul 10, 2019 · mgedmin
diff --git a/src/zope/publisher/browser.py b/src/zope/publisher/browser.py
@@ -22,6 +22,7 @@
 
 import re
 from cgi import FieldStorage
+import locale
 import tempfile
 
 import zope.component
@@ -247,16 +248,17 @@ def _createResponse(self):
 
     def _decode(self, text):
         """Try to decode the text using one of the available charsets."""
-        # According to PEP-3333, in python-3, QUERY_STRING is a string,
-        # representing 'latin-1' encoded byte array. So, if we are in python-3
-        # context, encode text as 'latin-1' first, to try to decode
-        # resulting byte array using user-supplied charset.
-        if not isinstance(text, bytes):
-            text = text.encode('latin-1')
         if self.charsets is None:
             envadapter = IUserPreferredCharsets(self)
             self.charsets = envadapter.getPreferredCharsets() or ['utf-8']
             self.charsets = [c for c in self.charsets if c != '*']
+        if not PYTHON2 and self.charsets and self.charsets[0] == 'utf-8':
+            # Optimization: cgi.FieldStorage has already decoded this text as
+            # UTF-8 for us, so return it as-is rather than waste time re-decoding.
+            return text
+        if not PYTHON2:
+            # undo the UTF-8 decoding cgi.FieldStorage did, for backwards compat
+            text = text.encode('utf-8')
         for charset in self.charsets:
             try:
                 text = _u(text, charset)
@@ -298,6 +300,18 @@ def processInputs(self):
             env = env.copy()
             del env['QUERY_STRING']
 
+        if not PYTHON2:
+            # According to PEP 3333, on Python 3 QUERY_STRING is a str that
+            # represents a 'latin-1' encoded byte array. So, on Python 3,
+            # encode the text as 'latin-1' first to recover the raw bytes,
+            # which can then be decoded using the user-supplied charset.
+            #
+            # We also need to decode those bytes with locale.getpreferredencoding()
+            # (using 'surrogateescape') so that cgi.py FieldStorage can later process them.
+            qs = env.get('QUERY_STRING')
+            if qs is not None:
+                qs = qs.encode('latin-1')
+                env['QUERY_STRING'] = qs.decode(locale.getpreferredencoding(), 'surrogateescape')
 
         args = {'encoding': 'utf-8'} if not PYTHON2 else {}
         fs = ZopeFieldStorage(fp=fp, environ=env,

diff --git a/src/zope/publisher/tests/test_browserrequest.py b/src/zope/publisher/tests/test_browserrequest.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 ##############################################################################
 #
 # Copyright (c) 2001, 2002 Zope Foundation and Contributors.
@@ -236,6 +237,35 @@ def testLargePostValue(self):
         request  = self._createRequest(extra, body=LARGE_POSTED_VALUE)
         request.processInputs()
 
+    def test_multipart_form_decoding_in_UTF8(self):
+        street = u'NOT latin-1: 汉语/漢語'
+        body = u"""---123
+Content-Disposition: form-data; name="street"
+
+%s
+---123--
+""" % street
+
+        extra = {
+            'CONTENT_TYPE':     'multipart/form-data; boundary=-123',
+            "REQUEST_METHOD": "POST"
+            }
+        request = self._createRequest(extra, body=body.encode('utf-8'))
+        request.processInputs()
+        self.assertTrue(isinstance(request.form[_u("street")], unicode))
+        self.assertEqual(street, request.form['street'])
+
+    def test_urlencoded_form_in_utf8(self):
+        street = u'NOT latin-1: 汉语/漢語'
+        body = u'street=' + street
+        request = self._createRequest(
+            dict(REQUEST_METHOD='POST',
+                 CONTENT_TYPE='application/x-www-form-urlencoded',
+                 ),
+            body.encode('utf-8'))
+        request.processInputs()
+        self.assertEqual(dict(request.form), dict(street=street))
+
     def testDefault2(self):
         extra = {'PATH_INFO': '/folder/item2/view'}
         request = self._createRequest(extra)
@@ -283,18 +313,23 @@ def testForm(self):
                          {_u("a"):_u("5"), _u("b"):6})
 
     def testFormNoEncodingUsesUTF8(self):
-        encoded = 'K\xc3\xb6hlerstra\xc3\x9fe'
+        street = u'Non Latin-1: 汉语/漢語'
+        qs = u'a=5&b:int=6&street=' + street
+        qs = qs.encode('utf-8')
+        if not PYTHON2:
+            # PEP 3333: on Python 3, environ strings are bytes decoded as latin-1
+            qs = qs.decode('latin-1')
         extra = {
             # if nothing else is specified, form data should be
             # interpreted as UTF-8, as this stub query string is
-            'QUERY_STRING': 'a=5&b:int=6&street=' + encoded
+            'QUERY_STRING': qs
             }
         request = self._createRequest(extra)
         # many mainstream browsers do not send HTTP_ACCEPT_CHARSET
         del request._environ['HTTP_ACCEPT_CHARSET']
         publish(request)
         self.assertTrue(isinstance(request.form[_u("street")], unicode))
-        self.assertEqual(_u("K\xf6hlerstra\xdfe"), request.form['street'])
+        self.assertEqual(street, request.form['street'])
 
     def testFormAcceptsStarButNotUTF8(self):
         extra = {