Redo decoding

Recode QUERY_STRING earlier rather than encoding after it has passed through FieldStorage. We were erroneously encoding form data from the body :( Also, note that on python3 cgi.FieldStorage already decodes using utf-8, so either just return that or encode to bytes so we can later decode. This code is particularly insane, but almost impossible to fix properly without vendoring FieldStorage, which started decoding its inputs in python3 :(
zopefoundation · Nov 5, 2016 · 85c7000 · 85c7000
1 parent 39d67bc
commit 85c7000
Show file tree

Hide file tree

Showing 2 changed files with 58 additions and 9 deletions.
diff --git a/src/zope/publisher/browser.py b/src/zope/publisher/browser.py
@@ -22,6 +22,7 @@
 
 import re
 from cgi import FieldStorage
+import locale
 import tempfile
 
 import zope.component
@@ -247,16 +248,17 @@ def _createResponse(self):
 
     def _decode(self, text):
         """Try to decode the text using one of the available charsets."""
-        # According to PEP-3333, in python-3, QUERY_STRING is a string,
-        # representing 'latin-1' encoded byte array. So, if we are in python-3
-        # context, encode text as 'latin-1' first, to try to decode
-        # resulting byte array using user-supplied charset.
-        if not isinstance(text, bytes):
-            text = text.encode('latin-1')
         if self.charsets is None:
             envadapter = IUserPreferredCharsets(self)
             self.charsets = envadapter.getPreferredCharsets() or ['utf-8']
             self.charsets = [c for c in self.charsets if c != '*']
+        if not PYTHON2 and self.charsets and self.charsets[0] == 'utf-8':
+            # optimization: we are trying to decode something cgi.FieldStorage already
+            # decoded for us, let's just return it rather than waste time decoding
+            return text
+        if not PYTHON2:
+            # undo what cgi.FieldStorage did and maintain backwards compat
+            text = text.encode('utf-8')
         for charset in self.charsets:
             try:
                 text = _u(text, charset)
@@ -298,6 +300,18 @@ def processInputs(self):
             env = env.copy()
             del env['QUERY_STRING']
 
+        if not PYTHON2:
+            # According to PEP-3333, in python-3, QUERY_STRING is a string,
+            # representing 'latin-1' encoded byte array. So, if we are in python-3
+            # context, encode text as 'latin-1' first, to try to decode
+            # resulting byte array using user-supplied charset.
+            #
+            # We also need to re-encode it in locale.getpreferredencoding() so that cgi.py
+            # FieldStorage can later decode it.
+            qs = env.get('QUERY_STRING')
+            if qs is not None:
+                qs = qs.encode('latin-1')
+                env['QUERY_STRING'] = qs.decode(locale.getpreferredencoding(), 'surrogateescape')
 
         args = {'encoding': 'utf-8'} if not PYTHON2 else {}
         fs = ZopeFieldStorage(fp=fp, environ=env,

diff --git a/src/zope/publisher/tests/test_browserrequest.py b/src/zope/publisher/tests/test_browserrequest.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 ##############################################################################
 #
 # Copyright (c) 2001, 2002 Zope Foundation and Contributors.
@@ -220,6 +221,35 @@ def testFileUploadPost(self):
         # Test that we can actually read the file data
         self.assertEqual(request.form['upload'].read(), b'Some data')
 
+    def test_multipart_form_decoding_in_UTF8(self):
+        street = u'NOT latin-1: 汉语/漢語'
+        body = u"""---123
+Content-Disposition: form-data; name="street"
+
+%s
+---123--
+""" % street
+
+        extra = {
+            'CONTENT_TYPE':     'multipart/form-data; boundary=-123',
+            "REQUEST_METHOD": "POST"
+            }
+        request = self._createRequest(extra, body=body.encode('utf-8'))
+        request.processInputs()
+        self.assertTrue(isinstance(request.form[_u("street")], unicode))
+        self.assertEqual(street, request.form['street'])
+
+    def test_urlencoded_form_in_utf8(self):
+        street = u'NOT latin-1: 汉语/漢語'
+        body = u'street=' + street
+        request = self._createRequest(
+            dict(REQUEST_METHOD='POST',
+                 CONTENT_TYPE='application/x-www-form-urlencoded',
+                 ),
+            body.encode('utf-8'))
+        request.processInputs()
+        self.assertEqual(dict(request.form), dict(street=street))
+
     def testDefault2(self):
         extra = {'PATH_INFO': '/folder/item2/view'}
         request = self._createRequest(extra)
@@ -267,18 +297,23 @@ def testForm(self):
                          {_u("a"):_u("5"), _u("b"):6})
 
     def testFormNoEncodingUsesUTF8(self):
-        encoded = 'K\xc3\xb6hlerstra\xc3\x9fe'
+        street = u'Non Latin-1: 汉语/漢語'
+        qs = u'a=5&b:int=6&street=' + street
+        qs = qs.encode('utf-8')
+        if not PYTHON2:
+            # as per PEP-3333
+            qs = qs.decode('latin-1')
         extra = {
             # if nothing else is specified, form data should be
             # interpreted as UTF-8, as this stub query string is
-            'QUERY_STRING': 'a=5&b:int=6&street=' + encoded
+            'QUERY_STRING': qs
             }
         request = self._createRequest(extra)
         # many mainstream browsers do not send HTTP_ACCEPT_CHARSET
         del request._environ['HTTP_ACCEPT_CHARSET']
         publish(request)
         self.assertTrue(isinstance(request.form[_u("street")], unicode))
-        self.assertEqual(_u("K\xf6hlerstra\xdfe"), request.form['street'])
+        self.assertEqual(street, request.form['street'])
 
     def testFormAcceptsStarButNotUTF8(self):
         extra = {