Skip to content

Commit

Permalink
Redo decoding
Browse files Browse the repository at this point in the history
Recode QUERY_STRING earlier rather than encoding
after it has passed through FieldStorage. We were
erroneously encoding form data from the body :(

Also, note that on python3 cgi.FieldStorage already
decodes using utf-8, so either just return that or
encode to bytes so we can later decode.

This code is particularly insane, but almost
impossible to fix properly without vendoring
FieldStorage which started decoding its inputs
in python3 :(
  • Loading branch information
Brian Sutherland committed Nov 5, 2016
1 parent 39d67bc commit 85c7000
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 9 deletions.
26 changes: 20 additions & 6 deletions src/zope/publisher/browser.py
Expand Up @@ -22,6 +22,7 @@

import re
from cgi import FieldStorage
import locale
import tempfile

import zope.component
Expand Down Expand Up @@ -247,16 +248,17 @@ def _createResponse(self):

def _decode(self, text):
"""Try to decode the text using one of the available charsets."""
# According to PEP-3333, in python-3, QUERY_STRING is a string,
# representing 'latin-1' encoded byte array. So, if we are in python-3
# context, encode text as 'latin-1' first, to try to decode
# resulting byte array using user-supplied charset.
if not isinstance(text, bytes):
text = text.encode('latin-1')
if self.charsets is None:
envadapter = IUserPreferredCharsets(self)
self.charsets = envadapter.getPreferredCharsets() or ['utf-8']
self.charsets = [c for c in self.charsets if c != '*']
if not PYTHON2 and self.charsets and self.charsets[0] == 'utf-8':
# optimization: we are trying to decode something cgi.FieldStorage already
# decoded for us, let's just return it rather than waste time decoding
return text
if not PYTHON2:
# undo what cgi.FieldStorage did and maintain backwards compat
text = text.encode('utf-8')
for charset in self.charsets:
try:
text = _u(text, charset)
Expand Down Expand Up @@ -298,6 +300,18 @@ def processInputs(self):
env = env.copy()
del env['QUERY_STRING']

if not PYTHON2:
# According to PEP-3333, in python-3, QUERY_STRING is a string,
# representing 'latin-1' encoded byte array. So, if we are in python-3
# context, encode text as 'latin-1' first, to try to decode
# resulting byte array using user-supplied charset.
#
# We also need to re-encode it in locale.getpreferredencoding() so that cgi.py
# FieldStorage can later decode it.
qs = env.get('QUERY_STRING')
if qs is not None:
qs = qs.encode('latin-1')
env['QUERY_STRING'] = qs.decode(locale.getpreferredencoding(), 'surrogateescape')

args = {'encoding': 'utf-8'} if not PYTHON2 else {}
fs = ZopeFieldStorage(fp=fp, environ=env,
Expand Down
41 changes: 38 additions & 3 deletions src/zope/publisher/tests/test_browserrequest.py
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Foundation and Contributors.
Expand Down Expand Up @@ -220,6 +221,35 @@ def testFileUploadPost(self):
# Test that we can actually read the file data
self.assertEqual(request.form['upload'].read(), b'Some data')

def test_multipart_form_decoding_in_UTF8(self):
    # A multipart/form-data POST body sent as UTF-8 bytes must come out of
    # processInputs() as unicode text equal to the value that was posted.
    street = u'NOT latin-1: 汉语/漢語'
    # The delimiter is "--" followed by the boundary ("-123"), and each
    # part's headers must be separated from its content by an empty line
    # (RFC 2046); without it the value line is read as part of the headers.
    body = u"""---123
Content-Disposition: form-data; name="street"

%s
---123--
""" % street

    extra = {
        'CONTENT_TYPE': 'multipart/form-data; boundary=-123',
        "REQUEST_METHOD": "POST"
    }
    request = self._createRequest(extra, body=body.encode('utf-8'))
    request.processInputs()
    self.assertTrue(isinstance(request.form[_u("street")], unicode))
    self.assertEqual(street, request.form['street'])

def test_urlencoded_form_in_utf8(self):
    # An application/x-www-form-urlencoded POST body carrying UTF-8 bytes
    # must yield a form whose only entry is the original text value.
    street = u'NOT latin-1: 汉语/漢語'
    payload = (u'street=' + street).encode('utf-8')
    environ = {
        'REQUEST_METHOD': 'POST',
        'CONTENT_TYPE': 'application/x-www-form-urlencoded',
    }
    request = self._createRequest(environ, payload)
    request.processInputs()
    expected = {'street': street}
    self.assertEqual(dict(request.form), expected)

def testDefault2(self):
extra = {'PATH_INFO': '/folder/item2/view'}
request = self._createRequest(extra)
Expand Down Expand Up @@ -267,18 +297,23 @@ def testForm(self):
{_u("a"):_u("5"), _u("b"):6})

def testFormNoEncodingUsesUTF8(self):
encoded = 'K\xc3\xb6hlerstra\xc3\x9fe'
street = u'Non Latin-1: 汉语/漢語'
qs = u'a=5&b:int=6&street=' + street
qs = qs.encode('utf-8')
if not PYTHON2:
# as per PEP-3333
qs = qs.decode('latin-1')
extra = {
# if nothing else is specified, form data should be
# interpreted as UTF-8, as this stub query string is
'QUERY_STRING': 'a=5&b:int=6&street=' + encoded
'QUERY_STRING': qs
}
request = self._createRequest(extra)
# many mainstream browsers do not send HTTP_ACCEPT_CHARSET
del request._environ['HTTP_ACCEPT_CHARSET']
publish(request)
self.assertTrue(isinstance(request.form[_u("street")], unicode))
self.assertEqual(_u("K\xf6hlerstra\xdfe"), request.form['street'])
self.assertEqual(street, request.form['street'])

def testFormAcceptsStarButNotUTF8(self):
extra = {
Expand Down

0 comments on commit 85c7000

Please sign in to comment.