replace cgi.FieldStorage by multipart (#1094)

* replace `cgi.FieldStorage` by `multipart` * add CHANGES.rst log entry [skip ci] * partially support non seekable input file -- this will not work when the request body is accessed twice; in those cases, the second access will give an empty result * interpret a missing `CONTENT_TYPE` as `application/x-www-form-urlencoded` (required by some tests) * better explain "binary converters" [skip ci] * Update src/ZPublisher/Converters.py Co-authored-by: Michael Howitz <mh@gocept.com>
zopefoundation · Jan 19, 2023 · 5b324f6 · 5b324f6
1 parent 3b3c67a
commit 5b324f6
Show file tree

Hide file tree

Showing 7 changed files with 240 additions and 42 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -10,6 +10,15 @@ https://github.com/zopefoundation/Zope/blob/4.x/CHANGES.rst
 5.8.1 (unreleased)
 ------------------
 
+- Replace ``cgi.FieldStorage`` by ``multipart`` avoiding
+  the ``cgi`` module deprecated by Python 3.11.
+
+  Mark binary converters with a true ``binary`` attribute.
+
+  Fix encoding handling and ``:bytes`` converter.
+
+  See `#1094 <https://github.com/zopefoundation/Zope/pull/1094>`_.
+
 - Clean out and refactor dependency configuration files.
 
 - Update to newest compatible versions of dependencies.

diff --git a/setup.py b/setup.py
@@ -122,6 +122,7 @@ def _read_file(filename):
         'zope.testing',
         'zope.traversing',
         'zope.viewlet',
+        'multipart',
     ],
     include_package_data=True,
     zip_safe=False,

diff --git a/src/ZPublisher/Converters.py b/src/ZPublisher/Converters.py
@@ -11,6 +11,17 @@
 #
 ##############################################################################
 
+"""Converters
+
+Used by `ZPublisher.HTTPRequest` and `OFS.PropertyManager`.
+
+Binary converters (i.e. converters which use `bytes` for/in their result)
+are marked with a true `binary` attribute`.
+This allows the publisher to perform the conversion to `bytes`
+based on its more precise encoding knowledge.
+"""
+
+
 import html
 import json
 import re
@@ -42,6 +53,9 @@ def field2bytes(v):
         return bytes(v)
 
 
+field2bytes.binary = True
+
+
 def field2text(value, nl=re.compile('\r\n|\n\r').search):
     value = field2string(value)
     match_object = nl(value)

diff --git a/src/ZPublisher/HTTPRequest.py b/src/ZPublisher/HTTPRequest.py
@@ -20,13 +20,18 @@
 import random
 import re
 import time
-from cgi import FieldStorage
 from copy import deepcopy
+from types import SimpleNamespace
+from urllib.parse import parse_qsl
 from urllib.parse import unquote
 from urllib.parse import urlparse
 
 from AccessControl.tainted import should_be_tainted
 from AccessControl.tainted import taint_string
+from multipart import Headers
+from multipart import MultipartParser
+from multipart import parse_options_header
+from zExceptions import BadRequest
 from zope.component import queryUtility
 from zope.i18n.interfaces import IUserPreferredLanguages
 from zope.i18n.locales import LoadLocaleError
@@ -47,6 +52,13 @@
 from .cookie import getCookieValuePolicy
 
 
+# DOS attack protection -- limiting the amount of memory for forms
+# probably should become configurable
+FORM_MEMORY_LIMIT = 2 ** 20  # memory limit for forms
+FORM_DISK_LIMIT = 2 ** 30    # disk limit for forms
+FORM_MEMFILE_LIMIT = 4000    # limit for `BytesIO` -> temporary file switch
+
+
 # This may get overwritten during configuration
 default_encoding = 'utf-8'
 
@@ -494,17 +506,14 @@ def processInputs(
             environ['QUERY_STRING'] = ''
 
         meth = None
-        fs_kw = {}
-        fs_kw['encoding'] = self.charset
 
-        fs = ZopeFieldStorage(
-            fp=fp, environ=environ, keep_blank_values=1, **fs_kw)
+        fs = ZopeFieldStorage(fp, environ)
 
         # Keep a reference to the FieldStorage. Otherwise it's
         # __del__ method is called too early and closing FieldStorage.file.
         self._hold(fs)
 
-        if not hasattr(fs, 'list') or fs.list is None:
+        if not hasattr(fs, 'list'):
             if 'HTTP_SOAPACTION' in environ:
                 # Stash XML request for interpretation by a SOAP-aware view
                 other['SOAPXML'] = fs.value
@@ -525,24 +534,55 @@ def processInputs(
             tainteddefaults = {}
             converter = None
 
-            for item in fslist:
-
+            for item in fslist:  # form data
+                # Note:
+                # we want to support 2 use cases
+                # 1. the form data has been created by the browser
+                # 2. the form data is free standing
+                # A browser internally works with character data,
+                # which it encodes for transmission to the server --
+                # usually with `self.charset`. Therefore, we
+                # usually expect the form data to represent data
+                # in this charset.
+                # We make this same assumption also for free standing
+                # form data, i.e. we expect the form creator to know
+                # the server's charset. However, sometimes data cannot
+                # be represented in this charset (e.g. arbitrary binary
+                # data). To cover this case, we decode data
+                # with the `surrogateescape` error handler (see PEP 383).
+                # It allows to retrieve the original byte sequence.
+                # With an encoding modifier, the form creator
+                # can specify the correct encoding used by a form field value.
+                # Note: we always expect the form field name
+                # to be representable with `self.charset`. As those
+                # names are expected to be `ASCII`, this should be no
+                # big restriction.
+                # Note: the use of `surrogateescape` can lead to delayed
+                # problems when surrogates reach the application because
+                # they cannot be encoded with a standard error handler.
+                # We might want to prevent this.
+                character_encoding = ''  # currently used encoding
                 isFileUpload = 0
                 key = item.name
                 if key is None:
                     continue
+                key = item.name.encode("latin-1").decode(self.charset)
 
                 if hasattr(item, 'file') and \
                    hasattr(item, 'filename') and \
                    hasattr(item, 'headers'):
-                    if item.file and item.filename is not None:
-                        item = FileUpload(item)
-                        isFileUpload = 1
-                    else:
-                        item = item.value
+                    item = FileUpload(item, self.charset)
+                    isFileUpload = 1
+                else:
+                    character_encoding = self.charset
+                    item = item.value.decode(
+                        character_encoding, "surrogateescape")
+                # from here on, `item` contains the field value
+                # either as `FileUpload` or `str` with
+                # `character_encoding` as encoding.
+                # `key` the field name (`str`)
 
                 flags = 0
-                character_encoding = ''
                 # Variables for potentially unsafe values.
                 tainted = None
                 converter_type = None
@@ -599,7 +639,15 @@ def processInputs(
                             if not item:
                                 flags = flags | EMPTY
                         elif has_codec(type_name):
+                            # recode:
+                            assert not isFileUpload, "cannot recode files"
+                            item = item.encode(
+                                character_encoding, "surrogateescape")
                             character_encoding = type_name
+                            # we do not use `surrogateescape` as
+                            # we immediately want to determine
+                            # an incompatible encoding modifier
+                            item = item.decode(character_encoding)
 
                         delim = key.rfind(':')
                         if delim < 0:
@@ -644,20 +692,11 @@ def processInputs(
                     # defer conversion
                     if flags & CONVERTED:
                         try:
-                            if character_encoding:
-                                # We have a string with a specified character
-                                # encoding.  This gets passed to the converter
-                                # either as unicode, if it can handle it, or
-                                # crunched back down to utf-8 if it can not.
-                                if isinstance(item, bytes):
-                                    item = str(item, character_encoding)
-                                if hasattr(converter, 'convert_unicode'):
-                                    item = converter.convert_unicode(item)
-                                else:
-                                    item = converter(
-                                        item.encode(default_encoding))
-                            else:
-                                item = converter(item)
+                            if character_encoding and \
+                               getattr(converter, "binary", False):
+                                item = item.encode(character_encoding,
+                                                   "surrogateescape")
+                            item = converter(item)
 
                             # Flag potentially unsafe values
                             if converter_type in ('string', 'required', 'text',
@@ -1632,15 +1671,117 @@ def sane_environment(env):
     return dict
 
 
-class ZopeFieldStorage(FieldStorage):
-    """This subclass exists to work around a Python bug
-    (see https://bugs.python.org/issue27777) to make sure
-    we can read binary data from a request body.
+class ValueDescriptor:
+    """(non data) descriptor to compute `value` from `file`."""
+    def __get__(self, inst, owner=None):
+        if inst is None:
+            return self
+        file = inst.file
+        try:
+            fpos = file.tell()
+        except Exception:
+            fpos = None
+        try:
+            return file.read()
+        finally:
+            if fpos is not None:
+                file.seek(fpos)
+
+
+class ValueAccessor:
+    value = ValueDescriptor()
+
+
+class FormField(SimpleNamespace, ValueAccessor):
+    """represent a single form field.
+
+    Typical attributes:
+    name
+      the field name
+    value
+      the field value (`bytes`)
+
+    File fields additionally have the attributes:
+    file
+      a binary file containing the file content
+    filename
+      the file's name as reported by the client
+    headers
+      a case insensitive dict with header information;
+      usually `content-type` and `content-disposition`.
+
+    Unless otherwise noted, `latin-1` decoded bytes
+    are used to represent textual data.
     """
 
-    def read_binary(self):
-        self._binary_file = True
-        return FieldStorage.read_binary(self)
+
+class ZopeFieldStorage(ValueAccessor):
+    def __init__(self, fp, environ):
+        self.file = fp
+        method = environ.get("REQUEST_METHOD", "GET").upper()
+        qs = environ.get("QUERY_STRING", "")
+        hl = []
+        content_type = environ.get("CONTENT_TYPE",
+                                   "application/x-www-form-urlencoded")
+        content_type = content_type
+        hl.append(("content-type", content_type))
+        content_disposition = environ.get("CONTENT_DISPOSITION")
+        if content_disposition is not None:
+            hl.append(("content-disposition", content_disposition))
+        self.headers = Headers(hl)
+        parts = ()
+        if method == "POST":
+            try:
+                fpos = fp.tell()
+            except Exception:
+                fpos = None
+            if content_type.startswith("multipart/form-data"):
+                ct, options = parse_options_header(content_type)
+                parts = MultipartParser(
+                    fp, options["boundary"],
+                    mem_limit=FORM_MEMORY_LIMIT,
+                    disk_limit=FORM_DISK_LIMIT,
+                    memfile_limit=FORM_MEMFILE_LIMIT,
+                    charset="latin-1").parts()
+            elif content_type == "application/x-www-form-urlencoded":
+                if qs:
+                    qs += "&"
+                qs += fp.read(FORM_MEMORY_LIMIT).decode("latin-1")
+                if fp.read(1):
+                    raise BadRequest("form data processing "
+                                     "requires too much memory")
+            else:
+                # `processInputs` currently expects either
+                # form values or a response body, not both.
+                # reset `qs` to fulfill this expectation.
+                qs = ""
+            if fpos is not None:
+                fp.seek(fpos)
+        elif method not in ("GET", "HEAD"):
+            # `processInputs` currently expects either
+            # form values or a response body, not both.
+            # reset `qs` to fulfill this expectation.
+            qs = ""
+        fl = []
+        add_field = fl.append
+        for name, val in parse_qsl(
+           qs,  # noqa: E121
+           keep_blank_values=True, encoding="latin-1"):
+            add_field(FormField(
+                name=name, value=val.encode("latin-1")))
+        for part in parts:
+            if part.filename:
+                # a file
+                field = FormField(
+                    name=part.name,
+                    file=part.file,
+                    filename=part.filename,
+                    headers=part.headers)
+            else:
+                field = FormField(name=part.name, value=part.raw)
+            add_field(field)
+        if fl:
+            self.list = fl
 
 
 # Original version: zope.publisher.browser.FileUpload
@@ -1660,11 +1801,12 @@ class FileUpload:
     # that protected code can use DTML to work with FileUploads.
     __allow_access_to_unprotected_subobjects__ = 1
 
-    def __init__(self, aFieldStorage):
+    def __init__(self, aFieldStorage, charset):
         self.file = aFieldStorage.file
         self.headers = aFieldStorage.headers
-        self.filename = aFieldStorage.filename
-        self.name = aFieldStorage.name
+        self.filename = aFieldStorage.filename\
+            .encode("latin-1").decode(charset)
+        self.name = aFieldStorage.name.encode("latin-1").decode(charset)
 
         # Add an assertion to the rfc822.Message object that implements
         # self.headers so that managed code can access them.