Standardize HTTPResponse.read(X) behavior regardless of compression
Co-authored-by: Franek Magiera <framagie@gmail.com>
pquentin and franekmagiera committed Nov 14, 2022
1 parent 9763f09 commit c35033f
Showing 7 changed files with 292 additions and 70 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -66,11 +66,11 @@ jobs:
           os: ubuntu-20.04 # CPython 3.9.2 is not available for ubuntu-22.04.
           experimental: false
           nox-session: test-3.9
-        - python-version: 3.11-dev
+        - python-version: 3.11
           os: ubuntu-20.04
           experimental: true
           nox-session: test-3.11
-        - python-version: 3.11-dev
+        - python-version: 3.11
           os: ubuntu-22.04
           experimental: true
           nox-session: test-3.11
7 changes: 7 additions & 0 deletions changelog/2128.removal.rst
@@ -0,0 +1,7 @@
+Standardized :meth:`~urllib3.response.HTTPResponse.read` to respect the semantics of BufferedIOBase regardless of compression. Specifically, this method:
+
+* only returns an empty bytes object to indicate EOF (that is, the response has been fully consumed),
+* never returns more bytes than requested,
+* can issue any number of system calls: zero, one or multiple.
+
+If you want each :meth:`~urllib3.response.HTTPResponse.read` call to issue a single system call, you need to disable decompression by setting ``decode_content=False``.
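
For illustration only, a minimal sketch of what these semantics look like from the caller's side; the URL is hypothetical and the server is assumed to compress its response:

    import urllib3

    http = urllib3.PoolManager()
    resp = http.request("GET", "https://example.com/large.json", preload_content=False)

    chunks = []
    while True:
        chunk = resp.read(1024)  # never more than 1024 decoded bytes, any number of syscalls
        if not chunk:            # only an empty bytes object signals EOF
            break
        chunks.append(chunk)
    body = b"".join(chunks)
    resp.release_conn()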
5 changes: 3 additions & 2 deletions dev-requirements.txt
@@ -1,11 +1,12 @@
 coverage==6.4
 tornado==6.2
 PySocks==1.7.1
-pytest==7.0.0
+pytest==7.2.0
 pytest-timeout==2.1.0
 pytest-freezegun==0.4.2
 flaky==3.7.0
 trustme==0.9.0
 cryptography==37.0.2
 backports.zoneinfo==0.2.1;python_version<"3.9"
-towncrier==21.9.0
+towncrier==21.9.0
+pytest-memray==1.3.0;python_version>="3.8" and sys_platform!="win32" and implementation_name=="cpython"
8 changes: 8 additions & 0 deletions noxfile.py
@@ -1,6 +1,7 @@
 import os
 import shutil
 import subprocess
+import sys
 
 import nox
 
@@ -31,6 +32,12 @@ def tests_impl(
     # Print OpenSSL information.
     session.run("python", "-m", "OpenSSL.debug")
 
+    memray_supported = True
+    if sys.implementation.name != "cpython" or sys.version_info < (3, 8):
+        memray_supported = False  # pytest-memray requires CPython 3.8+
+    elif sys.platform == "win32":
+        memray_supported = False
+
     # Inspired from https://hynek.me/articles/ditch-codecov-python/
     # We use parallel mode and then combine in a later CI step
     session.run(
@@ -42,6 +49,7 @@ def tests_impl(
         "--parallel-mode",
         "-m",
         "pytest",
+        *("--memray", "--hide-memray-summary") if memray_supported else (),
         "-r",
         "a",
         f"--color={'yes' if 'GITHUB_ACTIONS' in os.environ else 'auto'}",
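
As an aside, the starred argument added above is conditional tuple unpacking: the splat over a ternary parses in call-argument position (as in session.run here) and contributes the extra flags only when the condition holds. A toy sketch of the idiom, with names invented for illustration:

    def run(*args: str) -> None:
        print(args)

    memray_supported = True  # toy value
    run(
        "python",
        "-m",
        "pytest",
        *("--memray", "--hide-memray-summary") if memray_supported else (),
        "-r",
        "a",
    )
    # ('python', '-m', 'pytest', '--memray', '--hide-memray-summary', '-r', 'a')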
172 changes: 134 additions & 38 deletions src/urllib3/response.py
@@ -1,3 +1,4 @@
+import collections
 import io
 import json as _json
 import logging
@@ -11,6 +12,7 @@
 from typing import (
     TYPE_CHECKING,
     Any,
+    Deque,
     Generator,
     Iterator,
     List,
@@ -223,6 +225,63 @@ def _get_decoder(mode: str) -> ContentDecoder:
     return DeflateDecoder()
 
 
+class BytesQueueBuffer:
+    """Memory-efficient bytes buffer
+
+    To return decoded data in read() and still follow the BufferedIOBase API, we need a
+    buffer to always return the correct amount of bytes.
+
+    This buffer should be filled using calls to put()
+
+    Our maximum memory usage is determined by the sum of the size of:
+
+     * self.buffer, which contains the full data
+     * the largest chunk that we will copy in get()
+
+    The worst case scenario is a single chunk, in which case we'll make a full copy of
+    the data inside get().
+    """
+
+    def __init__(self) -> None:
+        self.buffer: Deque[bytes] = collections.deque()
+        self._size: int = 0
+
+    def __len__(self) -> int:
+        return self._size
+
+    def put(self, data: bytes) -> None:
+        self.buffer.append(data)
+        self._size += len(data)
+
+    def get(self, n: int) -> bytes:
+        if not self.buffer:
+            raise RuntimeError("buffer is empty")
+        elif n < 0:
+            raise ValueError("n should be > 0")
+
+        fetched = 0
+        ret = io.BytesIO()
+        while fetched < n:
+            remaining = n - fetched
+            chunk = self.buffer.popleft()
+            chunk_length = len(chunk)
+            if remaining < chunk_length:
+                left_chunk, right_chunk = chunk[:remaining], chunk[remaining:]
+                ret.write(left_chunk)
+                self.buffer.appendleft(right_chunk)
+                self._size -= remaining
+                break
+            else:
+                ret.write(chunk)
+                self._size -= chunk_length
+                fetched += chunk_length
+
+            if not self.buffer:
+                break
+
+        return ret.getvalue()
+
+
 class BaseHTTPResponse(io.IOBase):
     CONTENT_DECODERS = ["gzip", "deflate"]
     if brotli is not None:
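
To make the buffer's contract concrete, a small usage sketch, assuming the BytesQueueBuffer class added above is in scope:

    buf = BytesQueueBuffer()
    buf.put(b"hel")
    buf.put(b"lo world")

    assert len(buf) == 11
    assert buf.get(5) == b"hello"  # spans both chunks; only the second is split
    assert len(buf) == 6           # b" world" stays queued for the next get()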
@@ -512,6 +571,9 @@ def __init__(
         # Determine length of response
         self.length_remaining = self._init_length(request_method)
 
+        # Used to return the correct amount of bytes for partial read()s
+        self._decoded_buffer = BytesQueueBuffer()
+
         # If requested, preload the body.
         if preload_content and not self._body:
             self._body = self.read(decode_content=decode_content)
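
For context, a sketch of the two consumption modes this buffer has to serve (the URL is hypothetical): preloading decodes everything up front, while streaming answers partial read()s from _decoded_buffer:

    import urllib3

    # Preloaded: the body is fully read and decoded at construction time.
    resp = urllib3.PoolManager().request("GET", "https://example.com/")
    print(len(resp.data))

    # Streamed: each read(amt) is served from the decoded buffer.
    resp = urllib3.PoolManager().request(
        "GET", "https://example.com/", preload_content=False
    )
    first_kb = resp.read(1024)
    resp.release_conn()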
@@ -720,6 +782,48 @@ def _fp_read(self, amt: Optional[int] = None) -> bytes:
         # StringIO doesn't like amt=None
         return self._fp.read(amt) if amt is not None else self._fp.read()
 
+    def _raw_read(
+        self,
+        amt: Optional[int] = None,
+    ) -> bytes:
+        """
+        Reads `amt` of bytes from the socket.
+        """
+        if self._fp is None:
+            return None  # type: ignore[return-value]
+
+        fp_closed = getattr(self._fp, "closed", False)
+
+        with self._error_catcher():
+            data = self._fp_read(amt) if not fp_closed else b""
+            if amt is not None and amt != 0 and not data:
+                # Platform-specific: Buggy versions of Python.
+                # Close the connection when no data is returned
+                #
+                # This is redundant to what httplib/http.client _should_
+                # already do. However, versions of python released before
+                # December 15, 2012 (http://bugs.python.org/issue16298) do
+                # not properly close the connection in all cases. There is
+                # no harm in redundantly calling close.
+                self._fp.close()
+                if (
+                    self.enforce_content_length
+                    and self.length_remaining is not None
+                    and self.length_remaining != 0
+                ):
+                    # This is an edge case that httplib failed to cover due
+                    # to concerns of backward compatibility. We're
+                    # addressing it here to make sure IncompleteRead is
+                    # raised during streaming, so all calls with incorrect
+                    # Content-Length are caught.
+                    raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
+
+        if data:
+            self._fp_bytes_read += len(data)
+            if self.length_remaining is not None:
+                self.length_remaining -= len(data)
+        return data
+
     def read(
         self,
         amt: Optional[int] = None,
@@ -750,51 +854,43 @@ def read(
         if decode_content is None:
             decode_content = self.decode_content
 
-        if self._fp is None:
-            return None  # type: ignore[return-value]
+        if amt is not None:
+            cache_content = False
 
-        flush_decoder = False
-        fp_closed = getattr(self._fp, "closed", False)
+            if len(self._decoded_buffer) >= amt:
+                return self._decoded_buffer.get(amt)
 
-        with self._error_catcher():
-            data = self._fp_read(amt) if not fp_closed else b""
-            if amt is None:
-                flush_decoder = True
-            else:
-                cache_content = False
-                if (
-                    amt != 0 and not data
-                ):  # Platform-specific: Buggy versions of Python.
-                    # Close the connection when no data is returned
-                    #
-                    # This is redundant to what httplib/http.client _should_
-                    # already do. However, versions of python released before
-                    # December 15, 2012 (http://bugs.python.org/issue16298) do
-                    # not properly close the connection in all cases. There is
-                    # no harm in redundantly calling close.
-                    self._fp.close()
-                    flush_decoder = True
-                    if (
-                        self.enforce_content_length
-                        and self.length_remaining is not None
-                        and self.length_remaining != 0
-                    ):
-                        # This is an edge case that httplib failed to cover due
-                        # to concerns of backward compatibility. We're
-                        # addressing it here to make sure IncompleteRead is
-                        # raised during streaming, so all calls with incorrect
-                        # Content-Length are caught.
-                        raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
+        data = self._raw_read(amt)
 
-        if data:
-            self._fp_bytes_read += len(data)
-            if self.length_remaining is not None:
-                self.length_remaining -= len(data)
+        flush_decoder = False
+        if amt is None:
+            flush_decoder = True
+        elif amt != 0 and not data:
+            flush_decoder = True
 
-            data = self._decode(data, decode_content, flush_decoder)
-            if cache_content:
-                self._body = data
+        if not data and len(self._decoded_buffer) == 0:
+            return data
+
+        if amt is None:
+            data = self._decode(data, decode_content, flush_decoder)
+            if cache_content:
+                self._body = data
+        else:
+            # do not waste memory on buffer when not decoding
+            if not decode_content:
+                return data
+
+            decoded_data = self._decode(data, decode_content, flush_decoder)
+            self._decoded_buffer.put(decoded_data)
+
+            while len(self._decoded_buffer) < amt and data:
+                # TODO make sure to initially read enough data to get past the headers
+                # For example, the GZ file header takes 10 bytes, we don't want to read
+                # it one byte at a time
+                data = self._raw_read(amt)
+                decoded_data = self._decode(data, decode_content, flush_decoder)
+                self._decoded_buffer.put(decoded_data)
+            data = self._decoded_buffer.get(amt)
 
         return data
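
Why the decoded buffer is necessary at all: a tiny raw read can decode to far more than amt bytes, and the surplus has to be retained for later read() calls. A standard-library-only sketch of the effect:

    import gzip
    import zlib

    payload = b"x" * 10_000
    compressed = gzip.compress(payload)  # a run of identical bytes shrinks to a few dozen bytes
    print(len(compressed))

    decoder = zlib.decompressobj(16 + zlib.MAX_WBITS)  # 16 + MAX_WBITS selects gzip framing
    decoded = decoder.decompress(compressed[:64])      # one small "socket read"
    print(len(decoded))  # roughly 10_000 decoded bytes; a read(1024) must buffer the rest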
