Standardize HTTPResponse.read(X) behavior regardless of compression
Co-authored-by: Franek Magiera <framagie@gmail.com>
pquentin and franekmagiera committed Nov 14, 2022
1 parent 9763f09 commit c35033f
Showing 7 changed files with 292 additions and 70 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -66,11 +66,11 @@ jobs:
           os: ubuntu-20.04 # CPython 3.9.2 is not available for ubuntu-22.04.
           experimental: false
           nox-session: test-3.9
-        - python-version: 3.11-dev
+        - python-version: 3.11
           os: ubuntu-20.04
           experimental: true
           nox-session: test-3.11
-        - python-version: 3.11-dev
+        - python-version: 3.11
           os: ubuntu-22.04
           experimental: true
           nox-session: test-3.11
7 changes: 7 additions & 0 deletions changelog/2128.removal.rst
@@ -0,0 +1,7 @@
+Standardized :meth:`~urllib3.response.HTTPResponse.read` to respect the semantics of BufferedIOBase regardless of compression. Specifically, this method:
+
+* only returns an empty bytes object to indicate EOF (that is, the response has been fully consumed),
+* never returns more bytes than requested,
+* can issue any number of system calls: zero, one or multiple.
+
+If you want each :meth:`~urllib3.response.HTTPResponse.read` call to issue a single system call, you need to disable decompression by setting ``decode_content=False``.
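
For illustration only, a minimal sketch of what these semantics look like from the caller's side; the URL is hypothetical and the server is assumed to compress its response:

    import urllib3

    http = urllib3.PoolManager()
    resp = http.request("GET", "https://example.com/large.json", preload_content=False)

    chunks = []
    while True:
        chunk = resp.read(1024)  # never more than 1024 decoded bytes, any number of syscalls
        if not chunk:            # only an empty bytes object signals EOF
            break
        chunks.append(chunk)
    body = b"".join(chunks)
    resp.release_conn()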
5 changes: 3 additions & 2 deletions dev-requirements.txt
@@ -1,11 +1,12 @@
 coverage==6.4
 tornado==6.2
 PySocks==1.7.1
-pytest==7.0.0
+pytest==7.2.0
 pytest-timeout==2.1.0
 pytest-freezegun==0.4.2
 flaky==3.7.0
 trustme==0.9.0
 cryptography==37.0.2
 backports.zoneinfo==0.2.1;python_version<"3.9"
-towncrier==21.9.0
+towncrier==21.9.0
+pytest-memray==1.3.0;python_version>="3.8" and sys_platform!="win32" and implementation_name=="cpython"
8 changes: 8 additions & 0 deletions noxfile.py
@@ -1,6 +1,7 @@
 import os
 import shutil
 import subprocess
+import sys
 
 import nox
 
@@ -31,6 +32,12 @@ def tests_impl(
     # Print OpenSSL information.
     session.run("python", "-m", "OpenSSL.debug")
 
+    memray_supported = True
+    if sys.implementation.name != "cpython" or sys.version_info < (3, 8):
+        memray_supported = False  # pytest-memray requires CPython 3.8+
+    elif sys.platform == "win32":
+        memray_supported = False
+
     # Inspired from https://hynek.me/articles/ditch-codecov-python/
     # We use parallel mode and then combine in a later CI step
     session.run(
@@ -42,6 +49,7 @@ def tests_impl(
         "--parallel-mode",
         "-m",
         "pytest",
+        *("--memray", "--hide-memray-summary") if memray_supported else (),
         "-r",
         "a",
         f"--color={'yes' if 'GITHUB_ACTIONS' in os.environ else 'auto'}",
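
As an aside, the starred argument added above is conditional tuple unpacking: the splat over a ternary parses in call-argument position (as in session.run here) and contributes the extra flags only when the condition holds. A toy sketch of the idiom, with names invented for illustration:

    def run(*args: str) -> None:
        print(args)

    memray_supported = True  # toy value
    run(
        "python",
        "-m",
        "pytest",
        *("--memray", "--hide-memray-summary") if memray_supported else (),
        "-r",
        "a",
    )
    # ('python', '-m', 'pytest', '--memray', '--hide-memray-summary', '-r', 'a')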
172 changes: 134 additions & 38 deletions src/urllib3/response.py
@@ -1,3 +1,4 @@
+import collections
 import io
 import json as _json
 import logging
@@ -11,6 +12,7 @@
 from typing import (
     TYPE_CHECKING,
     Any,
+    Deque,
     Generator,
     Iterator,
     List,
@@ -223,6 +225,63 @@ def _get_decoder(mode: str) -> ContentDecoder:
     return DeflateDecoder()
 
 
+class BytesQueueBuffer:
+    """Memory-efficient bytes buffer
+
+    To return decoded data in read() and still follow the BufferedIOBase API, we need a
+    buffer to always return the correct amount of bytes.
+
+    This buffer should be filled using calls to put()
+
+    Our maximum memory usage is determined by the sum of the size of:
+
+     * self.buffer, which contains the full data
+     * the largest chunk that we will copy in get()
+
+    The worst case scenario is a single chunk, in which case we'll make a full copy of
+    the data inside get().
+    """
+
+    def __init__(self) -> None:
+        self.buffer: Deque[bytes] = collections.deque()
+        self._size: int = 0
+
+    def __len__(self) -> int:
+        return self._size
+
+    def put(self, data: bytes) -> None:
+        self.buffer.append(data)
+        self._size += len(data)
+
+    def get(self, n: int) -> bytes:
+        if not self.buffer:
+            raise RuntimeError("buffer is empty")
+        elif n < 0:
+            raise ValueError("n should be > 0")
+
+        fetched = 0
+        ret = io.BytesIO()
+        while fetched < n:
+            remaining = n - fetched
+            chunk = self.buffer.popleft()
+            chunk_length = len(chunk)
+            if remaining < chunk_length:
+                left_chunk, right_chunk = chunk[:remaining], chunk[remaining:]
+                ret.write(left_chunk)
+                self.buffer.appendleft(right_chunk)
+                self._size -= remaining
+                break
+            else:
+                ret.write(chunk)
+                self._size -= chunk_length
+                fetched += chunk_length
+
+            if not self.buffer:
+                break
+
+        return ret.getvalue()
+
+
 class BaseHTTPResponse(io.IOBase):
     CONTENT_DECODERS = ["gzip", "deflate"]
     if brotli is not None:
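
To make the buffer's contract concrete, a small usage sketch, assuming the BytesQueueBuffer class added above is in scope:

    buf = BytesQueueBuffer()
    buf.put(b"hel")
    buf.put(b"lo world")

    assert len(buf) == 11
    assert buf.get(5) == b"hello"  # spans both chunks; only the second is split
    assert len(buf) == 6           # b" world" stays queued for the next get()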
@@ -512,6 +571,9 @@ def __init__(
         # Determine length of response
         self.length_remaining = self._init_length(request_method)
 
+        # Used to return the correct amount of bytes for partial read()s
+        self._decoded_buffer = BytesQueueBuffer()
+
         # If requested, preload the body.
         if preload_content and not self._body:
             self._body = self.read(decode_content=decode_content)
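
For context, a sketch of the two consumption modes this buffer has to serve (the URL is hypothetical): preloading decodes everything up front, while streaming answers partial read()s from _decoded_buffer:

    import urllib3

    # Preloaded: the body is fully read and decoded at construction time.
    resp = urllib3.PoolManager().request("GET", "https://example.com/")
    print(len(resp.data))

    # Streamed: each read(amt) is served from the decoded buffer.
    resp = urllib3.PoolManager().request(
        "GET", "https://example.com/", preload_content=False
    )
    first_kb = resp.read(1024)
    resp.release_conn()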
@@ -720,6 +782,48 @@ def _fp_read(self, amt: Optional[int] = None) -> bytes:
         # StringIO doesn't like amt=None
         return self._fp.read(amt) if amt is not None else self._fp.read()
 
+    def _raw_read(
+        self,
+        amt: Optional[int] = None,
+    ) -> bytes:
+        """
+        Reads `amt` of bytes from the socket.
+        """
+        if self._fp is None:
+            return None  # type: ignore[return-value]
+
+        fp_closed = getattr(self._fp, "closed", False)
+
+        with self._error_catcher():
+            data = self._fp_read(amt) if not fp_closed else b""
+            if amt is not None and amt != 0 and not data:
+                # Platform-specific: Buggy versions of Python.
+                # Close the connection when no data is returned
+                #
+                # This is redundant to what httplib/http.client _should_
+                # already do. However, versions of python released before
+                # December 15, 2012 (http://bugs.python.org/issue16298) do
+                # not properly close the connection in all cases. There is
+                # no harm in redundantly calling close.
+                self._fp.close()
+                if (
+                    self.enforce_content_length
+                    and self.length_remaining is not None
+                    and self.length_remaining != 0
+                ):
+                    # This is an edge case that httplib failed to cover due
+                    # to concerns of backward compatibility. We're
+                    # addressing it here to make sure IncompleteRead is
+                    # raised during streaming, so all calls with incorrect
+                    # Content-Length are caught.
+                    raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
+
+        if data:
+            self._fp_bytes_read += len(data)
+            if self.length_remaining is not None:
+                self.length_remaining -= len(data)
+        return data
+
     def read(
         self,
         amt: Optional[int] = None,
@@ -750,51 +854,43 @@ def read(
         if decode_content is None:
             decode_content = self.decode_content
 
-        if self._fp is None:
-            return None  # type: ignore[return-value]
+        if amt is not None:
+            cache_content = False
 
-        flush_decoder = False
-        fp_closed = getattr(self._fp, "closed", False)
+            if len(self._decoded_buffer) >= amt:
+                return self._decoded_buffer.get(amt)
 
-        with self._error_catcher():
-            data = self._fp_read(amt) if not fp_closed else b""
-            if amt is None:
-                flush_decoder = True
-            else:
-                cache_content = False
-                if (
-                    amt != 0 and not data
-                ):  # Platform-specific: Buggy versions of Python.
-                    # Close the connection when no data is returned
-                    #
-                    # This is redundant to what httplib/http.client _should_
-                    # already do. However, versions of python released before
-                    # December 15, 2012 (http://bugs.python.org/issue16298) do
-                    # not properly close the connection in all cases. There is
-                    # no harm in redundantly calling close.
-                    self._fp.close()
-                    flush_decoder = True
-                    if (
-                        self.enforce_content_length
-                        and self.length_remaining is not None
-                        and self.length_remaining != 0
-                    ):
-                        # This is an edge case that httplib failed to cover due
-                        # to concerns of backward compatibility. We're
-                        # addressing it here to make sure IncompleteRead is
-                        # raised during streaming, so all calls with incorrect
-                        # Content-Length are caught.
-                        raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
+        data = self._raw_read(amt)
 
-        if data:
-            self._fp_bytes_read += len(data)
-            if self.length_remaining is not None:
-                self.length_remaining -= len(data)
+        flush_decoder = False
+        if amt is None:
+            flush_decoder = True
+        elif amt != 0 and not data:
+            flush_decoder = True
 
-            data = self._decode(data, decode_content, flush_decoder)
-            if cache_content:
-                self._body = data
+        if not data and len(self._decoded_buffer) == 0:
+            return data
+
+        if amt is None:
+            data = self._decode(data, decode_content, flush_decoder)
+            if cache_content:
+                self._body = data
+        else:
+            # do not waste memory on buffer when not decoding
+            if not decode_content:
+                return data
+
+            decoded_data = self._decode(data, decode_content, flush_decoder)
+            self._decoded_buffer.put(decoded_data)
+
+            while len(self._decoded_buffer) < amt and data:
+                # TODO make sure to initially read enough data to get past the headers
+                # For example, the GZ file header takes 10 bytes, we don't want to read
+                # it one byte at a time
+                data = self._raw_read(amt)
+                decoded_data = self._decode(data, decode_content, flush_decoder)
+                self._decoded_buffer.put(decoded_data)
+            data = self._decoded_buffer.get(amt)
 
         return data
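
Why the decoded buffer is necessary at all: a tiny raw read can decode to far more than amt bytes, and the surplus has to be retained for later read() calls. A standard-library-only sketch of the effect:

    import gzip
    import zlib

    payload = b"x" * 10_000
    compressed = gzip.compress(payload)  # a run of identical bytes shrinks to a few dozen bytes
    print(len(compressed))

    decoder = zlib.decompressobj(16 + zlib.MAX_WBITS)  # 16 + MAX_WBITS selects gzip framing
    decoded = decoder.decompress(compressed[:64])      # one small "socket read"
    print(len(decoded))  # roughly 10_000 decoded bytes; a read(1024) must buffer the rest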
