Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Encode str bodies to Latin-1 instead of UTF-8 #3063

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/urllib3/util/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ def body_to_chunks(

# Bytes or strings become bytes
elif isinstance(body, (str, bytes)):
chunks = (to_bytes(body),)
chunks = (to_bytes(body, "latin-1"),)
content_length = len(chunks[0])

# File-like object, TODO: use seek() and tell() for length?
Expand All @@ -227,7 +227,7 @@ def chunk_readable() -> typing.Iterable[bytes]:
if not datablock:
break
if encode:
datablock = datablock.encode("iso-8859-1")
datablock = datablock.encode("latin-1")
yield datablock

chunks = chunk_readable()
Expand Down
2 changes: 1 addition & 1 deletion test/with_dummyserver/test_chunked_transfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def _test_body(self, data: bytes | str | None) -> None:

assert b"Transfer-Encoding: chunked" in header.split(b"\r\n")
if data:
bdata = data if isinstance(data, bytes) else data.encode("utf-8")
bdata = data if isinstance(data, bytes) else data.encode("latin-1")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is actually suspicious because 1.26.x used utf-8 here:

def _test_body(self, data):
self.start_chunked_handler()
with HTTPConnectionPool(self.host, self.port, retries=False) as pool:
pool.urlopen("GET", "/", data, chunked=True)
header, body = self.buffer.split(b"\r\n\r\n", 1)
assert b"Transfer-Encoding: chunked" in header.split(b"\r\n")
if data:
bdata = data if isinstance(data, bytes) else data.encode("utf-8")
assert b"\r\n" + bdata + b"\r\n" in body
assert body.endswith(b"\r\n0\r\n\r\n")
len_str = body.split(b"\r\n", 1)[0]
stated_len = int(len_str, 16)
assert stated_len == len(bdata)
else:
assert body == b"0\r\n\r\n"
def test_bytestring_body(self):
self._test_body(b"thisshouldbeonechunk\r\nasdf")
def test_unicode_body(self):
self._test_body(u"thisshouldbeonechunk\r\näöüß")

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Weird, but there was no test case for non-ASCII characters?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are a few non-ASCII characters on line 75

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And changing data.encode("utf-8") to data.encode("latin-1") breaks the test in 1.26.x

test_unicode_body failed; it passed 0 out of the required 1 times.
        <class 'AssertionError'>
        assert ((b'\r\n' + b'thisshouldbeonechunk\r\n\xe4\xf6\xfc\xdf') + b'\r\n') in b'1e\r\nthisshouldbeonechunk\r\n\xc3\xa4\xc3\xb6\xc3\xbc\xc3\x9f\r\n0\r\n\r\n'

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Related encoding happens on this line in 1.26.x

chunk = chunk.encode("utf8")

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, good find! So we were encoding chunks with UTF-8 and other bodies with Latin-1, and 2.0 reversed that unintentionally. I'll fix that too.

assert b"\r\n" + bdata + b"\r\n" in body
assert body.endswith(b"\r\n0\r\n\r\n")

Expand Down
2 changes: 1 addition & 1 deletion test/with_dummyserver/test_connectionpool.py
Original file line number Diff line number Diff line change
Expand Up @@ -1089,7 +1089,7 @@ def test_bytes_header(self) -> None:
assert request_headers["User-Agent"] == "test header"

@pytest.mark.parametrize(
"user_agent", ["Schönefeld/1.18.0", "Schönefeld/1.18.0".encode("iso-8859-1")]
"user_agent", ["Schönefeld/1.18.0", "Schönefeld/1.18.0".encode("latin-1")]
)
def test_user_agent_non_ascii_user_agent(self, user_agent: str) -> None:
with HTTPConnectionPool(self.host, self.port, retries=False) as pool:
Expand Down
54 changes: 46 additions & 8 deletions test/with_dummyserver/test_socketlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
from urllib3.util import ssl_, ssl_wrap_socket
from urllib3.util.retry import Retry
from urllib3.util.timeout import Timeout
from urllib3.util.util import to_bytes

from .. import LogRecorder, has_alpn

Expand Down Expand Up @@ -2256,6 +2257,38 @@ def socket_handler(listener: socket.socket) -> None:
assert b"Content-Length: 0\r\n" in sent_bytes
assert b"transfer-encoding" not in sent_bytes.lower()

def test_encode_body_latin_1(self) -> None:
buffer = bytearray()

def socket_handler(listener: socket.socket) -> None:
nonlocal buffer
sock = listener.accept()[0]
sock.settimeout(0)

start = time.perf_counter()
while time.perf_counter() - start < (LONG_TIMEOUT / 2):
try:
buffer += sock.recv(65536)
except OSError:
continue

sock.sendall(
b"HTTP/1.1 200 OK\r\n"
b"Server: example.com\r\n"
b"Content-Length: 0\r\n\r\n"
)
sock.close()

self._start_server(socket_handler)

with HTTPConnectionPool(self.host, self.port, timeout=3) as pool:
resp = pool.request("POST", "/", body="\x80")
assert resp.status == 200

sent_bytes = bytes(buffer)
assert to_bytes("\x80", "latin-1") in sent_bytes
assert to_bytes("\x80", "utf-8") not in sent_bytes

@pytest.mark.parametrize("chunked", [True, False])
@pytest.mark.parametrize("method", ["POST", "PUT", "PATCH"])
@pytest.mark.parametrize("body_type", ["file", "generator", "bytes"])
Expand Down Expand Up @@ -2344,30 +2377,34 @@ def socket_handler(listener: socket.socket) -> None:
self._start_server(socket_handler)

body: typing.Any
# \x80 encodes to two bytes with UTF-8, so it's good way to make sure that
# latin-1 was in fact used
body_str = "x" * 9 + "\x80"
body_bytes = body_str.encode("latin-1")
if body_type == "generator":

def body_generator() -> typing.Generator[bytes, None, None]:
yield b"x" * 10
yield body_bytes

body = body_generator()
should_be_chunked = True

elif body_type == "file":
body = io.BytesIO(b"x" * 10)
body = io.BytesIO(body_bytes)
body.seek(0, 0)
should_be_chunked = True

elif body_type == "file_text":
body = io.StringIO("x" * 10)
body = io.StringIO(body_str)
body.seek(0, 0)
should_be_chunked = True

elif body_type == "bytearray":
body = bytearray(b"x" * 10)
body = bytearray(body_bytes)
should_be_chunked = False

else:
body = b"x" * 10
body = body_bytes
should_be_chunked = False

with HTTPConnectionPool(
Expand All @@ -2385,12 +2422,13 @@ def body_generator() -> typing.Generator[bytes, None, None]:
if should_be_chunked:
assert b"content-length" not in sent_bytes.lower()
assert b"Transfer-Encoding: chunked\r\n" in sent_bytes
assert b"\r\n\r\na\r\nxxxxxxxxxx\r\n0\r\n\r\n" in sent_bytes

expected_str_body = f"\r\n\r\na\r\n{body_str}\r\n0\r\n\r\n"
assert to_bytes(expected_str_body, "latin-1") in sent_bytes
assert to_bytes(expected_str_body, "utf-8") not in sent_bytes
else:
assert b"Content-Length: 10\r\n" in sent_bytes
assert b"transfer-encoding" not in sent_bytes.lower()
assert sent_bytes.endswith(b"\r\n\r\nxxxxxxxxxx")
assert sent_bytes.endswith(to_bytes(f"\r\n\r\n{body_str}", "latin-1"))

@pytest.mark.parametrize(
"header_transform",
Expand Down
Loading