Skip to content

Commit

Permalink
HTTP response header encoding (#905) (#909)
Browse files Browse the repository at this point in the history
* cosmetics: work around XEmacs highlighting bug

* HTTP response header encoding support

* reference PR in `CHANGES.rst` (for documentation purposes)

* fix typo

* Revert "cosmetics: work around XEmacs highlighting bug"

This reverts commit e4749c9.
  • Loading branch information
dataflake committed Oct 8, 2020
1 parent bea75e6 commit 2adcb95
Show file tree
Hide file tree
Showing 3 changed files with 215 additions and 4 deletions.
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ Backwards incompatible changes
Features
++++++++

- Add support for encoding HTTP response header values
(`#905 <https://github.com/zopefoundation/Zope/pull/905>`_)

- Add support for Python 3.9.

- New interface ``Products.PageTemplates.interfaces.IZopeAwareEngine``.
Expand Down
138 changes: 135 additions & 3 deletions src/ZPublisher/HTTPResponse.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
import sys
import time
import zlib
from email.header import Header
from email.message import _parseparam
from email.utils import encode_rfc2231
from io import BytesIO
from io import IOBase
from urllib.parse import quote
Expand Down Expand Up @@ -94,7 +97,9 @@


def _scrubHeader(name, value):
    """Return ``(name, value)`` as strings with ``_CRLF`` matches removed.

    *name* is always passed through ``str()``; *value* is converted with
    ``str()`` only when it is not already a ``str`` instance.  Each item
    is then split on the module-level ``_CRLF`` splitter (defined outside
    this view; presumably it matches line breaks) and the pieces are
    re-joined without a separator.
    """
    if not isinstance(value, str):
        value = str(value)
    return ''.join(_CRLF.split(str(name))), ''.join(_CRLF.split(value))


_NOW = None # overwrite for testing
def listHeaders(self):
    """ Return a list of (key, value) pairs for our headers.

    o Do appropriate case normalization.

    o Encode header values via `header_encoding_registry`

    The list starts with the fixed ``X-Powered-By`` pair, followed by the
    entries of ``self.headers``, the pairs from ``self._cookie_list()``
    and finally the entries of ``self.accumulated_headers``.  Every header
    is emitted exactly once, with its value passed through the registry.
    """
    encode = header_encoding_registry.encode

    result = [
        ('X-Powered-By', 'Zope (www.zope.org), Python (www.python.org)')
    ]

    for key, value in self.headers.items():
        if key.lower() == key:
            # only change non-literal header names
            key = '-'.join([x.capitalize() for x in key.split('-')])
        result.append((key, encode(key, value)))

    result.extend(self._cookie_list())
    # accumulated headers keep their name as given; only the value is
    # run through the encoding registry
    result.extend(
        (key, encode(key, value))
        for key, value in self.accumulated_headers
    )
    return result

def _unauthorized(self):
Expand Down Expand Up @@ -1088,3 +1097,126 @@ def __bytes__(self):

def __str__(self):
    # Always raises: no text representation is provided here.
    # NOTE(review): the diff context shows a `__bytes__` method right
    # before this one; presumably that is the supported conversion --
    # confirm against the full class.
    raise NotImplementedError


# HTTP header encoding
class HeaderEncodingRegistry(dict):
    """Encode HTTP headers.

    HTTP/1.1 uses `ISO-8859-1` as charset for its headers
    (the modern spec (RFC 7230-7235) has deprecated non ASCII characters
    but for the sake of older browsers we still use `ISO-8859-1`).
    Header values need encoding if they contain characters
    not expressible in this charset.

    HTTP/1.1 is based on MIME
    ("Multimedia Internet Mail Extensions" RFC 2045-2049).
    MIME knows about 2 header encodings:

    - one for parameter values (RFC 2231)

    - and one for words as part of text, phrase or comment (RFC 2047)

    For use with HTTP/1.1 MIME's parameter value encoding (RFC 2231)
    was specialized and simplified via RFC 5987 and RFC 8187.

    For efficiency reasons and because HTTP is an extensible
    protocol (an application can use headers not specified
    by HTTP), we use an encoding registry to guide the header encoding.
    An application can register an encoding for specific keys and/or
    a default encoding to be used for keys without specific registration.
    If there is neither a specific encoding nor a default encoding,
    a header value remains unencoded.
    Header values are encoded only if they contain non `ISO-8859-1`
    characters.

    The mapping keys are lower-cased header names (or `None` for the
    default registration); the values are ``(encoder, kw)`` pairs.
    """

    def register(self, header, encoder, **kw):
        """register *encoder* as encoder for header *header*.

        If *encoder* is `None`, this indicates that *header* should not
        get encoded.

        If *header* is `None`, this indicates that *encoder* is defined
        as the default encoder.

        When encoding is necessary, *encoder* is called with
        the header value and the keywords specified by *kw*.
        """
        if header is not None:
            header = header.lower()
        self[header] = encoder, kw

    def unregister(self, header):
        """remove any registration for *header*.

        *header* can be either a header name or `None`.
        In the latter case, a default registration is removed.
        Removing a registration that does not exist is a no-op.
        """
        if header is not None:
            header = header.lower()
        self.pop(header, None)

    def encode(self, header, value):
        """encode *value* as specified for *header*.

        Encoding only takes place if *value* is a `str` containing
        non ISO-8859-1 characters and an encoder applies to *header*
        (a specific registration wins over the default one).
        In all other cases *value* is returned unchanged.
        """
        if not isinstance(value, str):
            return value
        reg = self.get(header.lower()) or self.get(None)
        if reg is None:
            return value
        encoder, kw = reg
        # an explicit `None` encoder means "never encode this header"
        if encoder is None or non_latin_1(value) is None:
            return value
        return encoder(value, **kw)


# Bound `search` of a pattern matching any character above U+00FF, i.e.
# any character that cannot be represented in ISO-8859-1.  Calling it
# with a string returns a match object for the first such character or
# `None` if there is none.
non_latin_1 = re.compile(r"[^\x00-\xff]").search


def encode_words(value):
    """RFC 2047 word encoding.

    Note: treats *value* as unstructured data
    and therefore must not be applied for headers with
    a structured value (unless the structure is guaranteed
    to only contain ISO-8859-1 chars).
    """
    # the very large maximum line length keeps the result on one line
    header = Header(value, charset='utf-8', maxlinelen=1000000)
    return header.encode()


def encode_params(value):
    """RFC 5987(8187) (specialized from RFC 2231) parameter encoding.

    This encodes parameters as specified by RFC 5987 using
    fixed `UTF-8` encoding (as required by RFC 8187).
    However, all parameters with non latin-1 values are
    automatically transformed and a `*` suffixed parameter is added
    (RFC 8187 allows this only for parameters explicitly specified
    to have this behavior).

    Many HTTP headers use `,` separated lists. For simplicity,
    such headers are not supported (we would need to recognize
    `,` inside quoted strings as special).

    For every parameter ``name=value`` whose value contains non latin-1
    characters:

    - unless a ``name*`` parameter is already present, ``name*`` with the
      percent encoded UTF-8 value is appended at the end (at most once
      per name, even if ``name`` itself occurs repeatedly);

    - the original value is kept for clients not understanding RFC 5987
      but with each non latin-1 character replaced by ``?``.

    Returns the re-assembled header value with ``"; "`` between items.
    """
    params = []
    for item in _parseparam(";" + value):
        item = item.strip()
        if not item:
            continue
        params.append([s.strip() for s in item.split("=", 1)])
    known_params = {p[0] for p in params}
    extended = []  # derived `name*` parameters, appended after the others
    for p in params:
        if len(p) != 2 or not non_latin_1(p[1]):
            continue  # no value or nothing to encode
        ext_name = p[0] + "*"
        if ext_name not in known_params:
            pv = p[1]
            if pv.startswith('"'):
                # remove quotes; an unterminated quoted value only loses
                # its opening quote, never a character of its content
                pv = pv[1:-1] if len(pv) > 1 and pv.endswith('"') else pv[1:]
            extended.append([ext_name, encode_rfc2231(pv, "utf-8", None)])
            # remember the addition so that a repeated parameter does
            # not produce a second `name*`
            known_params.add(ext_name)
        # backward compatibility for clients not understanding RFC 5987
        p[1] = p[1].encode("iso-8859-1", "replace").decode("iso-8859-1")
    return "; ".join("=".join(p) for p in params + extended)


# Module-level registry instance; `listHeaders` uses its `encode` method.
# By default only `Content-Type` and `Content-Disposition` get an encoder
# (parameter encoding via `encode_params`); no default encoder is
# registered, so all other headers stay unencoded.
header_encoding_registry = HeaderEncodingRegistry()
header_encoding_registry.register("content-type", encode_params)
header_encoding_registry.register("content-disposition", encode_params)
78 changes: 77 additions & 1 deletion src/ZPublisher/tests/testHTTPResponse.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@
from zExceptions import NotFound
from zExceptions import ResourceLockedError
from zExceptions import Unauthorized
from ZPublisher.HTTPResponse import make_content_disposition

from ..HTTPResponse import encode_params
from ..HTTPResponse import encode_words
from ..HTTPResponse import header_encoding_registry
from ..HTTPResponse import make_content_disposition


class HTTPResponseTests(unittest.TestCase):
Expand Down Expand Up @@ -1375,6 +1379,23 @@ def test_isHTML_not_decodable_bytes(self):
response = self._makeOne()
self.assertFalse(response.isHTML('bïñårÿ'.encode('latin1')))

def test_header_encoding(self):
    """`listHeaders` encodes registered headers and leaves others alone."""
    response = self._makeOne()
    response.setHeader("unencoded1", "€")
    response.setHeader("content-disposition", "a; p=€")
    response.addHeader("unencoded2", "€")
    response.addHeader("content-disposition", "a2; p2=€")

    headers = response.listHeaders()[1:]  # drop `X-Powered...`
    # the two `setHeader` entries come first, the `addHeader` ones after
    set_headers = dict(headers[:2])
    added_headers = dict(headers[2:])

    # for some reason, `set` headers change their name
    # while `add` headers do not
    self.assertEqual(set_headers["Unencoded1"], "€")
    self.assertEqual(added_headers["unencoded2"], "€")
    self.assertEqual(
        set_headers["Content-Disposition"],
        "a; p=?; p*=utf-8''%E2%82%AC")
    self.assertEqual(
        added_headers["content-disposition"],
        "a2; p2=?; p2*=utf-8''%E2%82%AC")


class MakeDispositionHeaderTests(unittest.TestCase):

Expand All @@ -1400,3 +1421,58 @@ def test_unicode(self):
make_content_disposition('inline', 'ıq.png'),
'inline; filename="b\'q.png\'"; filename*=UTF-8\'\'%C4%B1q.png'
)


class TestHeaderEncodingRegistry(unittest.TestCase):
    """Tests for the header encoding registry and the provided encoders."""

    def setUp(self):
        # snapshot the global registrations so that tests may change them
        self._copy = header_encoding_registry.copy()

    def tearDown(self):
        # restore the registrations saved in `setUp`
        header_encoding_registry.clear()
        header_encoding_registry.update(self._copy)

    def test_default_registrations(self):
        registry = header_encoding_registry
        self.assertIn('content-type', registry)
        self.assertEqual(
            registry["content-disposition"], (encode_params, {}))

    def test_encode(self):
        registry = header_encoding_registry

        def encode(value, param):
            return param

        registry.register("my-header", encode, param=1)
        # non-ISO-8859-1 encoded
        self.assertEqual(registry.encode("my-header", "€"), 1)
        # ISO-8859-1 not encoded
        self.assertEqual(registry.encode("my-header", "ä"), "ä")
        # unregistered not encoded
        self.assertEqual(registry.encode("my-header2", "€"), "€")
        # test header name not case sensitive
        self.assertEqual(registry.encode("My-Header", "€"), 1)
        # default
        registry.register(None, encode, param=2)
        self.assertEqual(registry.encode("my-header2", "€"), 2)
        self.assertEqual(registry.encode("my-header", "€"), 1)

    def test_encode_words(self):
        encoded = encode_words("ä")
        self.assertEqual(encoded, "=?utf-8?b?w6Q=?=")

    def test_encode_params(self):
        encoded = encode_params('abc; p1=1; p2="2"; p3="€"; p4=€; '
                                'p5="€"; p5*=5')
        self.assertEqual(
            encoded,
            'abc; p1=1; p2="2"; p3="?"; p4=?; p5="?"; p5*=5; '
            'p3*=utf-8\'\'%E2%82%AC; p4*=utf-8\'\'%E2%82%AC')

    def test_case_insensitivity(self):
        registry = header_encoding_registry
        registry.register("HdR", lambda value: 0)
        # Note: case insensitivity not implemented for `dict` methods
        self.assertIn("hdr", registry)
        self.assertEqual(registry.encode("HDR", "€"), 0)
        registry.unregister("hDr")
        registry.unregister("hDr")  # no exception
        self.assertNotIn("hdr", registry)

0 comments on commit 2adcb95

Please sign in to comment.