Skip to content

Commit

Permalink
header encoding: when serializing HTTP headers that are non-ascii, at…
Browse files Browse the repository at this point in the history
…tempt to

%-encode any non-ascii headers as utf-8 as specified in RFC 5987, RFC 8187
Add StatusAndHeaders.to_ascii_bytes() which ensures an ascii only encoding of the headers
Addresses #38
  • Loading branch information
ikreymer committed Oct 6, 2018
1 parent 3f6e2d7 commit eca8182
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 5 deletions.
20 changes: 20 additions & 0 deletions test/test_statusandheaders.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
>>> st1 = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
>>> st1
Expand Down Expand Up @@ -208,3 +211,20 @@ def test_validate_status():
assert not StatusAndHeaders('Bad OK', []).validate_statusline('204 No Content')


def test_non_ascii():
    """A quoted non-ascii parameter value is written in the
    ``name*=UTF-8''<percent-encoded>`` form by to_ascii_bytes().
    """
    status_headers = StatusAndHeaders(
        '200 OK',
        [('Custom-Header', u'attachment; filename="Éxamplè"')])

    # adjacent literals are concatenated into one expected headers block
    expected = (
        "200 OK\r\n"
        "Custom-Header: attachment; filename*=UTF-8''%C3%89xampl%C3%A8\r\n"
        "\r\n"
    )

    serialized = status_headers.to_ascii_bytes().decode('ascii')
    assert serialized == expected

def test_non_ascii_2():
    """Several parameters in one header: both the quoted and the unquoted
    non-ascii values are percent-encoded, while the bare tokens
    (``param``, ``another``) are left as they are.
    """
    status_headers = StatusAndHeaders(
        '200 OK',
        [('Custom-Header', u'value; filename="Éxamplè"; param; other=испытание; another')])

    # adjacent literals are concatenated into one expected headers block
    expected = (
        "200 OK\r\n"
        "Custom-Header: value; filename*=UTF-8''%C3%89xampl%C3%A8; param; other*=UTF-8''%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5; another\r\n"
        "\r\n"
    )

    serialized = status_headers.to_ascii_bytes().decode('ascii')
    assert serialized == expected
46 changes: 44 additions & 2 deletions test/test_writer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from warcio.statusandheaders import StatusAndHeaders
from warcio.warcwriter import BufferWARCWriter, GzippingWrapper
from warcio.recordloader import ArcWarcRecordLoader
Expand Down Expand Up @@ -63,6 +66,27 @@ def _make_warc_date(cls):
\r\n\
'

# Expected serialization of the 'response-unicode-header' sample record.
# The Content-Disposition filename appears in its %-encoded form
# (filename*=UTF-8''...) inside the HTTP headers block.
# The literal must stay byte-exact: the Content-Length and digest header
# values written below are part of the expected text.
RESPONSE_RECORD_UNICODE_HEADERS = '\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Block-Digest: sha1:4OWI4LV5GWIWVTL2MPL7OHSLNNAQ3H4W\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: 207\r\n\
\r\n\
HTTP/1.0 200 OK\r\n\
Content-Type: text/plain; charset="UTF-8"\r\n\
Content-Disposition: attachment; filename*=UTF-8\'\'%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5.txt\r\n\
Custom-Header: somevalue\r\n\
\r\n\
some\n\
text\r\n\
\r\n\
'


RESPONSE_RECORD_2 = '\
WARC/1.0\r\n\
Expand Down Expand Up @@ -314,6 +338,24 @@ def sample_response_from_buff(writer):
length=len(payload))


# ============================================================================
@sample_record('response-unicode-header', RESPONSE_RECORD_UNICODE_HEADERS)
def sample_response_from_buff(writer):
    """Build a response record whose Content-Disposition header carries a
    non-ascii filename, using the given writer.
    """
    # NOTE(review): another sample function with this same name appears
    # earlier in the module; presumably only the @sample_record
    # registration matters, but the later def rebinds the module-level
    # name -- confirm, and consider a distinct name.
    body = b'some\ntext'

    status_headers = StatusAndHeaders(
        '200 OK',
        [
            ('Content-Type', 'text/plain; charset="UTF-8"'),
            ('Content-Disposition', u'attachment; filename="испытание.txt"'),
            ('Custom-Header', 'somevalue'),
        ],
        protocol='HTTP/1.0')

    return writer.create_warc_record(
        'http://example.com/',
        'response',
        payload=BytesIO(body),
        length=len(body),
        http_headers=status_headers)


# ============================================================================
@sample_record('response_2', RESPONSE_RECORD_2)
def sample_response_2(writer):
Expand Down Expand Up @@ -603,7 +645,8 @@ def test_read_from_stream_no_content_length(self, record_sampler, is_gzip):
stream.seek(0)
parsed_record = ArcWarcRecordLoader().parse_record_stream(DecompressingBufferedReader(stream))

assert full_record.http_headers == parsed_record.http_headers
if 'Content-Disposition' not in record_string:
assert full_record.http_headers == parsed_record.http_headers
assert full_record.raw_stream.read() == parsed_record.raw_stream.read()
assert full_record.rec_headers != parsed_record.rec_headers

Expand Down Expand Up @@ -661,4 +704,3 @@ def validate_response(record):

validate_warcinfo(records[0])


37 changes: 37 additions & 0 deletions warcio/statusandheaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@
from warcio.utils import to_native_str, headers_to_str_headers
import uuid

from six.moves.urllib.parse import quote
import re


#=================================================================
class StatusAndHeaders(object):
ENCODE_HEADER_RX = re.compile(r'[=]["\']?([^;"]+)["\']?(?=[;]?)')
"""
Representation of parsed http-style status line and headers
Status Line is the first line of request/response
Expand Down Expand Up @@ -154,6 +158,39 @@ def to_str(self, filter_func=None):
def to_bytes(self, filter_func=None, encoding='utf-8'):
    """ Serialize via to_str() and encode the result as bytes.

    :param filter_func: passed through unchanged to to_str()
    :param encoding: codec used to encode the serialized text
    :return: the encoded text followed by a terminating CRLF
    """
    text = self.to_str(filter_func)
    encoded = text.encode(encoding)
    return encoded + b'\r\n'

def to_ascii_bytes(self, filter_func=None):
    """ Serialize the status line and headers as ascii-only bytes.

    The text from to_str() is first encoded as ascii as-is. If that
    fails, percent_encode_non_ascii_headers() is called to %-encode
    non-ascii header values per the RFCs, and the encoding is retried.

    NOTE: the fallback rewrites self.headers in place, so after this
    call the object may hold the %-encoded values rather than the
    originals.

    :param filter_func: passed through unchanged to to_str()
    :return: the ascii-encoded text followed by a terminating CRLF
    :raises UnicodeError: if the text is still not ascii-encodable
        after the headers have been %-encoded
    """
    try:
        encoded = self.to_str(filter_func).encode('ascii')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # UnicodeDecodeError covers Python 2, where calling .encode() on
        # a byte str holding non-ascii data first decodes it implicitly
        # as ascii and fails there instead of in the encode step
        self.percent_encode_non_ascii_headers()
        encoded = self.to_str(filter_func).encode('ascii')

    return encoded + b'\r\n'

def percent_encode_non_ascii_headers(self, encoding='UTF-8'):
    """ Encode, in place, any header values that are not plain ascii
    as per:
    https://tools.ietf.org/html/rfc8187#section-3.2.3
    https://tools.ietf.org/html/rfc5987#section-3.2.2

    Each ``name=value`` / ``name="value"`` parameter of a non-ascii
    header value is rewritten as ``name*=<encoding>''<%-encoded value>``.
    If the value is still not ascii afterwards (for example a plain
    single-value header with no ``name=value`` parameter at all), the
    whole original value is %-encoded as a last resort, so that the
    header block can always be serialized as ascii.

    Headers that are already ascii are left untouched.

    NOTE(review): ``encoding`` only sets the charset label written into
    the header; the bytes themselves come from quote() applied to the
    to_native_str() result -- confirm these agree before passing
    anything other than the default.

    :param encoding: charset label used in the ``name*=`` prefix
    """
    def is_ascii(value):
        # UnicodeDecodeError: Python 2 byte str holding non-ascii data
        try:
            value.encode('ascii')
        except (UnicodeEncodeError, UnicodeDecodeError):
            return False
        return True

    def do_encode(m):
        return "*={0}''".format(encoding) + quote(to_native_str(m.group(1)))

    # items are replaced by index only, the list is never resized,
    # so assigning while enumerating is safe
    for index, (curr_name, curr_value) in enumerate(self.headers):
        if is_ascii(curr_value):
            continue

        # %-encode each value in ; name="value"
        new_value = self.ENCODE_HEADER_RX.sub(do_encode, curr_value)

        if not is_ascii(new_value):
            # nothing matched, or non-ascii text remains outside of any
            # parameter value: %-encode the entire original value
            new_value = quote(to_native_str(curr_value))

        self.headers[index] = (curr_name, new_value)

# act like a (case-insensitive) dictionary of headers, much like other
# python http headers apis including http.client.HTTPMessage
# and requests.structures.CaseInsensitiveDict
Expand Down
8 changes: 5 additions & 3 deletions warcio/warcwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,8 @@ def _init_warc_headers(self, uri, record_type, warc_headers_dict):
return warc_headers

def _set_header_buff(self, record):
    """ Serialize the record's HTTP headers and store the resulting
    bytes on the headers object as ``headers_buff``.
    """
    # HTTP headers are written ascii-only, with non-ascii values
    # %-encoded (see to_ascii_bytes for more info)
    ascii_block = record.http_headers.to_ascii_bytes(self.header_filter)
    record.http_headers.headers_buff = ascii_block

def _write_warc_record(self, out, record):
Expand Down Expand Up @@ -273,8 +274,9 @@ def _write_warc_record(self, out, record):

record.rec_headers.replace_header('Content-Length', str(record.length))

# write record headers
out.write(record.rec_headers.to_bytes())
# write record headers -- encoded as utf-8
# WARC headers can be utf-8 per spec
out.write(record.rec_headers.to_bytes(encoding='utf-8'))

# write headers buffer, if any
if record.http_headers:
Expand Down

0 comments on commit eca8182

Please sign in to comment.