Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Encode Non-ASCII HTTP Headers #45

Merged
merged 1 commit into from
Oct 9, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions test/test_statusandheaders.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
>>> st1 = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
>>> st1
Expand Down Expand Up @@ -208,3 +211,20 @@ def test_validate_status():
assert not StatusAndHeaders('Bad OK', []).validate_statusline('204 No Content')


def test_non_ascii():
    """A non-ascii quoted ``filename`` parameter must be serialized as a
    ``filename*=UTF-8''...`` percent-encoded value by to_ascii_bytes().
    """
    status_headers = StatusAndHeaders(
        '200 OK',
        [('Custom-Header', u'attachment; filename="Éxamplè"')])

    expected = "\
200 OK\r\n\
Custom-Header: attachment; filename*=UTF-8''%C3%89xampl%C3%A8\r\n\
\r\n\
"

    # decode() doubles as a check that the serialized block is pure ascii
    serialized = status_headers.to_ascii_bytes().decode('ascii')
    assert serialized == expected

def test_non_ascii_2():
    """Several parameters in one header: each ``name=value`` pair is
    rewritten as ``name*=UTF-8''...`` while bare tokens (``value``,
    ``param``, ``another``) are left untouched.
    """
    status_headers = StatusAndHeaders(
        '200 OK',
        [('Custom-Header', u'value; filename="Éxamplè"; param; other=испытание; another')])

    expected = "\
200 OK\r\n\
Custom-Header: value; filename*=UTF-8''%C3%89xampl%C3%A8; param; other*=UTF-8''%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5; another\r\n\
\r\n\
"

    # decode() doubles as a check that the serialized block is pure ascii
    serialized = status_headers.to_ascii_bytes().decode('ascii')
    assert serialized == expected
46 changes: 44 additions & 2 deletions test/test_writer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from warcio.statusandheaders import StatusAndHeaders
from warcio.warcwriter import BufferWARCWriter, GzippingWrapper
from warcio.recordloader import ArcWarcRecordLoader
Expand Down Expand Up @@ -63,6 +66,27 @@ def _make_warc_date(cls):
\r\n\
'

# Expected serialized WARC response record for the 'response-unicode-header'
# sample: the HTTP Content-Disposition filename appears in its percent-encoded
# ``filename*=UTF-8''...`` form rather than as raw non-ascii text.
RESPONSE_RECORD_UNICODE_HEADERS = '\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Block-Digest: sha1:4OWI4LV5GWIWVTL2MPL7OHSLNNAQ3H4W\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: 207\r\n\
\r\n\
HTTP/1.0 200 OK\r\n\
Content-Type: text/plain; charset="UTF-8"\r\n\
Content-Disposition: attachment; filename*=UTF-8\'\'%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5.txt\r\n\
Custom-Header: somevalue\r\n\
\r\n\
some\n\
text\r\n\
\r\n\
'


RESPONSE_RECORD_2 = '\
WARC/1.0\r\n\
Expand Down Expand Up @@ -314,6 +338,24 @@ def sample_response_from_buff(writer):
length=len(payload))


# ============================================================================
@sample_record('response-unicode-header', RESPONSE_RECORD_UNICODE_HEADERS)
def sample_response_from_buff(writer):
    """Build a response record whose Content-Disposition header carries a
    non-ascii (Cyrillic) filename, to be compared against
    RESPONSE_RECORD_UNICODE_HEADERS.
    """
    # NOTE(review): this function shares its name with the preceding sample
    # function; presumably harmless because @sample_record registers each
    # sample under its own label -- confirm.
    body = b'some\ntext'

    status_headers = StatusAndHeaders(
        '200 OK',
        [('Content-Type', 'text/plain; charset="UTF-8"'),
         ('Content-Disposition', u'attachment; filename="испытание.txt"'),
         ('Custom-Header', 'somevalue')],
        protocol='HTTP/1.0')

    return writer.create_warc_record('http://example.com/', 'response',
                                     payload=BytesIO(body),
                                     length=len(body),
                                     http_headers=status_headers)


# ============================================================================
@sample_record('response_2', RESPONSE_RECORD_2)
def sample_response_2(writer):
Expand Down Expand Up @@ -603,7 +645,8 @@ def test_read_from_stream_no_content_length(self, record_sampler, is_gzip):
stream.seek(0)
parsed_record = ArcWarcRecordLoader().parse_record_stream(DecompressingBufferedReader(stream))

assert full_record.http_headers == parsed_record.http_headers
if 'Content-Disposition' not in record_string:
assert full_record.http_headers == parsed_record.http_headers
assert full_record.raw_stream.read() == parsed_record.raw_stream.read()
assert full_record.rec_headers != parsed_record.rec_headers

Expand Down Expand Up @@ -661,4 +704,3 @@ def validate_response(record):

validate_warcinfo(records[0])


37 changes: 37 additions & 0 deletions warcio/statusandheaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@
from warcio.utils import to_native_str, headers_to_str_headers
import uuid

from six.moves.urllib.parse import quote
import re


#=================================================================
class StatusAndHeaders(object):
ENCODE_HEADER_RX = re.compile(r'[=]["\']?([^;"]+)["\']?(?=[;]?)')
"""
Representation of parsed http-style status line and headers
Status Line if first line of request/response
Expand Down Expand Up @@ -154,6 +158,39 @@ def to_str(self, filter_func=None):
def to_bytes(self, filter_func=None, encoding='utf-8'):
return self.to_str(filter_func).encode(encoding) + b'\r\n'

def to_ascii_bytes(self, filter_func=None):
    """ Serialize the status line and headers as ascii bytes,
        terminated by an extra CRLF.

        The block is first encoded as-is. If that raises
        UnicodeEncodeError, percent_encode_non_ascii_headers() is called
        (rewriting self.headers in place) and the encoding is retried once.

        :param filter_func: passed through unchanged to to_str()
        :return: ascii-encoded bytes followed by b'\\r\\n'
    """
    try:
        encoded = self.to_str(filter_func).encode('ascii')
    except UnicodeEncodeError:
        # at least one header is not plain ascii: %-encode, then retry
        self.percent_encode_non_ascii_headers()
        encoded = self.to_str(filter_func).encode('ascii')

    return encoded + b'\r\n'

def percent_encode_non_ascii_headers(self, encoding='UTF-8'):
    """ Encode any headers that are not plain ascii
        as UTF-8 as per:
        https://tools.ietf.org/html/rfc8187#section-3.2.3
        https://tools.ietf.org/html/rfc5987#section-3.2.2

        Headers that already encode cleanly as ascii are left untouched.
        For any other header, each ``name=value`` parameter is rewritten
        as ``name*=<encoding>''<percent-encoded value>``. If the result is
        still not ascii (e.g. a single-value header with no ``=``
        parameter, or non-ascii text outside a parameter), the entire
        original value is percent-encoded instead, so the header block
        can always be serialized as ascii afterwards.

        self.headers is modified in place.

        :param encoding: charset label written into the ``*=`` prefix.
            NOTE(review): quote() is called without an encoding argument,
            so the escaped bytes are presumably UTF-8 regardless of this
            label -- confirm before passing anything other than 'UTF-8'.
    """
    def do_encode(m):
        return "*={0}''".format(encoding) + quote(to_native_str(m.group(1)))

    def is_ascii(value):
        # UnicodeError covers both UnicodeEncodeError (text strings) and
        # UnicodeDecodeError (Python 2 byte strings with non-ascii bytes)
        try:
            value.encode('ascii')
        except UnicodeError:
            return False
        return True

    for index, (curr_name, curr_value) in enumerate(self.headers):
        if is_ascii(curr_value):
            # plain ascii header, no action needed
            continue

        # %-encode each value in ; name="value"
        new_value = self.ENCODE_HEADER_RX.sub(do_encode, curr_value)

        if not is_ascii(new_value):
            # nothing matched, or non-ascii text remains outside the
            # matched parameters: fall back to %-encoding the whole value
            new_value = quote(to_native_str(curr_value))

        # same-length item replacement, safe while enumerating
        self.headers[index] = (curr_name, new_value)

# act like a (case-insensitive) dictionary of headers, much like other
# python http headers apis including http.client.HTTPMessage
# and requests.structures.CaseInsensitiveDict
Expand Down
8 changes: 5 additions & 3 deletions warcio/warcwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,8 @@ def _init_warc_headers(self, uri, record_type, warc_headers_dict):
return warc_headers

def _set_header_buff(self, record):
    """ Serialize the record's HTTP headers and cache the resulting bytes
        on ``record.http_headers.headers_buff``.

        :param record: record whose ``http_headers`` are serialized,
            filtered through ``self.header_filter``
    """
    # HTTP headers %-encoded as ascii (see to_ascii_bytes for more info).
    # The earlier iso-8859-1 to_bytes() call is dropped: its result was
    # overwritten immediately, and it raised on characters outside Latin-1.
    headers_buff = record.http_headers.to_ascii_bytes(self.header_filter)
    record.http_headers.headers_buff = headers_buff

def _write_warc_record(self, out, record):
Expand Down Expand Up @@ -273,8 +274,9 @@ def _write_warc_record(self, out, record):

record.rec_headers.replace_header('Content-Length', str(record.length))

# write record headers
out.write(record.rec_headers.to_bytes())
# write record headers -- encoded as utf-8
# WARC headers can be utf-8 per spec
out.write(record.rec_headers.to_bytes(encoding='utf-8'))

# write headers buffer, if any
if record.http_headers:
Expand Down