header encoding: when serializing HTTP headers that are non-ASCII, attempt to %-encode any non-ASCII headers as UTF-8, as specified in RFC 5987 and RFC 8187.

Add StatusAndHeaders.to_ascii_bytes(), which ensures an ASCII-only encoding of the headers. Addresses #38.
webrecorder · Oct 6, 2018 · eca8182 · eca8182
1 parent 3f6e2d7
commit eca8182
Show file tree

Hide file tree

Showing 4 changed files with 106 additions and 5 deletions.
diff --git a/test/test_statusandheaders.py b/test/test_statusandheaders.py
@@ -1,3 +1,6 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
 """
 >>> st1 = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
 >>> st1
@@ -208,3 +211,20 @@ def test_validate_status():
     assert not StatusAndHeaders('Bad OK', []).validate_statusline('204 No Content')
 
 
+def test_non_ascii():
+    """A quoted non-ascii parameter value is serialized %-encoded as UTF-8."""
+    headers = [('Custom-Header', u'attachment; filename="Éxamplè"')]
+    status_headers = StatusAndHeaders('200 OK', headers)
+
+    expected = (
+        "200 OK\r\n"
+        "Custom-Header: attachment; filename*=UTF-8''%C3%89xampl%C3%A8\r\n"
+        "\r\n"
+    )
+    assert status_headers.to_ascii_bytes().decode('ascii') == expected
+
+def test_non_ascii_2():
+    """Several parameters, quoted and unquoted, are each %-encoded while
+    the value-less tokens (``param``, ``another``) pass through untouched.
+    """
+    raw_value = u'value; filename="Éxamplè"; param; other=испытание; another'
+    status_headers = StatusAndHeaders('200 OK', [('Custom-Header', raw_value)])
+
+    expected = (
+        "200 OK\r\n"
+        "Custom-Header: value; filename*=UTF-8''%C3%89xampl%C3%A8; param; "
+        "other*=UTF-8''%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5; another\r\n"
+        "\r\n"
+    )
+    assert status_headers.to_ascii_bytes().decode('ascii') == expected
diff --git a/test/test_writer.py b/test/test_writer.py
@@ -1,3 +1,6 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
 from warcio.statusandheaders import StatusAndHeaders
 from warcio.warcwriter import BufferWARCWriter, GzippingWrapper
 from warcio.recordloader import ArcWarcRecordLoader
@@ -63,6 +66,27 @@ def _make_warc_date(cls):
 \r\n\
 '
 
+# Record text paired with the 'response-unicode-header' sample via
+# @sample_record: the Content-Disposition filename appears %-encoded
+# (filename*=UTF-8''...) inside the HTTP headers block.
+RESPONSE_RECORD_UNICODE_HEADERS = '\
+WARC/1.0\r\n\
+WARC-Type: response\r\n\
+WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
+WARC-Target-URI: http://example.com/\r\n\
+WARC-Date: 2000-01-01T00:00:00Z\r\n\
+WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
+WARC-Block-Digest: sha1:4OWI4LV5GWIWVTL2MPL7OHSLNNAQ3H4W\r\n\
+Content-Type: application/http; msgtype=response\r\n\
+Content-Length: 207\r\n\
+\r\n\
+HTTP/1.0 200 OK\r\n\
+Content-Type: text/plain; charset="UTF-8"\r\n\
+Content-Disposition: attachment; filename*=UTF-8\'\'%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5.txt\r\n\
+Custom-Header: somevalue\r\n\
+\r\n\
+some\n\
+text\r\n\
+\r\n\
+'
+
 
 RESPONSE_RECORD_2 = '\
 WARC/1.0\r\n\
@@ -314,6 +338,24 @@ def sample_response_from_buff(writer):
                                      length=len(payload))
 
 
+# ============================================================================
+@sample_record('response-unicode-header', RESPONSE_RECORD_UNICODE_HEADERS)
+def sample_response_from_buff(writer):
+    # Build a 'response' record whose Content-Disposition header carries a
+    # non-ascii filename.
+    # NOTE(review): this reuses the name of the sample_response_from_buff
+    # defined just above in this file and rebinds it at module level --
+    # confirm nothing refers to the earlier function by name.
+    body = b'some\ntext'
+
+    http_headers = StatusAndHeaders(
+        '200 OK',
+        [
+            ('Content-Type', 'text/plain; charset="UTF-8"'),
+            ('Content-Disposition', u'attachment; filename="испытание.txt"'),
+            ('Custom-Header', 'somevalue'),
+        ],
+        protocol='HTTP/1.0',
+    )
+
+    return writer.create_warc_record('http://example.com/', 'response',
+                                     payload=BytesIO(body),
+                                     length=len(body),
+                                     http_headers=http_headers)
+
+
 # ============================================================================
 @sample_record('response_2', RESPONSE_RECORD_2)
 def sample_response_2(writer):
@@ -603,7 +645,8 @@ def test_read_from_stream_no_content_length(self, record_sampler, is_gzip):
         stream.seek(0)
         parsed_record = ArcWarcRecordLoader().parse_record_stream(DecompressingBufferedReader(stream))
 
-        assert full_record.http_headers == parsed_record.http_headers
+        if 'Content-Disposition' not in record_string:
+            assert full_record.http_headers == parsed_record.http_headers
         assert full_record.raw_stream.read() == parsed_record.raw_stream.read()
         assert full_record.rec_headers != parsed_record.rec_headers
 
@@ -661,4 +704,3 @@ def validate_response(record):
 
         validate_warcinfo(records[0])
 
-
diff --git a/warcio/statusandheaders.py b/warcio/statusandheaders.py
@@ -7,9 +7,13 @@
 from warcio.utils import to_native_str, headers_to_str_headers
 import uuid
 
+from six.moves.urllib.parse import quote
+import re
+
 
 #=================================================================
 class StatusAndHeaders(object):
+    ENCODE_HEADER_RX = re.compile(r'[=]["\']?([^;"]+)["\']?(?=[;]?)')
     """
     Representation of parsed http-style status line and headers
     Status Line if first line of request/response
@@ -154,6 +158,39 @@ def to_str(self, filter_func=None):
     def to_bytes(self, filter_func=None, encoding='utf-8'):
         """ Serialize via to_str(filter_func), encode the result with
             ``encoding`` and append a trailing CRLF
         """
         return self.to_str(filter_func).encode(encoding) + b'\r\n'
 
+    def to_ascii_bytes(self, filter_func=None):
+        """ Return the headers block as ascii bytes, followed by a CRLF
+
+            The block is first encoded as-is. Only if that fails are the
+            headers rewritten in place by percent_encode_non_ascii_headers()
+            and the encoding tried once more; a failure of that second
+            attempt propagates to the caller.
+        """
+        try:
+            encoded = self.to_str(filter_func).encode('ascii')
+        except UnicodeEncodeError:
+            # note: this mutates self.headers, not just the returned bytes
+            self.percent_encode_non_ascii_headers()
+            encoded = self.to_str(filter_func).encode('ascii')
+
+        return encoded + b'\r\n'
+
+    def percent_encode_non_ascii_headers(self, encoding='UTF-8'):
+        """ Rewrite, in place, every header value that is not plain ascii
+            so that it can be serialized as ascii, as per:
+            https://tools.ietf.org/html/rfc8187#section-3.2.3
+            https://tools.ietf.org/html/rfc5987#section-3.2.2
+
+            Each ``=value`` parameter of such a header becomes
+            ``*=<encoding>''<percent-encoded value>``. If the header has no
+            ``=value`` parameter to rewrite, the whole value is
+            percent-encoded instead, so that to_ascii_bytes() does not
+            fail on it.
+
+            :param encoding: charset label written into the rewritten
+                parameters; the percent-encoding itself is done by quote()
+        """
+        def do_encode(m):
+            return "*={0}''".format(encoding) + quote(to_native_str(m.group(1)))
+
+        for index, (curr_name, curr_value) in enumerate(self.headers):
+            try:
+                # test if header is ascii encodable, no action needed
+                curr_value.encode('ascii')
+            except UnicodeError:
+                # UnicodeError is the common base of UnicodeEncodeError and
+                # UnicodeDecodeError; unrelated errors are no longer swallowed
+                pass
+            else:
+                continue
+
+            new_value = self.ENCODE_HEADER_RX.sub(do_encode, curr_value)
+            if new_value == curr_value:
+                # no parameter was rewritten (e.g. a single-value header),
+                # so the non-ascii text is still there: %-encode all of it
+                new_value = quote(to_native_str(curr_value))
+
+            self.headers[index] = (curr_name, new_value)
+
     # act like a (case-insensitive) dictionary of headers, much like other
     # python http headers apis including http.client.HTTPMessage
     # and requests.structures.CaseInsensitiveDict

diff --git a/warcio/warcwriter.py b/warcio/warcwriter.py
@@ -222,7 +222,8 @@ def _init_warc_headers(self, uri, record_type, warc_headers_dict):
         return warc_headers
 
     def _set_header_buff(self, record):
-        headers_buff = record.http_headers.to_bytes(self.header_filter, 'iso-8859-1')
+        # HTTP headers %-encoded as ascii (see to_ascii_bytes for more info)
+        headers_buff = record.http_headers.to_ascii_bytes(self.header_filter)
         record.http_headers.headers_buff = headers_buff
 
     def _write_warc_record(self, out, record):
@@ -273,8 +274,9 @@ def _write_warc_record(self, out, record):
 
         record.rec_headers.replace_header('Content-Length', str(record.length))
 
-        # write record headers
-        out.write(record.rec_headers.to_bytes())
+        # write record headers -- encoded as utf-8
+        # WARC headers can be utf-8 per spec
+        out.write(record.rec_headers.to_bytes(encoding='utf-8'))
 
         # write headers buffer, if any
         if record.http_headers: