Limit size of parsed header lines and message bodies
vfaronov committed Jul 17, 2017
1 parent 9a6ea5d commit 8ca0e31
Showing 6 changed files with 102 additions and 30 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.rst
@@ -7,6 +7,9 @@ Unreleased
 Changed
 -------
 - Notice `1277`_ (obsolete 'X-' prefix) is now reported only once per message.
+- HTTPolice no longer attempts to process very long header lines (currently
+  16 KiB; they fail with notice `1006`_/`1009`_) or very long message bodies
+  (currently 1 GiB; notice `1298`_).
 - The syntax of `chunk extensions`_ is no longer checked.
 
 Added
@@ -18,6 +21,8 @@ Added
 
 .. _Forwarded: https://tools.ietf.org/html/rfc7239
 .. _chunk extensions: https://tools.ietf.org/html/rfc7230#section-4.1.1
+.. _1009: http://pythonhosted.org/HTTPolice/notices.html#1009
+.. _1298: http://pythonhosted.org/HTTPolice/notices.html#1298
 .. _1296: http://pythonhosted.org/HTTPolice/notices.html#1296
 .. _1297: http://pythonhosted.org/HTTPolice/notices.html#1297
 
77 changes: 48 additions & 29 deletions httpolice/framing1.py
@@ -30,6 +30,9 @@
 STATUS_CODE = re.compile(u'^[0-9]{3}$')
 
 
+MAX_BODY_SIZE = 1024 * 1024 * 1024
+
+
 def parse_streams(inbound, outbound, scheme=None):
     """Parse one or two HTTP/1.x streams.
@@ -118,6 +121,25 @@ def _parse_request_heading(stream, scheme=None):
     return req
 
 
+def _process_content_length(msg, stream):
+    n = msg.headers.content_length.value
+    if n is Unavailable:
+        msg.body = Unavailable
+        stream.sane = False
+    else:
+        if n > MAX_BODY_SIZE:
+            msg.body = Unavailable
+            stream.sane = False
+            msg.complain(1298, place=msg.headers.content_length, size=n,
+                         max_size=MAX_BODY_SIZE)
+        else:
+            try:
+                msg.body = stream.read(n)
+            except ParseError as exc:
+                msg.body = Unavailable
+                msg.complain(1004, error=exc)
+
+
 def _parse_request_body(req, stream):
     # RFC 7230 section 3.3.3.
 
@@ -133,16 +155,7 @@ def _parse_request_body(req, stream):
         _decode_transfer_coding(req, codings.pop())
 
     elif req.headers.content_length:
-        n = req.headers.content_length.value
-        if n is Unavailable:
-            req.body = Unavailable
-            stream.sane = False
-        else:
-            try:
-                req.body = stream.read(n)
-            except ParseError as exc:
-                req.body = Unavailable
-                req.complain(1004, error=exc)
+        _process_content_length(req, stream)
 
     else:
         req.body = b''
@@ -220,16 +233,7 @@ def _parse_response_body(resp, stream):
         _decode_transfer_coding(resp, codings.pop())
 
     elif resp.headers.content_length.is_present:
-        n = resp.headers.content_length.value
-        if n is Unavailable:
-            resp.body = Unavailable
-            stream.sane = False
-        else:
-            try:
-                resp.body = stream.read(n)
-            except ParseError as exc:
-                resp.body = Unavailable
-                resp.complain(1004, error=exc)
+        _process_content_length(resp, stream)
 
     else:
         resp.body = stream.read()
@@ -283,7 +287,17 @@ def _decode_transfer_coding(msg, coding):
         msg.body = Unavailable
 
 
-def _parse_chunk(stream):
+class BodyTooLongError(Exception):
+
+    def __init__(self, size, max_size):
+        super(BodyTooLongError, self).__init__(u'body longer than %d bytes' %
+                                               max_size)
+        self.size = size
+        self.max_size = max_size
+
+
+def _parse_chunk(stream, data):
+    current_size = sum(len(c) for c in data)
     with stream.parsing(chunk):
         pos = stream.tell()
         (size_s, _, _) = stream.readline().partition(u';')
@@ -293,28 +307,33 @@ def _parse_chunk(stream):
         except ValueError:
             raise stream.error(pos)
         if size == 0:
-            return b''
+            return False
+        elif size + current_size > MAX_BODY_SIZE:
+            stream.sane = False
+            raise BodyTooLongError(size + current_size, MAX_BODY_SIZE)
         else:
-            data = stream.read(size)
+            data.append(stream.read(size))
             stream.readlineend()
-            return data
+            return True
 
 
 def _parse_chunked(msg, stream):
     data = []
+    place = u'chunked framing'
     try:
-        chunk_data = _parse_chunk(stream)
-        while chunk_data:
-            data.append(chunk_data)
-            chunk_data = _parse_chunk(stream)
+        while _parse_chunk(stream, data):
+            pass
         trailer = parse_header_fields(stream)
         with stream.parsing(chunked_body):
             stream.readlineend()
     except ParseError as e:
         msg.complain(1005, error=e)
         msg.body = Unavailable
+    except BodyTooLongError as e:
+        msg.complain(1298, place=place, size=e.size, max_size=e.max_size)
+        msg.body = Unavailable
     else:
-        stream.dump_complaints(msg.complain, place=u'chunked framing')
+        stream.dump_complaints(msg.complain, place=place)
         msg.body = b''.join(data)
         msg.trailer_entries = trailer
         if trailer:
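The chunked path caps the body by checking each chunk's declared size against the running total before reading it, so a hostile size line is rejected without ever turning into an enormous read. Below is a minimal standalone sketch of the same accounting, not HTTPolice's actual parser: it assumes simplified framing (no chunk extensions or trailers) and uses a plain io.BytesIO in place of the library's Stream class.

    import io

    MAX_BODY_SIZE = 1024 * 1024 * 1024   # the same 1 GiB cap as framing1.py


    class BodyTooLongError(Exception):
        pass


    def read_chunked(stream, max_size=MAX_BODY_SIZE):
        # Accumulate chunks, checking the *declared* size before reading,
        # so an absurd size line never triggers a gigantic allocation.
        data = []
        total = 0
        while True:
            size = int(stream.readline().split(b';')[0], 16)
            if size == 0:
                return b''.join(data)      # last-chunk marker
            total += size
            if total > max_size:
                raise BodyTooLongError(total)
            data.append(stream.read(size))
            stream.readline()              # consume the CRLF after chunk data

    body = io.BytesIO(b'5\r\nHello\r\n7\r\n world!\r\n0\r\n\r\n')
    assert read_chunked(body) == b'Hello world!'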
5 changes: 5 additions & 0 deletions httpolice/notices.xml
@@ -2180,4 +2180,9 @@ One non-obvious thing is how references work
 <explain>If this was intended to be <var ref="n_elements"/> parameters for a single proxy hop, then the pairs must be separated with semicolons, not commas.</explain>
 </comment>
 
+<debug id="1298">
+<title>Body is too long to be checked</title>
+<explain>This message’s <var ref="place"/> indicates that the body is at least <var ref="size"/> bytes long. HTTPolice does not attempt to process bodies longer than <var ref="max_size"/> bytes. The rest of the stream will not be processed either.</explain>
+</debug>
+
 </notices>
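The <var ref="..."/> placeholders in this notice are filled from the keyword arguments of the matching complain() calls; as a recap, both new call sites in framing1.py above supply all three:

    # Content-Length path: `place` is the offending header entry.
    msg.complain(1298, place=msg.headers.content_length, size=n,
                 max_size=MAX_BODY_SIZE)

    # Chunked path: `place` is the literal u'chunked framing'.
    msg.complain(1298, place=place, size=e.size, max_size=e.max_size)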
9 changes: 8 additions & 1 deletion httpolice/stream.py
Expand Up @@ -15,6 +15,8 @@ class Stream(object):
as similarly-named methods of file objects.
"""

max_line_length = 16 * 1024

def __init__(self, file_, name=None):
self.file = file_
self.name = name
@@ -62,10 +64,15 @@ def read(self, n=-1):
 
     def readline(self, decode=True):
         pos = self.tell()
-        r = self.file.readline()
+        r = self.file.readline(self.max_line_length)
        if self.peek() == b'':
             self.eof = True
         if not r.endswith(b'\n'):
+            if len(r) >= self.max_line_length:
+                raise self.error(
+                    pos,
+                    expected=u'no more than %d bytes before end of line' %
+                             self.max_line_length)
             raise self.error(pos, expected=u'data terminated by end of line')
 
         if len(r) >= 2 and r[-2:-1] == b'\r':
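This change leans on the optional size argument that binary file objects' readline() already accepts: at most that many bytes are returned, even if no newline has been seen yet. A line that comes back at exactly the limit without a trailing newline was therefore truncated, which is what the new error branch detects. A quick standalone illustration with io.BytesIO:

    import io

    MAX_LINE = 16 * 1024                  # mirrors Stream.max_line_length

    stream = io.BytesIO(b'x' * 20000 + b'\r\n')
    line = stream.readline(MAX_LINE)

    assert len(line) == MAX_LINE          # readline() stopped at the cap
    assert not line.endswith(b'\n')       # no terminator: line was truncated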
23 changes: 23 additions & 0 deletions test/combined_data/1298_1
@@ -0,0 +1,23 @@
+1298 1298 1007 1010
+
+======== BEGIN INBOUND STREAM ========
+PUT /articles/123/ HTTP/1.1
+Host: example.com
+User-Agent: demo
+Content-Type: text/plain
+Content-Length: 8429245833248012
+
+Hello world!
+
+======== BEGIN OUTBOUND STREAM ========
+HTTP/1.1 201 Created
+Date: Thu, 31 Dec 2015 18:26:56 GMT
+Content-Type: text/plain
+Transfer-Encoding: chunked
+
+3e
+A new article has been created with the following content:
+
+
+1df25abb9d7d0c
+Hello world!
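Both streams in this test case declare the same absurd size: the chunk-size line 1df25abb9d7d0c is hexadecimal for 8429245833248012, the very value in the request's Content-Length, and either one dwarfs the 1 GiB cap, so each stream earns its own notice 1298, matching the doubled entry in the expected list above:

    >>> int('1df25abb9d7d0c', 16)
    8429245833248012
    >>> int('1df25abb9d7d0c', 16) > 1024 ** 3
    True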
13 changes: 13 additions & 0 deletions test/test_streams_input.py
@@ -359,3 +359,16 @@ def test_rearrange():
     assert exchanges[8].request is None
     assert [complaint.id for complaint in exchanges[8].complaints] == [1279]
     assert exchanges[9].request.target == u'/08'
+
+
+def test_super_long_headers(tmpdir):
+    req_path = tmpdir.join('request.dat')
+    with req_path.open('wb') as req_file:
+        req_file.write(b'GET / HTTP/1.1\r\n'
+                       b'Host: example.com\r\n'
+                       b'User-Agent: test\r\n'
+                       b'Accept-Language: ' + (b'en, ' * 4096) + b'\r\n'
+                       b'\r\n')
+    exchanges = load(req_stream_input, [str(req_path)])
+    assert exchanges[0].request is None
+    assert [complaint.id for complaint in exchanges[0].complaints] == [1006]
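The padding alone already fills the cap: b'en, ' is 4 bytes, so the repeated value is exactly 16 KiB before the header name and CRLF are even counted, pushing the line past Stream.max_line_length and aborting the parse with notice 1006:

    >>> len(b'en, ' * 4096)
    16384
    >>> 17 + len(b'en, ' * 4096) + 2 > 16 * 1024   # 'Accept-Language: ' + value + CRLF
    True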
