
Reorganize parsing
Since the beginning of HTTPolice, I have been trying to keep header parsing
and HTTP/1.x framing parsing together. Originally it was the exact same
mechanism. Later, when I switched to Earley parsing, it proved too
expensive for framing and I bolted on some regexes derived from the same
grammar combinators. Later came strange things like `simple_parse`.

Eventually it became clear to me that headers and framing are two very
different use cases and they should be separated, which I did in this
commit. The only thing they share now is the `ParseError` class, and the
associated need for framing to have 'hollow' grammar symbols purely
for rendering these `ParseError`s.

This makes the code simpler and clearer, but more importantly, it solves
the problem that we used to read entire TCP streams eagerly into memory
before even starting to parse them (because our regex engine cannot read
from a file on demand).
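
An illustrative sketch of that last point, not HTTPolice code: the names and
the simplified request-line handling below are made up. A regex engine wants
the whole buffer up front, whereas line-by-line framing only ever holds the
current line:

    import re

    REQUEST_LINE = re.compile(br'[A-Z]+ \S+ HTTP/[0-9]\.[0-9]\r\n')

    def frame_with_regex(stream):
        # Old approach: the regex engine works on an in-memory buffer, so
        # the whole TCP stream must be read before parsing can even start.
        data = stream.read()
        return REQUEST_LINE.match(data)

    def frame_incrementally(stream):
        # New approach: read only as much as the current framing step needs.
        line = stream.readline()
        (method, target, version) = line.rstrip(b'\r\n').split(b' ', 2)
        return (method, target, version)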
vfaronov committed Jul 16, 2017
1 parent ff2d2c1 commit e6858ee
Showing 27 changed files with 475 additions and 523 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.rst
@@ -7,12 +7,15 @@ Unreleased
Changed
-------
- Notice `1277`_ (obsolete 'X-' prefix) is now reported only once per message.
- The syntax of `chunk extensions`_ is no longer checked.

Added
-----
- HTTPolice can now use much less memory when parsing long TCP streams.
- Checks for the `Forwarded`_ header (notices `1296`_, `1297`_).

.. _Forwarded: https://tools.ietf.org/html/rfc7239
.. _chunk extensions: https://tools.ietf.org/html/rfc7230#section-4.1.1
.. _1296: http://pythonhosted.org/HTTPolice/notices.html#1296
.. _1297: http://pythonhosted.org/HTTPolice/notices.html#1297

220 changes: 107 additions & 113 deletions httpolice/framing1.py
@@ -2,16 +2,31 @@

"""Parse HTTP/1.x message framing according to RFC 7230."""

import re

from httpolice.citation import RFC
from httpolice.codings import decode_deflate, decode_gzip
from httpolice.exchange import Exchange, complaint_box
from httpolice.known import m, st, tc
from httpolice.parse import ParseError, maybe, skip
from httpolice.parse import ParseError, Symbol
from httpolice.request import Request
from httpolice.response import Response
from httpolice.structure import (FieldName, HeaderEntry, HTTPVersion, Method,
StatusCode, Unavailable, okay)
from httpolice.syntax import rfc7230
from httpolice.syntax.common import CRLF, LF, SP


# Create empty symbols just for referring to them in parse errors.

HTTP_message = Symbol(u'HTTP-message', RFC(7230, section=(3,)))
request_line = Symbol(u'request-line', RFC(7230, section=(3, 1, 1)))
status_line = Symbol(u'status-line', RFC(7230, section=(3, 1, 2)))
header_field = Symbol(u'header-field', RFC(7230, section=(3, 2)))
chunked_body = Symbol(u'chunked-body', RFC(7230, section=(4, 1)))
chunk = Symbol(u'chunk', RFC(7230, section=(4, 1)))
chunk_size = Symbol(u'chunk-size', RFC(7230, section=(4, 1)))


HTTP_VERSION = re.compile(u'^HTTP/[0-9]\\.[0-9]$')


def parse_streams(inbound, outbound, scheme=None):
@@ -35,11 +50,11 @@ def parse_streams(inbound, outbound, scheme=None):
containing neither request nor responses,
but only a notice that indicates some general problem with the streams.
"""
while inbound and inbound.sane:
while inbound and inbound.good:
(req, req_box) = _parse_request(inbound, scheme)
(resps, resp_box) = ([], None)
if req:
if outbound and outbound.sane:
if outbound and outbound.good:
(resps, resp_box) = _parse_responses(outbound, req)
if resps:
if resps[-1].status == st.switching_protocols:
@@ -54,15 +69,14 @@

if inbound and not inbound.eof:
# Some data remains on the inbound stream, but we can't parse it.
yield complaint_box(1007, stream=inbound,
nbytes=len(inbound.consume_rest()))
yield complaint_box(1007, stream=inbound, offset=inbound.tell())

if outbound and outbound.sane:
if outbound and outbound.good:
if inbound:
# We had some requests, but we ran out of them.
# We'll still try to parse the remaining responses on their own.
yield complaint_box(1008, stream=outbound)
while outbound.sane:
while outbound.good:
(resps, resp_box) = _parse_responses(outbound, None)
if resps:
yield Exchange(None, resps)
@@ -71,41 +85,36 @@

if outbound and not outbound.eof:
# Some data remains on the outbound stream, but we can't parse it.
yield complaint_box(1010, stream=outbound,
nbytes=len(outbound.consume_rest()))
yield complaint_box(1010, stream=outbound, offset=outbound.tell())


def _parse_request(stream, scheme=None):
req = _parse_request_heading(stream, scheme)
if req is Unavailable:
box = Exchange(None, [])
stream.dump_complaints(box.complain, place=u'request heading')
return (None, box)
try:
req = _parse_request_heading(stream, scheme)
except ParseError as e:
return (None, complaint_box(1006, error=e))
else:
_parse_request_body(req, stream)
return (req, None)


def _parse_request_heading(stream, scheme=None):
beginning = stream.point
try:
with stream:
method_ = Method(stream.consume_regex(rfc7230.method))
stream.consume_regex(SP)
target = stream.consume_regex(b'[^\\s]+', u'request target')
stream.consume_regex(SP)
version_ = HTTPVersion(stream.consume_regex(rfc7230.HTTP_version))
_parse_line_ending(stream)
entries = parse_header_fields(stream)
except ParseError as e:
stream.sane = False
stream.complain(1006, error=e)
return Unavailable
else:
req = Request(scheme, method_, target, version_, entries, body=None,
remark=u'from %s, offset %d' % (stream.name, beginning))
stream.dump_complaints(req.complain, place=u'request heading')
return req
beginning = stream.tell()
with stream.parsing(request_line):
line = stream.readline()
pieces = line.split(u' ')
if len(pieces) != 3 or not HTTP_VERSION.match(pieces[2]):
raise stream.error(beginning)
method_ = Method(pieces[0])
target = pieces[1]
version_ = HTTPVersion(pieces[2])
entries = parse_header_fields(stream)
with stream.parsing(HTTP_message):
stream.readlineend()
req = Request(scheme, method_, target, version_, entries, body=None,
remark=u'from %s, offset %d' % (stream.name, beginning))
stream.dump_complaints(req.complain, place=u'request heading')
return req


def _parse_request_body(req, stream):
@@ -129,26 +138,24 @@ def _parse_request_body(req, stream):
stream.sane = False
else:
try:
req.body = stream.consume_n_bytes(n)
req.body = stream.read(n)
except ParseError as exc:
req.body = Unavailable
req.complain(1004, error=exc)
stream.sane = False

else:
req.body = b''


def _parse_responses(stream, req):
resps = []
while stream.sane:
while stream.good:
# Parse all responses corresponding to one request.
# RFC 7230 section 3.3.
resp = _parse_response_heading(req, stream)
if resp is Unavailable:
box = Exchange(None, [])
stream.dump_complaints(box.complain, place=u'response heading')
return (resps, box)
try:
resp = _parse_response_heading(req, stream)
except ParseError as e:
return (resps, complaint_box(1009, error=e))
else:
resps.append(resp)
_parse_response_body(resp, stream)
@@ -160,27 +167,25 @@


def _parse_response_heading(req, stream):
beginning = stream.point
try:
with stream:
version_ = HTTPVersion(stream.consume_regex(rfc7230.HTTP_version))
stream.consume_regex(SP)
status = StatusCode(stream.consume_regex(rfc7230.status_code))
stream.consume_regex(SP)
reason = stream.consume_regex(rfc7230.reason_phrase)
_parse_line_ending(stream)
entries = parse_header_fields(stream)
except ParseError as e:
stream.complain(1009, error=e)
stream.sane = False
return Unavailable
else:
resp = Response(
version_, status, reason, entries, body=None,
remark=u'from %s, offset %d' % (stream.name, beginning))
resp.request = req
stream.dump_complaints(resp.complain, place=u'response heading')
return resp
beginning = stream.tell()
with stream.parsing(status_line):
line = stream.readline()
pieces = line.split(u' ', 2)
if len(pieces) != 3 or \
not HTTP_VERSION.match(pieces[0]) or not pieces[1].isdigit():
raise stream.error(beginning)
version_ = HTTPVersion(pieces[0])
status = StatusCode(pieces[1])
reason = pieces[2]
entries = parse_header_fields(stream)
with stream.parsing(HTTP_message):
stream.readlineend()
resp = Response(
version_, status, reason, entries, body=None,
remark=u'from %s, offset %d' % (stream.name, beginning))
resp.request = req
stream.dump_complaints(resp.complain, place=u'response heading')
return resp


def _parse_response_body(resp, stream):
@@ -208,7 +213,7 @@ def _parse_response_body(resp, stream):
codings.pop()
_parse_chunked(resp, stream)
else:
resp.body = stream.consume_rest()
resp.body = stream.read()
while codings and okay(resp.body):
_decode_transfer_coding(resp, codings.pop())

@@ -219,22 +224,13 @@
stream.sane = False
else:
try:
resp.body = stream.consume_n_bytes(n)
resp.body = stream.read(n)
except ParseError as exc:
resp.body = Unavailable
resp.complain(1004, error=exc)
stream.sane = False

else:
resp.body = stream.consume_rest()


def _parse_line_ending(stream):
r = stream.maybe_consume_regex(CRLF)
if r is None:
r = stream.consume_regex(LF, u'line ending')
stream.complain(1224)
return r
resp.body = stream.read()


def parse_header_fields(stream):
@@ -245,28 +241,20 @@
:raises: :class:`ParseError`
"""
entries = []
while True:
name = stream.maybe_consume_regex(rfc7230.field_name)
if name is None:
break
stream.consume_regex(b':')
stream.consume_regex(rfc7230.OWS)
vs = []
while True:
v = stream.maybe_consume_regex(rfc7230.field_content)
if v is None:
if stream.maybe_consume_regex(rfc7230.obs_fold):
stream.complain(1016)
vs.append(b' ')
else:
break
else:
vs.append(v.encode('iso-8859-1')) # back to bytes
value = b''.join(vs)
stream.consume_regex(rfc7230.OWS)
_parse_line_ending(stream)
entries.append(HeaderEntry(FieldName(name), value))
_parse_line_ending(stream)
while stream.peek() not in [b'\r', b'\n', b'']:
with stream.parsing(header_field):
pos = stream.tell()
line = stream.readline(decode=False)
(name, colon, v) = line.partition(b':')
if not colon:
raise stream.error(pos)
vs = [v]
while stream.peek() in [b' ', b'\t']:
stream.complain(1016)
vs.append(b' ' + stream.readline(decode=False).lstrip(b' \t'))
name = FieldName(name.decode())
value = b''.join(vs).strip(b' \t')
entries.append(HeaderEntry(name, value))
return entries
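
A stand-alone sketch of the same line-based header-field logic as above,
runnable on plain bytes. Here io.BytesIO stands in for HTTPolice's Stream,
and the function name and example values are invented for illustration:

    import io

    def sketch_parse_header_fields(raw):
        stream = io.BytesIO(raw)
        entries = []
        line = stream.readline()
        while line not in (b'\r\n', b'\n', b''):
            (name, colon, v) = line.rstrip(b'\r\n').partition(b':')
            if not colon:
                raise ValueError('bad header field: %r' % line)
            vs = [v]
            # obs-fold: a continuation line starting with SP or HTAB belongs
            # to the previous field's value (RFC 7230 Section 3.2.4).
            line = stream.readline()
            while line[:1] in (b' ', b'\t'):
                vs.append(b' ' + line.rstrip(b'\r\n').lstrip(b' \t'))
                line = stream.readline()
            entries.append((name.decode('ascii'), b''.join(vs).strip(b' \t')))
        return entries

    # For example:
    # sketch_parse_header_fields(b'Host: example.com\r\n'
    #                            b'X-Long: first\r\n second\r\n\r\n')
    # == [('Host', b'example.com'), ('X-Long', b'first second')]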


@@ -294,27 +282,33 @@ def _decode_transfer_coding(msg, coding):


def _parse_chunk(stream):
size = stream.parse(rfc7230.chunk_size * skip(maybe(rfc7230.chunk_ext)))
_parse_line_ending(stream)
if size == 0:
return b''
else:
data = stream.consume_n_bytes(size)
_parse_line_ending(stream)
return data
with stream.parsing(chunk):
pos = stream.tell()
(size_s, _, _) = stream.readline().partition(u';')
with stream.parsing(chunk_size):
try:
size = int(size_s.rstrip(u' \t'), 16) # RFC errata ID: 4667
except ValueError:
raise stream.error(pos)
if size == 0:
return b''
else:
data = stream.read(size)
stream.readlineend()
return data


def _parse_chunked(msg, stream):
data = []
try:
with stream:
chunk = _parse_chunk(stream)
while chunk:
data.append(chunk)
chunk = _parse_chunk(stream)
trailer = parse_header_fields(stream)
chunk_data = _parse_chunk(stream)
while chunk_data:
data.append(chunk_data)
chunk_data = _parse_chunk(stream)
trailer = parse_header_fields(stream)
with stream.parsing(chunked_body):
stream.readlineend()
except ParseError as e:
stream.sane = False
msg.complain(1005, error=e)
msg.body = Unavailable
else:
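
The 'hollow' symbols defined near the top of this file carry only a name and
an RFC citation, so that a framing ParseError can still report which grammar
rule failed and where that rule is defined. A very rough sketch of how such a
mechanism can be wired up; the class bodies below are invented for
illustration and do not reproduce the real httpolice.parse or stream
internals, only the usage pattern matches:

    from contextlib import contextmanager

    class Symbol(object):
        """A named grammar rule with a citation, but no parsing logic."""
        def __init__(self, name, citation):
            self.name = name
            self.citation = citation

    class ParseError(Exception):
        def __init__(self, symbol, offset):
            super(ParseError, self).__init__(
                u'cannot parse %s (%s) at offset %d'
                % (symbol.name, symbol.citation, offset))
            self.symbol = symbol
            self.offset = offset

    class SketchStream(object):
        def __init__(self, data):
            self.data = data
            self.pos = 0
            self._symbol = None      # the rule currently being parsed

        @contextmanager
        def parsing(self, symbol):
            # Remember which rule we are inside, so that error() can blame
            # it; restore the previous rule on the way out.
            previous, self._symbol = self._symbol, symbol
            try:
                yield
            finally:
                self._symbol = previous

        def error(self, offset):
            return ParseError(self._symbol, offset)

    # Usage pattern, as in the framing code above:
    #     with stream.parsing(request_line):
    #         ...
    #         raise stream.error(beginning)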
16 changes: 7 additions & 9 deletions httpolice/header.py
@@ -22,7 +22,7 @@
from httpolice import known
from httpolice.known import alt_svc_param, cache_directive, h, header
import httpolice.known.hsts_directive
from httpolice.parse import simple_parse
from httpolice.parse import parse
from httpolice.structure import Parametrized, Unavailable, okay
from httpolice.syntax.rfc7230 import quoted_string, token

@@ -142,7 +142,7 @@ def _pre_parse(self):
if parser is None:
parsed = entry.value
else:
(parsed, annotations) = simple_parse(
(parsed, annotations) = parse(
entry.value, parser,
self.message.complain, 1000, place=entry,
annotate_classes=known.classes)
@@ -330,10 +330,9 @@ def _process_directive(self, entry, directive_with_argument):
self.message.complain(1157, entry=entry, directive=directive)
argument = None
elif parser is not None:
argument = simple_parse(argument, parser,
self.message.complain, 1158,
place=entry,
directive=directive, value=argument)
argument = parse(argument, parser,
self.message.complain, 1158, place=entry,
directive=directive, value=argument)
return Parametrized(directive, argument)

def __getattr__(self, key):
@@ -392,9 +391,8 @@ def _process_parsed(self, entry, parsed):
(name, value) = params[i]
parser = alt_svc_param.parser_for(name)
if parser is not None:
value = simple_parse(value, parser,
self.message.complain, 1259,
place=entry, param=name, value=value)
value = parse(value, parser, self.message.complain, 1259,
place=entry, param=name, value=value)
params[i] = (name, value)

return parsed
