Skip to content
This repository has been archived by the owner on Mar 29, 2022. It is now read-only.

Commit

Permalink
Memoize parse results
Browse files Browse the repository at this point in the history
  • Loading branch information
vfaronov committed Aug 2, 2016
1 parent d2d8e19 commit 8326f5a
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 39 deletions.
9 changes: 5 additions & 4 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@ Unreleased

Added
-----
- HTTPolice now works under `PyPy`_ (the 2.7 variant),
which can make it significantly faster on large inputs.
You will probably need a recent version of PyPy
(5.3.1 works, but with 5.0.0 I get segfaults all over the place).
- HTTPolice now caches more intermediate values in memory,
which makes it significantly faster in many cases.
- HTTPolice now works correctly under `PyPy`_ (the 2.7 variant),
which, too, can make it faster on large inputs.
You will probably need a recent version of PyPy (5.3.1 is OK).
- `HTML reports`_ now have an "options" menu
to filter exchanges and notices on the fly.
- The ``httpolice`` command-line tool now has
Expand Down
20 changes: 9 additions & 11 deletions httpolice/header.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,14 @@
Particularly non-obvious are comparisons of :class:`HeaderView` (q.v.).
"""

import copy
import operator
import sys

from httpolice import known
from httpolice.known import alt_svc_param, cache_directive, h, header
import httpolice.known.hsts_directive
from httpolice.parse import ParseError, Stream, simple_parse
from httpolice.parse import simple_parse
from httpolice.structure import Parametrized, Unavailable, okay
from httpolice.syntax.rfc7230 import quoted_string, token

Expand Down Expand Up @@ -135,17 +136,13 @@ def _pre_parse(self):
if parser is None:
parsed = entry.value
else:
stream = Stream(entry.value, annotate_classes=known.classes)
try:
parsed = stream.parse(parser, to_eof=True)
except ParseError as e:
self.message.complain(1000, entry=entry, error=e)
parsed = Unavailable
else:
(parsed, annotations) = simple_parse(
entry.value, parser,
self.message.complain, 1000, place=entry,
annotate_classes=known.classes)
if parsed is not Unavailable:
parsed = self._process_parsed(entry, parsed)
self.message.annotations[(from_trailer, i)] = \
stream.collect_annotations()
stream.dump_complaints(self.message.complain, place=entry)
self.message.annotations[(from_trailer, i)] = annotations
values.append(parsed)
return entries, values

Expand Down Expand Up @@ -372,6 +369,7 @@ def _process_parsed(self, entry, parsed):
return parsed

# Parse every parameter's value according to its defined parser.
parsed = copy.deepcopy(parsed)
for alternative in parsed:
params = alternative.param.sequence
for i in range(len(params)):
Expand Down
2 changes: 1 addition & 1 deletion httpolice/notices.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<notices>

<error id="1000">
<title>Malformed <var ref="place"/> header</title>
<exception/>
</error>

Expand Down
89 changes: 66 additions & 23 deletions httpolice/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
For example, in an HTML report,
the ``text/xml`` in ``Accept: text/xml;q=0.9`` becomes a hyperlink to RFC,
because it is parsed into a :class:`~httpolice.structure.MediaType` object.
The list of classes to annotate must be passed to :meth:`Stream.__init__`.
The list of classes to annotate must be passed to :meth:`Stream.parse`.
Also, the object (in this case, ``MediaType(u'text/xml')``)
must be the end result of a distinct :class:`Nonterminal`,
**not** buried inside some :class:`Rule`.
Expand Down Expand Up @@ -581,27 +581,40 @@ def can_complain(func):

class Stream(object):

# pylint: disable=attribute-defined-outside-init

"""Wraps a string of bytes that are the input to parsers."""

def __init__(self, data, name=None, annotate_classes=None):
self._stack = []
_cache = OrderedDict()

# Obtained by running under ``/usr/bin/time -v`` on a large tcpflow input
# and increasing until the results stopped improving.
_cache_size = 200

def __init__(self, data, name=None):
self.data = data
self.name = name
self.point = 0
self._sane = True
self.complaints = []
self.annotations = []
self.annotate_classes = tuple(annotate_classes or ())
self._set_state((0, [], []))
self._stack = []

def _get_state(self):
return (self.point, self.complaints[:], self.annotations[:])

def _set_state(self, state):
(self.point, self.complaints, self.annotations) = state

def _is_empty_state(self):
return self.point == 0 and not self.complaints and not self.annotations

def __enter__(self):
self._stack.append((self.point,
self.complaints[:], self.annotations[:]))
self._stack.append(self._get_state())
return self

def __exit__(self, exc_type, _1, _2):
frame = self._stack.pop()
state = self._stack.pop()
if exc_type is not None:
(self.point, self.complaints, self.annotations) = frame
self._set_state(state)
return False

def peek(self, n):
Expand Down Expand Up @@ -666,8 +679,30 @@ def maybe_consume_regex(self, target):
except ParseError:
return None

def parse(self, target, to_eof=False):
return parse(self, target.as_nonterminal(), to_eof)
def parse(self, target, to_eof=False, annotate_classes=None):
annotate_classes = tuple(annotate_classes or ())
key = None
if self._is_empty_state() and to_eof:
# Caching is really only useful
# when we're parsing something small in its entirety,
# like a header value.
# The above ``if`` means that the cache won't get in our way
# when we're parsing something big in chunks,
# like HTTP/1.x framing.
key = (self.data, target, annotate_classes)
item = self._cache.pop(key, None)
if item is not None:
(r, state) = item
self._set_state(state)
self._cache[key] = item
return r

r = parse(self, target.as_nonterminal(), to_eof, annotate_classes)
if key is not None:
self._cache[key] = (r, self._get_state())
while len(self._cache) > self._cache_size:
self._cache.popitem()
return r

def complain(self, notice_id, **context):
self.complaints.append((notice_id, context))
Expand Down Expand Up @@ -737,7 +772,7 @@ def _add_item(items, items_idx, items_set, symbol, rule, pos, start):
(symbol, rule, pos, start))


def parse(stream, target_symbol, to_eof=False):
def parse(stream, target_symbol, to_eof=False, annotate_classes=()):
(items, items_idx, items_set) = ([], {}, set())

# Seed the initial items inventory with rules for `target_symbol`.
Expand Down Expand Up @@ -812,7 +847,8 @@ def parse(stream, target_symbol, to_eof=False):
i += 1

if (last_good_i is not None) and (last_good_i == i or not to_eof):
results = _find_results(stream, target_symbol, chart, last_good_i)
results = _find_results(stream, target_symbol, chart, last_good_i,
[], annotate_classes)
for start_i, _, result, complaints, annotations in results:
# There may be multiple valid parses in case of ambiguities,
# but in practice we just want
Expand All @@ -826,7 +862,8 @@ def parse(stream, target_symbol, to_eof=False):
raise _build_parse_error(stream, target_symbol, chart)


def _find_results(stream, symbol, chart, end_i, outer_parents=None):
def _find_results(stream, symbol, chart, end_i,
outer_parents, annotate_classes):
# The trivial base case is to find the parse result of a terminal.
if isinstance(symbol, Terminal):
if end_i > 0:
Expand All @@ -848,8 +885,6 @@ def _find_results(stream, symbol, chart, end_i, outer_parents=None):
# We don't want to consider items that are
# already being processed further up the stack.
# Otherwise, we would fall into unbounded recursion.
if outer_parents is None:
outer_parents = []
if item in outer_parents:
continue

Expand Down Expand Up @@ -918,7 +953,7 @@ def complain(id_, **ctx):
result = nodes

# Finally, annotate if needed.
if isinstance(result, stream.annotate_classes):
if isinstance(result, annotate_classes):
all_annotations.append((start_i, end_i, result))

# And that's one complete parse for `target_symbol`
Expand All @@ -936,7 +971,8 @@ def complain(id_, **ctx):
inner_symbol = rule.symbols[-len(frames) - 1]
# Recursively get an iterator
# over possible results for this symbol.
rs = _find_results(stream, inner_symbol, chart, i, parents)
rs = _find_results(stream, inner_symbol, chart, i, parents,
annotate_classes)

# Get the next result for this symbol.
r = next(rs, None)
Expand Down Expand Up @@ -1040,14 +1076,21 @@ def _find_pivots(chart, symbol, start, stack=None):
###############################################################################
# Miscellaneous helpers.

def simple_parse(data, symbol, complain, fail_notice_id, annotate_classes=None,
                 **extra_context):
    """(Try to) parse an entire string as a single grammar symbol.

    :param data: the string (or bytes) to parse in its entirety.
    :param symbol: the grammar symbol to parse ``data`` as.
    :param complain: a callable used to report complaints/notices.
    :param fail_notice_id: the notice ID reported when parsing fails.
    :param annotate_classes: if given, annotations are collected and the
        return value becomes a 2-tuple (see below).
    :param extra_context: extra context passed through to ``complain``.
    :return: the parse result (``Unavailable`` on failure), or, when
        ``annotate_classes`` is given, a ``(result, annotations)`` pair.
    """
    stream = Stream(force_bytes(data))

    try:
        r = stream.parse(symbol, to_eof=True,
                         annotate_classes=annotate_classes)
    except ParseError as e:
        complain(fail_notice_id, error=e, **extra_context)
        r = Unavailable
    else:
        stream.dump_complaints(complain, **extra_context)

    # Callers that did not ask for annotations keep getting a bare result,
    # so the 2-tuple form is backward-compatible.
    if annotate_classes is None:
        return r
    return (r, stream.collect_annotations())

0 comments on commit 8326f5a

Please sign in to comment.