"""Iterator-based sre token scanner."""
# NOTE(kgibbs): This line must be added to make this file work under
# Python 2.2, which is commonly used at Google.
from __future__ import generators
# NOTE(kgibbs): End changes.
import sre_parse, sre_compile, sre_constants
from sre_constants import BRANCH, SUBPATTERN
from re import VERBOSE, MULTILINE, DOTALL # NOTE(guido): Was 'from sre ...'
import re
__all__ = ['Scanner', 'pattern']
# Default regex flags shared by Scanner and pattern().
# NOTE(review): FLAGS is used as a default argument below but is not defined
# anywhere in the visible file; it is reconstructed here from the otherwise
# unused re flag imports (VERBOSE, MULTILINE, DOTALL) -- confirm upstream.
FLAGS = (VERBOSE | MULTILINE | DOTALL)


class Scanner(object):
    """Iterator-based token scanner built on the sre internals.

    Every token pattern of a lexicon becomes one numbered group of a single
    compiled alternation; the number of the group that matched selects the
    action callable to run for that match.
    """

    def __init__(self, lexicon, flags=FLAGS):
        """Compile ``lexicon`` into one compound scanner pattern.

        lexicon -- iterable of action callables.  Each must carry a
            ``pattern`` attribute holding its regex source; it is later
            called by iterscan() as ``action(match, context)``.
        flags -- regex flags used to parse every token pattern.

        Raises sre_constants.error if a token pattern cannot be parsed.
        """
        # Slot 0 is a placeholder so that actions[m.lastindex] lines up with
        # the 1-based group numbers assigned to the tokens below.
        self.actions = [None]
        # combine phrases into a compound pattern
        s = sre_parse.Pattern()
        s.flags = flags
        p = []
        # NOTE(kgibbs): These lines must be added to make this file work under
        # Python 2.2, which is commonly used at Google.
        def enumerate(obj):
            i = -1
            for item in obj:
                i += 1
                yield i, item
        # NOTE(kgibbs): End changes.
        for idx, token in enumerate(lexicon):
            phrase = token.pattern
            try:
                subpattern = sre_parse.SubPattern(s,
                    [(SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags)))])
            except sre_constants.error:
                # An invalid token pattern must reach the caller, never be
                # silently skipped (that would shift every later group).
                raise
            # Keep the branch list and the action table in lock-step: the
            # token stored at actions[idx + 1] owns group number idx + 1.
            p.append(subpattern)
            self.actions.append(token)
        s.groups = len(p)+1 # NOTE(guido): Added to make SRE validation work
        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
        self.scanner = sre_compile.compile(p)

    def iterscan(self, string, idx=0, context=None):
        """Yield ``(rval, end_idx)`` for each match that has an action.

        string -- text to scan.
        idx -- offset in ``string`` at which scanning starts.
        context -- passed through unchanged as the second argument of
            every action call.

        Each action is called as ``action(match, context)`` and must return
        a ``(rval, next_pos)`` pair.  A ``next_pos`` that is not None and
        differs from the end of the match restarts scanning at ``next_pos``.
        Matches whose action slot is None are consumed without yielding.
        Scanning stops at the first position where nothing matches, or when
        a match ends where the previous one ended (no progress).
        """
        match = self.scanner.scanner(string, idx).match
        actions = self.actions
        lastend = idx
        while True:
            m = match()
            if m is None:
                # Nothing in the lexicon matches at this position.
                break
            matchend = m.end()
            if lastend == matchend:
                # Zero-width match: stop instead of looping forever.
                break
            action = actions[m.lastindex]
            if action is not None:
                rval, next_pos = action(m, context)
                if next_pos is not None and next_pos != matchend:
                    # "fast forward" the scanner
                    matchend = next_pos
                    match = self.scanner.scanner(string, matchend).match
                yield rval, matchend
            lastend = matchend
def pattern(pattern, flags=FLAGS):
    """Return a decorator that tags a token action with its regex.

    pattern -- regex source string for the token.
    flags -- flags used when compiling ``pattern``.

    The decorated function is returned as-is, with two attributes added:
    ``pattern`` (the source string) and ``regex`` (the compiled pattern).
    """
    def decorator(fn):
        # The source string is attached before compiling, so it is set even
        # when re.compile() raises on an invalid pattern.
        fn.pattern = pattern
        compiled = re.compile(pattern, flags)
        fn.regex = compiled
        return fn
    return decorator
