Skip to content
This repository has been archived by the owner on May 13, 2020. It is now read-only.

Commit

Permalink
QueryParser.py:
Browse files Browse the repository at this point in the history
- Rephrased the description of the grammar, pointing out that the
  lexicon decides on globbing syntax.

- Refactored term and atom parsing (moving atom parsing into a
  separate method).  The previously checked-in version accidentally
  accepted some invalid forms like ``foo AND -bar''; this is fixed.

tests/testQueryParser.py:

- Each test is now in a separate method; this produces more output
  (alas) but makes pinpointing the errors much simpler.

- Added some tests catching ``foo AND -bar'' and similar.

- Added an explicit test class for the handling of stopwords.  The
  "and/" test no longer has to check self.__class__.

- Some refactoring of the TestQueryParser class; the utility methods
  are now in a base class TestQueryParserBase, in a different order;
  compareParseTrees() now shows the parse tree it got when raising an
  exception.  The parser is now self.parser instead of self.p (see
  below).

tests/testZCTextIndex.py:

- setUp() no longer needs to assign to self.p; the parser is
  consistently called self.parser now.
  • Loading branch information
gvanrossum committed May 20, 2002
1 parent 49dccce commit a1ab7a3
Show file tree
Hide file tree
Showing 3 changed files with 218 additions and 73 deletions.
73 changes: 40 additions & 33 deletions QueryParser.py
Expand Up @@ -27,17 +27,19 @@
An ATOM is either:
+ A sequence of characters not containing whitespace or parentheses or
double quotes, and not equal to one of the key words 'AND', 'OR', 'NOT'; or
double quotes, and not equal (ignoring case) to one of the key words
'AND', 'OR', 'NOT'; or
+ A non-empty string enclosed in double quotes. The interior of the string
can contain whitespace, parentheses and key words.
+ A non-empty string enclosed in double quotes. The interior of the
string can contain whitespace, parentheses and key words, but not
quotes.
In addition, an ATOM may optionally be preceded by a hyphen, meaning
that it must not be present.
+ A hyphen followed by one of the two forms above, meaning that it
must not be present.
An unquoted ATOM may also end in a star. This is a primitive
"globbing" function, meaning to search for any word with a given
prefix.
An unquoted ATOM may also contain globbing characters. Globbing
syntax is defined by the lexicon; for example "foo*" could mean any
word starting with "foo".
When multiple consecutive ATOMs are found at the leaf level, they are
connected by an implied AND operator, and an unquoted leading hyphen
Expand Down Expand Up @@ -202,32 +204,37 @@ def _parseTerm(self):
tree = self._parseOrExpr()
self._require(_RPAREN)
else:
atoms = [self._get(_ATOM)]
while self._peek(_ATOM):
atoms.append(self._get(_ATOM))
nodes = []
nots = []
for a in atoms:
words = self._lexicon.parseTerms(a)
if not words:
self._ignored.append(a)
continue # Only stopwords
if len(words) > 1:
n = ParseTree.PhraseNode(" ".join(words))
elif self._lexicon.isGlob(words[0]):
n = ParseTree.GlobNode(words[0])
else:
n = ParseTree.AtomNode(words[0])
if a[0] == "-":
n = ParseTree.NotNode(n)
nots.append(n)
else:
nodes.append(n)
nodes = [self._parseAtom()]
while self._peek(_ATOM):
nodes.append(self._parseAtom())
nodes = filter(None, nodes)
if not nodes:
return None # Only stowords
nodes.extend(nots)
return None # Only stopwords
structure = [(isinstance(nodes[i], ParseTree.NotNode), i, nodes[i])
for i in range(len(nodes))]
structure.sort()
nodes = [node for (bit, index, node) in structure]
if isinstance(nodes[0], ParseTree.NotNode):
raise ParseTree.ParseError(
"a term must have at least one positive word")
if len(nodes) == 1:
tree = nodes[0]
else:
tree = ParseTree.AndNode(nodes)
return nodes[0]
tree = ParseTree.AndNode(nodes)
return tree

def _parseAtom(self):
    """Consume one ATOM token and turn it into a parse tree node.

    The raw token is handed to the lexicon's parseTerms().  When that
    yields no words, the token is appended to self._ignored and None is
    returned (the caller filters None entries out of its node list).
    Otherwise the node is a PhraseNode for several words, a GlobNode
    when the lexicon reports the single word as a glob, or an AtomNode.
    A token whose first character is a hyphen gets its node wrapped in
    a NotNode.
    """
    token = self._get(_ATOM)
    pieces = self._lexicon.parseTerms(token)
    if not pieces:
        # Nothing survived the lexicon; remember the token as ignored.
        self._ignored.append(token)
        return None
    if len(pieces) > 1:
        node = ParseTree.PhraseNode(pieces)
    else:
        word = pieces[0]
        if self._lexicon.isGlob(word):
            node = ParseTree.GlobNode(word)
        else:
            node = ParseTree.AtomNode(word)
    # Negation is decided from the raw token, not from the lexicon output.
    if token[0] == "-":
        return ParseTree.NotNode(node)
    return node
216 changes: 177 additions & 39 deletions tests/testQueryParser.py
Expand Up @@ -21,116 +21,254 @@
from Products.ZCTextIndex.ParseTree import AtomNode, PhraseNode, GlobNode
from Products.ZCTextIndex.Lexicon import Lexicon, Splitter

class TestQueryParser(TestCase):
class TestQueryParserBase(TestCase):

def compareParseTrees(self, got, expected):
def setUp(self):
self.lexicon = Lexicon(Splitter())
self.parser = QueryParser(self.lexicon)

def expect(self, input, output, expected_ignored=[]):
tree = self.parser.parseQuery(input)
ignored = self.parser.getIgnored()
self.compareParseTrees(tree, output)
self.assertEqual(ignored, expected_ignored)
# Check that parseQueryEx() == (parseQuery(), getIgnored())
ex_tree, ex_ignored = self.parser.parseQueryEx(input)
self.compareParseTrees(ex_tree, tree)
self.assertEqual(ex_ignored, expected_ignored)

def failure(self, input):
self.assertRaises(ParseError, self.parser.parseQuery, input)
self.assertRaises(ParseError, self.parser.parseQueryEx, input)

def compareParseTrees(self, got, expected, msg=None):
if msg is None:
msg = repr(got)
self.assertEqual(isinstance(got, ParseTreeNode), 1)
self.assertEqual(got.__class__, expected.__class__)
self.assertEqual(got.__class__, expected.__class__, msg)
if isinstance(got, PhraseNode):
self.assertEqual(got.nodeType(), "PHRASE")
self.assertEqual(got.getValue(), expected.getValue())
self.assertEqual(got.nodeType(), "PHRASE", msg)
self.assertEqual(got.getValue(), expected.getValue(), msg)
elif isinstance(got, GlobNode):
self.assertEqual(got.nodeType(), "GLOB")
self.assertEqual(got.getValue(), expected.getValue())
self.assertEqual(got.nodeType(), "GLOB", msg)
self.assertEqual(got.getValue(), expected.getValue(), msg)
elif isinstance(got, AtomNode):
self.assertEqual(got.nodeType(), "ATOM")
self.assertEqual(got.getValue(), expected.getValue())
self.assertEqual(got.nodeType(), "ATOM", msg)
self.assertEqual(got.getValue(), expected.getValue(), msg)
elif isinstance(got, NotNode):
self.assertEqual(got.nodeType(), "NOT")
self.compareParseTrees(got.getValue(), expected.getValue())
self.compareParseTrees(got.getValue(), expected.getValue(), msg)
elif isinstance(got, AndNode) or isinstance(got, OrNode):
self.assertEqual(got.nodeType(),
isinstance(got, AndNode) and "AND" or "OR")
isinstance(got, AndNode) and "AND" or "OR", msg)
list1 = got.getValue()
list2 = expected.getValue()
self.assertEqual(len(list1), len(list2))
self.assertEqual(len(list1), len(list2), msg)
for i in range(len(list1)):
self.compareParseTrees(list1[i], list2[i])
self.compareParseTrees(list1[i], list2[i], msg)

def expect(self, input, output):
tree = self.p.parseQuery(input)
self.compareParseTrees(tree, output)
class TestQueryParser(TestQueryParserBase):

def failure(self, input):
self.assertRaises(ParseError, self.p.parseQuery, input)

def setUp(self):
self.lexicon = Lexicon(Splitter())
self.p = QueryParser(self.lexicon)

def testParseQuery(self):
def test001(self):
self.expect("foo", AtomNode("foo"))

def test002(self):
self.expect("note", AtomNode("note"))

def test003(self):
self.expect("aa and bb AND cc",
AndNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))

def test004(self):
self.expect("aa OR bb or cc",
OrNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))

def test005(self):
self.expect("aa AND bb OR cc AnD dd",
OrNode([AndNode([AtomNode("aa"), AtomNode("bb")]),
AndNode([AtomNode("cc"), AtomNode("dd")])]))

def test006(self):
self.expect("(aa OR bb) AND (cc OR dd)",
AndNode([OrNode([AtomNode("aa"), AtomNode("bb")]),
OrNode([AtomNode("cc"), AtomNode("dd")])]))
self.expect("aa AND not bb",

def test007(self):
self.expect("aa AND NOT bb",
AndNode([AtomNode("aa"), NotNode(AtomNode("bb"))]))

self.expect('"foo bar"', PhraseNode("foo bar"))
def test010(self):
self.expect('"foo bar"', PhraseNode(["foo", "bar"]))

def test011(self):
self.expect("foo bar", AndNode([AtomNode("foo"), AtomNode("bar")]))

self.expect('(("foo bar"))"', PhraseNode("foo bar"))
def test012(self):
self.expect('(("foo bar"))"', PhraseNode(["foo", "bar"]))

def test013(self):
self.expect("((foo bar))", AndNode([AtomNode("foo"), AtomNode("bar")]))

if self.__class__ is TestQueryParser:
# This test fails when testZCTextIndex subclasses this class,
# because its lexicon's pipeline removes stopwords
self.expect('and/', AtomNode("and"))
def test014(self):
self.expect("foo-bar", PhraseNode(["foo", "bar"]))

self.expect("foo-bar", PhraseNode("foo bar"))
def test015(self):
self.expect("foo -bar", AndNode([AtomNode("foo"),
NotNode(AtomNode("bar"))]))

def test016(self):
self.expect("-foo bar", AndNode([AtomNode("bar"),
NotNode(AtomNode("foo"))]))

def test017(self):
self.expect("booh -foo-bar",
AndNode([AtomNode("booh"),
NotNode(PhraseNode("foo bar"))]))
NotNode(PhraseNode(["foo", "bar"]))]))

def test018(self):
self.expect('booh -"foo bar"',
AndNode([AtomNode("booh"),
NotNode(PhraseNode("foo bar"))]))
NotNode(PhraseNode(["foo", "bar"]))]))

def test019(self):
self.expect('foo"bar"',
AndNode([AtomNode("foo"), AtomNode("bar")]))

def test020(self):
self.expect('"foo"bar',
AndNode([AtomNode("foo"), AtomNode("bar")]))

def test021(self):
self.expect('foo"bar"blech',
AndNode([AtomNode("foo"), AtomNode("bar"),
AtomNode("blech")]))

def test022(self):
self.expect("foo*", GlobNode("foo*"))

def test023(self):
self.expect("foo* bar", AndNode([GlobNode("foo*"),
AtomNode("bar")]))

def testParseFailures(self):
def test101(self):
self.failure("")

def test102(self):
self.failure("not")

def test103(self):
self.failure("or")

def test104(self):
self.failure("and")

def test105(self):
self.failure("NOT")

def test106(self):
self.failure("OR")

def test107(self):
self.failure("AND")
self.failure("not foo")

def test108(self):
self.failure("NOT foo")

def test109(self):
self.failure(")")

def test110(self):
self.failure("(")

def test111(self):
self.failure("foo OR")

def test112(self):
self.failure("foo AND")

def test113(self):
self.failure("OR foo")
self.failure("and foo")

def test114(self):
self.failure("AND foo")

def test115(self):
self.failure("(foo) bar")

def test116(self):
self.failure("(foo OR)")

def test117(self):
self.failure("(foo AND)")

def test118(self):
self.failure("(NOT foo)")

def test119(self):
self.failure("-foo")

def test120(self):
self.failure("-foo -bar")
self.failure('""')

def test121(self):
self.failure("foo OR -bar")

def test122(self):
self.failure("foo AND -bar")

# Query parser tests run against a lexicon built with FakeStopWordRemover,
# so the word "stop" is removed by the lexicon pipeline.
class StopWordTestQueryParser(TestQueryParserBase):

def setUp(self):
# Only 'stop' is a stopword (but 'and' is still an operator)
self.lexicon = Lexicon(Splitter(), FakeStopWordRemover())
self.parser = QueryParser(self.lexicon)

# 2xx: queries that must parse.  The optional third argument to expect()
# is the list compared against the parser's getIgnored() result; it
# defaults to an empty list.
def test201(self):
self.expect('and/', AtomNode("and"))

def test202(self):
self.expect('foo AND stop', AtomNode("foo"), ["stop"])

def test203(self):
self.expect('foo AND NOT stop', AtomNode("foo"), ["stop"])

def test204(self):
self.expect('stop AND foo', AtomNode("foo"), ["stop"])

def test205(self):
self.expect('foo OR stop', AtomNode("foo"), ["stop"])

def test206(self):
self.expect('stop OR foo', AtomNode("foo"), ["stop"])

# 3xx: queries for which failure() asserts that both parseQuery() and
# parseQueryEx() raise ParseError.
def test301(self):
self.failure('stop')

def test302(self):
self.failure('stop stop')

def test303(self):
self.failure('stop AND stop')

def test304(self):
self.failure('stop OR stop')

def test305(self):
self.failure('stop -foo')

def test306(self):
self.failure('stop AND NOT foo')

class FakeStopWordRemover:
    """Test double that treats the single word "stop" as a stopword.

    An instance is passed to Lexicon together with the Splitter by the
    stopword parser tests.
    """

    def process(self, list):
        """Return a new list with every word of *list* except "stop".

        The comparison is exact, so "Stop" or "stopper" are kept, and
        the input list is not modified.  The parameter name shadows the
        builtin but is left as is so the signature stays unchanged.
        """
        return [word for word in list if word != "stop"]


def test_suite():
return makeSuite(TestQueryParser)
return TestSuite((makeSuite(TestQueryParser),
makeSuite(StopWordTestQueryParser),
))

if __name__=="__main__":
main(defaultTest='test_suite')
2 changes: 1 addition & 1 deletion tests/testZCTextIndex.py
Expand Up @@ -454,7 +454,7 @@ def setUp(self):
StopWordRemover())
caller = LexiconHolder(self.lexicon)
self.zc_index = ZCTextIndex('name', extra, caller, self.IndexFactory)
self.p = self.parser = QueryParser(self.lexicon)
self.parser = QueryParser(self.lexicon)
self.index = self.zc_index.index
self.add_docs()

Expand Down

0 comments on commit a1ab7a3

Please sign in to comment.