QueryParser.py:

- Rephrased the description of the grammar, pointing out that the
  lexicon decides on globbing syntax.

- Refactored term and atom parsing (moving atom parsing into a
  separate method).  The previously checked-in version accidentally
  accepted some invalid forms like ``foo AND -bar''; this is fixed.

tests/testQueryParser.py:

- Each test is now in a separate method; this produces more output
  (alas) but makes pinpointing the errors much simpler.

- Added some tests catching ``foo AND -bar'' and similar.

- Added an explicit test class for the handling of stopwords.  The
  "and/" test no longer has to check self.__class__.

- Some refactoring of the TestQueryParser class; the utility methods
  are now in a base class TestQueryParserBase, in a different order;
  compareParseTrees() now shows the parse tree it got when raising an
  exception.  The parser is now self.parser instead of self.p (see
  below).

tests/testZCTextIndex.py:

- setUp() no longer needs to assign to self.p; the parser is
  consistently called self.parser now.
zopefoundation · May 20, 2002 · a1ab7a3 · a1ab7a3
1 parent 49dccce
commit a1ab7a3
Show file tree

Hide file tree

Showing 3 changed files with 218 additions and 73 deletions.
diff --git a/QueryParser.py b/QueryParser.py
@@ -27,17 +27,19 @@
 An ATOM is either:
 
 + A sequence of characters not containing whitespace or parentheses or
-  double quotes, and not equal to one of the key words 'AND', 'OR', 'NOT'; or
+  double quotes, and not equal (ignoring case) to one of the key words
+  'AND', 'OR', 'NOT'; or
 
-+ A non-empty string enclosed in double quotes.  The interior of the string
-  can contain whitespace, parentheses and key words.
++ A non-empty string enclosed in double quotes.  The interior of the
+  string can contain whitespace, parentheses and key words, but not
+  quotes.
 
-In addition, an ATOM may optionally be preceded by a hyphen, meaning
-that it must not be present.
++ A hyphen followed by one of the two forms above, meaning that it
+  must not be present.
 
-An unquoted ATOM may also end in a star.  This is a primitive
-"globbing" function, meaning to search for any word with a given
-prefix.
+An unquoted ATOM may also contain globbing characters.  Globbing
+syntax is defined by the lexicon; for example "foo*" could mean any
+word starting with "foo".
 
 When multiple consecutive ATOMs are found at the leaf level, they are
 connected by an implied AND operator, and an unquoted leading hyphen
@@ -202,32 +204,37 @@ def _parseTerm(self):
             tree = self._parseOrExpr()
             self._require(_RPAREN)
         else:
-            atoms = [self._get(_ATOM)]
-            while self._peek(_ATOM):
-                atoms.append(self._get(_ATOM))
             nodes = []
-            nots = []
-            for a in atoms:
-                words = self._lexicon.parseTerms(a)
-                if not words:
-                    self._ignored.append(a)
-                    continue # Only stopwords
-                if len(words) > 1:
-                    n = ParseTree.PhraseNode(" ".join(words))
-                elif self._lexicon.isGlob(words[0]):
-                    n = ParseTree.GlobNode(words[0])
-                else:
-                    n = ParseTree.AtomNode(words[0])
-                if a[0] == "-":
-                    n = ParseTree.NotNode(n)
-                    nots.append(n)
-                else:
-                    nodes.append(n)
+            nodes = [self._parseAtom()]
+            while self._peek(_ATOM):
+                nodes.append(self._parseAtom())
+            nodes = filter(None, nodes)
             if not nodes:
-                return None # Only stowords
-            nodes.extend(nots)
+                return None # Only stopwords
+            structure = [(isinstance(nodes[i], ParseTree.NotNode), i, nodes[i])
+                         for i in range(len(nodes))]
+            structure.sort()
+            nodes = [node for (bit, index, node) in structure]
+            if isinstance(nodes[0], ParseTree.NotNode):
+                raise ParseTree.ParseError(
+                    "a term must have at least one positive word")
             if len(nodes) == 1:
-                tree = nodes[0]
-            else:
-                tree = ParseTree.AndNode(nodes)
+                return nodes[0]
+            tree = ParseTree.AndNode(nodes)
+        return tree
+
+    def _parseAtom(self):
+        term = self._get(_ATOM)
+        words = self._lexicon.parseTerms(term)
+        if not words:
+            self._ignored.append(term)
+            return None
+        if len(words) > 1:
+            tree = ParseTree.PhraseNode(words)
+        elif self._lexicon.isGlob(words[0]):
+            tree = ParseTree.GlobNode(words[0])
+        else:
+            tree = ParseTree.AtomNode(words[0])
+        if term[0] == "-":
+            tree = ParseTree.NotNode(tree)
         return tree
diff --git a/tests/testQueryParser.py b/tests/testQueryParser.py
@@ -21,116 +21,254 @@
 from Products.ZCTextIndex.ParseTree import AtomNode, PhraseNode, GlobNode
 from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
 
-class TestQueryParser(TestCase):
+class TestQueryParserBase(TestCase):
 
-    def compareParseTrees(self, got, expected):
+    def setUp(self):
+        self.lexicon = Lexicon(Splitter())
+        self.parser = QueryParser(self.lexicon)
+
+    def expect(self, input, output, expected_ignored=[]):
+        tree = self.parser.parseQuery(input)
+        ignored = self.parser.getIgnored()
+        self.compareParseTrees(tree, output)
+        self.assertEqual(ignored, expected_ignored)
+        # Check that parseQueryEx() == (parseQuery(), getIgnored())
+        ex_tree, ex_ignored = self.parser.parseQueryEx(input)
+        self.compareParseTrees(ex_tree, tree)
+        self.assertEqual(ex_ignored, expected_ignored)
+
+    def failure(self, input):
+        self.assertRaises(ParseError, self.parser.parseQuery, input)
+        self.assertRaises(ParseError, self.parser.parseQueryEx, input)
+
+    def compareParseTrees(self, got, expected, msg=None):
+        if msg is None:
+            msg = repr(got)
         self.assertEqual(isinstance(got, ParseTreeNode), 1)
-        self.assertEqual(got.__class__, expected.__class__)
+        self.assertEqual(got.__class__, expected.__class__, msg)
         if isinstance(got, PhraseNode):
-            self.assertEqual(got.nodeType(), "PHRASE")
-            self.assertEqual(got.getValue(), expected.getValue())
+            self.assertEqual(got.nodeType(), "PHRASE", msg)
+            self.assertEqual(got.getValue(), expected.getValue(), msg)
         elif isinstance(got, GlobNode):
-            self.assertEqual(got.nodeType(), "GLOB")
-            self.assertEqual(got.getValue(), expected.getValue())
+            self.assertEqual(got.nodeType(), "GLOB", msg)
+            self.assertEqual(got.getValue(), expected.getValue(), msg)
         elif isinstance(got, AtomNode):
-            self.assertEqual(got.nodeType(), "ATOM")
-            self.assertEqual(got.getValue(), expected.getValue())
+            self.assertEqual(got.nodeType(), "ATOM", msg)
+            self.assertEqual(got.getValue(), expected.getValue(), msg)
         elif isinstance(got, NotNode):
             self.assertEqual(got.nodeType(), "NOT")
-            self.compareParseTrees(got.getValue(), expected.getValue())
+            self.compareParseTrees(got.getValue(), expected.getValue(), msg)
         elif isinstance(got, AndNode) or isinstance(got, OrNode):
             self.assertEqual(got.nodeType(),
-                             isinstance(got, AndNode) and "AND" or "OR")
+                             isinstance(got, AndNode) and "AND" or "OR", msg)
             list1 = got.getValue()
             list2 = expected.getValue()
-            self.assertEqual(len(list1), len(list2))
+            self.assertEqual(len(list1), len(list2), msg)
             for i in range(len(list1)):
-                self.compareParseTrees(list1[i], list2[i])
+                self.compareParseTrees(list1[i], list2[i], msg)
 
-    def expect(self, input, output):
-        tree = self.p.parseQuery(input)
-        self.compareParseTrees(tree, output)
+class TestQueryParser(TestQueryParserBase):
 
-    def failure(self, input):
-        self.assertRaises(ParseError, self.p.parseQuery, input)
-
-    def setUp(self):
-        self.lexicon = Lexicon(Splitter())
-        self.p = QueryParser(self.lexicon)
-
-    def testParseQuery(self):
+    def test001(self):
         self.expect("foo", AtomNode("foo"))
+
+    def test002(self):
         self.expect("note", AtomNode("note"))
+
+    def test003(self):
         self.expect("aa and bb AND cc",
                     AndNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))
+
+    def test004(self):
         self.expect("aa OR bb or cc",
                     OrNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))
+
+    def test005(self):
         self.expect("aa AND bb OR cc AnD dd",
                     OrNode([AndNode([AtomNode("aa"), AtomNode("bb")]),
                             AndNode([AtomNode("cc"), AtomNode("dd")])]))
+
+    def test006(self):
         self.expect("(aa OR bb) AND (cc OR dd)",
                     AndNode([OrNode([AtomNode("aa"), AtomNode("bb")]),
                              OrNode([AtomNode("cc"), AtomNode("dd")])]))
-        self.expect("aa AND not bb",
+
+    def test007(self):
+        self.expect("aa AND NOT bb",
                     AndNode([AtomNode("aa"), NotNode(AtomNode("bb"))]))
 
-        self.expect('"foo bar"', PhraseNode("foo bar"))
+    def test010(self):
+        self.expect('"foo bar"', PhraseNode(["foo", "bar"]))
+
+    def test011(self):
         self.expect("foo bar", AndNode([AtomNode("foo"), AtomNode("bar")]))
 
-        self.expect('(("foo bar"))"', PhraseNode("foo bar"))
+    def test012(self):
+        self.expect('(("foo bar"))"', PhraseNode(["foo", "bar"]))
+
+    def test013(self):
         self.expect("((foo bar))", AndNode([AtomNode("foo"), AtomNode("bar")]))
 
-        if self.__class__ is TestQueryParser:
-            # This test fails when testZCTextIndex subclasses this class,
-            # because its lexicon's pipeline removes stopwords
-            self.expect('and/', AtomNode("and"))
+    def test014(self):
+        self.expect("foo-bar", PhraseNode(["foo", "bar"]))
 
-        self.expect("foo-bar", PhraseNode("foo bar"))
+    def test015(self):
         self.expect("foo -bar", AndNode([AtomNode("foo"),
                                          NotNode(AtomNode("bar"))]))
+
+    def test016(self):
         self.expect("-foo bar", AndNode([AtomNode("bar"),
                                          NotNode(AtomNode("foo"))]))
+
+    def test017(self):
         self.expect("booh -foo-bar",
                     AndNode([AtomNode("booh"),
-                             NotNode(PhraseNode("foo bar"))]))
+                             NotNode(PhraseNode(["foo", "bar"]))]))
+
+    def test018(self):
         self.expect('booh -"foo bar"',
                     AndNode([AtomNode("booh"),
-                             NotNode(PhraseNode("foo bar"))]))
+                             NotNode(PhraseNode(["foo", "bar"]))]))
+
+    def test019(self):
         self.expect('foo"bar"',
                     AndNode([AtomNode("foo"), AtomNode("bar")]))
+
+    def test020(self):
         self.expect('"foo"bar',
                     AndNode([AtomNode("foo"), AtomNode("bar")]))
+
+    def test021(self):
         self.expect('foo"bar"blech',
                     AndNode([AtomNode("foo"), AtomNode("bar"),
                              AtomNode("blech")]))
 
+    def test022(self):
         self.expect("foo*", GlobNode("foo*"))
+
+    def test023(self):
         self.expect("foo* bar", AndNode([GlobNode("foo*"),
                                          AtomNode("bar")]))
 
-    def testParseFailures(self):
+    def test101(self):
         self.failure("")
+
+    def test102(self):
         self.failure("not")
+
+    def test103(self):
+        self.failure("or")
+
+    def test104(self):
+        self.failure("and")
+
+    def test105(self):
+        self.failure("NOT")
+
+    def test106(self):
         self.failure("OR")
+
+    def test107(self):
         self.failure("AND")
-        self.failure("not foo")
+
+    def test108(self):
+        self.failure("NOT foo")
+
+    def test109(self):
         self.failure(")")
+
+    def test110(self):
         self.failure("(")
+
+    def test111(self):
         self.failure("foo OR")
+
+    def test112(self):
         self.failure("foo AND")
+
+    def test113(self):
         self.failure("OR foo")
-        self.failure("and foo")
+
+    def test114(self):
+        self.failure("AND foo")
+
+    def test115(self):
         self.failure("(foo) bar")
+
+    def test116(self):
         self.failure("(foo OR)")
+
+    def test117(self):
         self.failure("(foo AND)")
+
+    def test118(self):
         self.failure("(NOT foo)")
+
+    def test119(self):
         self.failure("-foo")
+
+    def test120(self):
         self.failure("-foo -bar")
-        self.failure('""')
+
+    def test121(self):
+        self.failure("foo OR -bar")
+
+    def test122(self):
+        self.failure("foo AND -bar")
+
+class StopWordTestQueryParser(TestQueryParserBase):
+
+    def setUp(self):
+        # Only 'stop' is a stopword (but 'and' is still an operator)
+        self.lexicon = Lexicon(Splitter(), FakeStopWordRemover())
+        self.parser = QueryParser(self.lexicon)
+
+    def test201(self):
+        self.expect('and/', AtomNode("and"))
+
+    def test202(self):
+        self.expect('foo AND stop', AtomNode("foo"), ["stop"])
+
+    def test203(self):
+        self.expect('foo AND NOT stop', AtomNode("foo"), ["stop"])
+
+    def test204(self):
+        self.expect('stop AND foo', AtomNode("foo"), ["stop"])
+
+    def test205(self):
+        self.expect('foo OR stop', AtomNode("foo"), ["stop"])
+
+    def test206(self):
+        self.expect('stop OR foo', AtomNode("foo"), ["stop"])
+
+    def test301(self):
+        self.failure('stop')
+
+    def test302(self):
+        self.failure('stop stop')
+
+    def test303(self):
+        self.failure('stop AND stop')
+
+    def test304(self):
+        self.failure('stop OR stop')
+
+    def test305(self):
+        self.failure('stop -foo')
+
+    def test306(self):
+        self.failure('stop AND NOT foo')
+
+class FakeStopWordRemover:
+
+    def process(self, list):
+        return [word for word in list if word != "stop"]
 
 
 def test_suite():
-    return makeSuite(TestQueryParser)
+    return TestSuite((makeSuite(TestQueryParser),
+                      makeSuite(StopWordTestQueryParser),
+                    ))
 
 if __name__=="__main__":
     main(defaultTest='test_suite')
diff --git a/tests/testZCTextIndex.py b/tests/testZCTextIndex.py
@@ -454,7 +454,7 @@ def setUp(self):
                                StopWordRemover())
         caller = LexiconHolder(self.lexicon)
         self.zc_index = ZCTextIndex('name', extra, caller, self.IndexFactory)
-        self.p = self.parser = QueryParser(self.lexicon)
+        self.parser = QueryParser(self.lexicon)
         self.index = self.zc_index.index
         self.add_docs()