Fix queries of the form 'extension module C'.

zopefoundation · May 16, 2002 · f94baf2
1 parent 0c561da
commit f94baf2
Show file tree

Hide file tree

Showing 7 changed files with 27 additions and 11 deletions.
diff --git a/CosineIndex.py b/CosineIndex.py
@@ -115,6 +115,10 @@ def unindex_doc(self, docid):
 
     def search(self, term):
         wids = self._lexicon.termToWordIds(term)
+        if not wids:
+            return None # All docs match
+        if 0 in wids:
+            wids = filter(None, wids)
         return mass_weightedUnion(self._search_wids(wids))
 
     def search_glob(self, pattern):
@@ -123,6 +127,8 @@ def search_glob(self, pattern):
 
     def search_phrase(self, phrase):
         wids = self._lexicon.termToWordIds(phrase)
+        if 0 in wids:
+            return IIBTree()
         hits = mass_weightedIntersection(self._search_wids(wids))
         if not hits:
             return hits
@@ -157,6 +163,8 @@ def query_weight(self, terms):
         N = float(len(self._docweight))
         sum = 0.0
         for wid in wids:
+            if wid == 0:
+                continue
             wt = math.log(1.0 + N / len(self._wordinfo[wid]))
             sum += wt ** 2.0
         return scaled_int(math.sqrt(sum))

diff --git a/Lexicon.py b/Lexicon.py
@@ -62,9 +62,7 @@ def termToWordIds(self, text):
             last = element.process(last)
         wids = []
         for word in last:
-            wid = self._wids.get(word)
-            if wid is not None:
-                wids.append(wid)
+            wids.append(self._wids.get(word, 0))
         return wids
 
     def get_word(self, wid):

diff --git a/OkapiIndex.py b/OkapiIndex.py
@@ -109,6 +109,10 @@ def unindex_doc(self, docid):
 
     def search(self, term):
         wids = self._lexicon.termToWordIds(term)
+        if not wids:
+            return None # All docs match
+        if 0 in wids:
+            wids = filter(None, wids)
         return mass_weightedUnion(self._search_wids(wids))
 
     def search_glob(self, pattern):
@@ -117,6 +121,8 @@ def search_glob(self, pattern):
 
     def search_phrase(self, phrase):
         wids = self._lexicon.termToWordIds(phrase)
+        if 0 in wids:
+            return IIBTree()
         hits = mass_weightedIntersection(self._search_wids(wids))
         if not hits:
             return hits

diff --git a/SetOps.py b/SetOps.py
@@ -20,10 +20,10 @@
 
 def mass_weightedIntersection(L):
     "A list of (mapping, weight) pairs -> their weightedIntersection IIBTree."
+    L = [(map, weight) for (map, weight) in L if map is not None]
     if not L:
         return IIBTree()
     # Intersect with smallest first.
-    L = L[:]    # don't mutate the caller's L
     L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
     x, w = L[0]
     dummy, result = weightedUnion(IIBTree(), x, 1, w)

diff --git a/ZCTextIndex.py b/ZCTextIndex.py
@@ -72,6 +72,8 @@ def query(self, query, nbest=10):
         """
         tree = QueryParser().parseQuery(query)
         results = tree.executeQuery(self.index)
+        if results is None:
+            return [], 0
         chooser = NBest(nbest)
         chooser.addmany(results.items())
         return chooser.getbest(), len(results)

diff --git a/tests/mhindex.py b/tests/mhindex.py
@@ -143,7 +143,7 @@ def interact(self, nbest=NBEST, maxlines=MAXLINES):
                 if not text:
                     continue
             try:
-                n, results = self.timequery(text, top + nbest)
+                results, n = self.timequery(text, top + nbest)
             except:
                 reportexc()
                 text = ""
@@ -163,7 +163,7 @@ def interact(self, nbest=NBEST, maxlines=MAXLINES):
             top += nbest
 
     def query(self, text, nbest=NBEST, maxlines=MAXLINES):
-        n, results = self.timequery(text, nbest)
+        results, n = self.timequery(text, nbest)
         if not n:
             print "No hits for %r." % text
             return
@@ -173,11 +173,11 @@ def query(self, text, nbest=NBEST, maxlines=MAXLINES):
     def timequery(self, text, nbest):
         t0 = time.time()
         c0 = time.clock()
-        n, results = self.index.query(text, nbest)
+        results, n = self.index.query(text, nbest)
         t1 = time.time()
         c1 = time.clock()
         print "[Query time: %.3f real, %.3f user]" % (t1-t0, c1-c0)
-        return n, results
+        return results, n
 
     def formatresults(self, text, results, maxlines=MAXLINES,
                       lo=0, hi=sys.maxint):
@@ -397,9 +397,11 @@ def query(self, query, nbest=10):
         parser = QueryParser()
         tree = parser.parseQuery(query)
         results = tree.executeQuery(self.index)
+        if results is None:
+            return [], 0
         chooser = NBest(nbest)
         chooser.addmany(results.items())
-        return len(results), chooser.getbest()
+        return chooser.getbest(), len(results)
 
     def query_weight(self, query):
         parser = QueryParser()

diff --git a/tests/testLexicon.py b/tests/testLexicon.py
@@ -76,7 +76,7 @@ def testMissingTermToWordIds(self):
         lexicon = Lexicon(Splitter())
         wids = lexicon.sourceToWordIds('cats and dogs')
         wids = lexicon.termToWordIds('boxes')
-        self.assertEqual(wids, [])
+        self.assertEqual(wids, [0])
 
     def testOnePipelineElement(self):
         lexicon = Lexicon(Splitter(), StupidPipelineElement('dogs', 'fish'))
@@ -94,7 +94,7 @@ def testSplitterAdaptorNofold(self):
         lexicon = Lexicon(Splitter())
         wids = lexicon.sourceToWordIds('CATS and dogs')
         wids = lexicon.termToWordIds('cats and dogs')
-        self.assertEqual(wids, [2, 3])
+        self.assertEqual(wids, [0, 2, 3])
 
     def testTwoElementPipeline(self):
         lexicon = Lexicon(Splitter(),