Skip to content
This repository has been archived by the owner on May 13, 2020. It is now read-only.

Commit

Permalink
Use the new SetOps for mass union/intersection.
Browse files Browse the repository at this point in the history
  • Loading branch information
Tim Peters committed May 15, 2002
1 parent 6aadf5a commit 81933db
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 56 deletions.
29 changes: 7 additions & 22 deletions Index.py
Expand Up @@ -17,11 +17,12 @@
import math

from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IIBTree, IIBucket, IISet
from BTrees.IIBTree import weightedIntersection, weightedUnion
from BTrees.IIBTree import IIBTree, IIBucket

from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode
from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \
mass_weightedUnion

import ZODB
from Persistence import Persistent
Expand Down Expand Up @@ -62,7 +63,7 @@ def __init__(self, lexicon):
def length(self):
    """Number of documents currently stored in the index."""
    docid_map = self._docwords
    return len(docid_map)

def get_words(self, docid):
    """Return the decoded list of word ids recorded for *docid*."""
    encoded = self._docwords[docid]
    return WidCode.decode(encoded)
Expand Down Expand Up @@ -114,15 +115,15 @@ def unindex_doc(self, docid):

def search(self, term):
wids = self._lexicon.termToWordIds(term)
return self._union(self._search_wids(wids))
return mass_weightedUnion(self._search_wids(wids))

def search_glob(self, pattern):
wids = self._lexicon.globToWordIds(pattern)
return self._union(self._search_wids(wids))
return mass_weightedUnion(self._search_wids(wids))

def search_phrase(self, phrase):
wids = self._lexicon.termToWordIds(phrase)
hits = self._intersection(self._search_wids(wids))
hits = mass_weightedIntersection(self._search_wids(wids))
if not hits:
return hits
code = WidCode.encode(wids)
Expand All @@ -149,22 +150,6 @@ def _search_wids(self, wids):
L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
return L

def _intersection(self, L):
    """Return the weighted intersection of all (mapping, weight) pairs in L.

    L is a list of (IIBucket-or-IIBTree, weight) pairs, as produced by
    _search_wids().  Returns an empty IIBTree when L is empty.
    """
    if not L:
        return IIBTree()
    # Intersect with the smallest mapping first, so intermediate results
    # stay as small as possible.  This matches the OkapiIndex version of
    # this method and removes the reliance on callers pre-sorting L.
    L = L[:]  # don't mutate the caller's L
    L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
    d2w, weight = L[0]
    # Union against an empty tree just applies the first pair's weight.
    dummy, result = weightedUnion(IIBTree(), d2w, 1, weight)
    for d2w, weight in L[1:]:
        dummy, result = weightedIntersection(result, d2w, 1, weight)
    return result

def _union(self, L):
    """Return the weighted union of all (mapping, weight) pairs in L."""
    # XXX This can be optimized, see OkapiIndex
    acc = IIBTree()
    for mapping, w in L:
        ignored, acc = weightedUnion(acc, mapping, 1, w)
    return acc

def query_weight(self, terms):
wids = []
for term in terms:
Expand Down
41 changes: 7 additions & 34 deletions OkapiIndex.py
Expand Up @@ -20,12 +20,13 @@
import math

from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IIBTree, IIBucket, IISet
from BTrees.IIBTree import weightedIntersection, weightedUnion
from BTrees.IIBTree import IIBTree, IIBucket

from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode
from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex import WidCode
from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \
mass_weightedUnion

# Instead of storing floats, we generally store scaled ints. Binary pickles
# can store those more efficiently. The default SCALE_FACTOR of 1024
Expand Down Expand Up @@ -98,15 +99,15 @@ def unindex_doc(self, docid):

def search(self, term):
wids = self._lexicon.termToWordIds(term)
return self._union(self._search_wids(wids))
return mass_weightedUnion(self._search_wids(wids))

def search_glob(self, pattern):
wids = self._lexicon.globToWordIds(pattern)
return self._union(self._search_wids(wids))
return mass_weightedUnion(self._search_wids(wids))

def search_phrase(self, phrase):
wids = self._lexicon.termToWordIds(phrase)
hits = self._intersection(self._search_wids(wids))
hits = mass_weightedIntersection(self._search_wids(wids))
if not hits:
return hits
code = WidCode.encode(wids)
Expand Down Expand Up @@ -156,34 +157,6 @@ def _search_wids(self, wids):
# of tf would still done at Python speed, and it's a lot more
# work than just multiplying by idf.

def _intersection(self, L):
    """Return the weighted intersection of all (mapping, weight) pairs in L."""
    if not L:
        return IIBTree()
    # Work on a sorted copy, smallest mapping first, so intermediate
    # results stay small; the caller's list is left untouched.
    pairs = L[:]
    pairs.sort(lambda a, b: cmp(len(a[0]), len(b[0])))
    first_map, first_weight = pairs[0]
    # Seed the accumulator: union with an empty tree applies the first
    # pair's weight.
    ignored, result = weightedUnion(IIBTree(), first_map, 1, first_weight)
    for d2w, weight in pairs[1:]:
        ignored, result = weightedIntersection(result, d2w, 1, weight)
    return result

def _union(self, L):
    """Return the weighted union of all (mapping, weight) pairs in L.

    Unions are balanced by repeatedly merging the two smallest mappings,
    smallest to largest, to keep the total merge work low.
    """
    if not L:
        return IIBTree()
    # Priority queue keyed on mapping size; smallest pops first.
    queue = NBest(len(L))
    for mapping, weight in L:
        queue.add((mapping, weight), len(mapping))
    # Merge the two smallest entries and push the result back until
    # only one mapping remains.
    while len(queue) > 1:
        (a, wa), ignored = queue.pop_smallest()
        (b, wb), ignored = queue.pop_smallest()
        ignored, merged = weightedUnion(a, b, wa, wb)
        queue.add((merged, 1), len(merged))
    (result, final_weight), final_score = queue.pop_smallest()
    return result

def query_weight(self, terms):
# XXX I have no idea what to put here
return 10
Expand Down

0 comments on commit 81933db

Please sign in to comment.