Two changes and a question posing as a comment.

In unindex_doc(), call _del_wordinfo() for each unique wid in the doc, not for each wid. Before we had WidCode and phrase searching, _docwords stored a list of the unique wids. The unindex code wasn't updated when _docwords started storing all the wids, even duplicates. Replace the try/except around __getitem__ in _add_wordinfo() with a .get() call. Add an XXX comment about the purpose of the try/except(s) in _del_wordinfo(). I suspect they only existed because _del_wordinfo() was called repeatedly when a wid existed more than once.
zopefoundation · May 17, 2002 · 6d9ac11 · 6d9ac11
1 parent 61fa4de
commit 6d9ac11
Showing 1 changed file with 19 additions and 4 deletions.
diff --git a/BaseIndex.py b/BaseIndex.py
@@ -44,6 +44,13 @@ def scaled_int(f, scale=SCALE_FACTOR):
     # expensive.
     return int(f * scale + 0.5)
 
+def unique(l):
+    """Return a list of the unique elements in l."""
+    d = {}
+    for elt in l:
+        d[elt] = 1
+    return d.keys()
+
 class BaseIndex(Persistent):
 
     __implements__ = IIndex
@@ -108,7 +115,7 @@ def _get_frequencies(self, wids):
 
     # A subclass may wish to extend or override this.
     def unindex_doc(self, docid):
-        for wid in self.get_words(docid):
+        for wid in unique(self.get_words(docid)):
             self._del_wordinfo(wid, docid)
         del self._docwords[docid]
         del self._docweight[docid]
@@ -184,9 +191,8 @@ def _add_wordinfo(self, wid, f, docid):
         # space when it is live in memory.  An IIBTree stores two C
         # arrays of ints, one for the keys and one for the values.  It
         # holds upto 120 key-value pairs in a single bucket.
-        try:
-            map = self._wordinfo[wid]
-        except KeyError:
+        map = self._wordinfo.get(wid)
+        if map is None:
             map = {}
         else:
             # _add_wordinfo() is called for each update.  If the map
@@ -197,10 +203,19 @@ def _add_wordinfo(self, wid, f, docid):
         self._wordinfo[wid] = map # Not redundant, because of Persistency!
 
     def _del_wordinfo(self, wid, docid):
+        # XXX Not clear if the try/excepts here are guarding against
+        # corrupt data structures or if it is possible for the index
+        # to get in a state where it thinks an entry exists for the
+        # wid, docid pair and it doesn't.
         try:
             map = self._wordinfo[wid]
+        except KeyError:
+##            print "No info for wid", wid
+            return
+        try:
             del map[docid]
         except KeyError:
+##            print "doc %s does not use %s" % (docid, wid)
             return
         if len(map) == 0:
             del self._wordinfo[wid]