Skip to content
This repository has been archived by the owner on May 13, 2020. It is now read-only.

Commit

Permalink
Merge casey-zctextindex-fewer-conflicts-branch:
Browse files Browse the repository at this point in the history
  - Indexes and Lexicon now much less likely to generate write conflicts.
    Previously *any* concurrent index/unindex operation would conflict

  - Performance and scalability fix for queries
  • Loading branch information
caseman committed Jun 5, 2003
1 parent ef5bf87 commit 7105342
Show file tree
Hide file tree
Showing 10 changed files with 259 additions and 33 deletions.
20 changes: 18 additions & 2 deletions BaseIndex.py
Expand Up @@ -20,7 +20,7 @@
from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IIBTree, IIBucket, IITreeSet
from BTrees.IIBTree import intersection, difference
import BTrees.Length
from BTrees.Length import Length

from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode
Expand Down Expand Up @@ -83,12 +83,18 @@ def __init__(self, lexicon):
self._docwords = IOBTree()

# Use a BTree length for efficient length computation w/o conflicts
self.length = BTrees.Length.Length()
self.length = Length()
self.document_count = Length()

def length(self):
    """Return the number of words in the index.

    Fallback for old instances: new instances shadow this method with a
    BTrees Length object assigned in __init__, which avoids both the
    linear scan and ZODB write conflicts.
    """
    word_count = len(self._wordinfo)
    return word_count

def document_count(self):
    """Return the number of documents in the index.

    Fallback for old instances: new instances shadow this method with a
    BTrees Length object assigned in __init__, so the count is tracked
    incrementally instead of recomputed here.
    """
    doc_count = len(self._docweight)
    return doc_count

def get_words(self, docid):
"""Return a list of the wordids for a given docid."""
Expand All @@ -104,6 +110,11 @@ def index_doc(self, docid, text):
self._mass_add_wordinfo(wid2weight, docid)
self._docweight[docid] = docweight
self._docwords[docid] = WidCode.encode(wids)
try:
self.document_count.change(1)
except AttributeError:
# Upgrade document_count to Length object
self.document_count = Length(self.document_count())
return len(wids)

# A subclass may wish to extend or override this. This is for adjusting
Expand Down Expand Up @@ -165,6 +176,11 @@ def unindex_doc(self, docid):
self._del_wordinfo(wid, docid)
del self._docwords[docid]
del self._docweight[docid]
try:
self.document_count.change(-1)
except AttributeError:
# Upgrade document_count to Length object
self.document_count = Length(self.document_count())

def search(self, term):
wids = self._lexicon.termToWordIds(term)
Expand Down
4 changes: 2 additions & 2 deletions CosineIndex.py
Expand Up @@ -69,7 +69,7 @@ def __init__(self, lexicon):
def _search_wids(self, wids):
if not wids:
return []
N = float(len(self._docweight))
N = float(self.document_count())
L = []
DictType = type({})
for wid in wids:
Expand All @@ -86,7 +86,7 @@ def query_weight(self, terms):
wids = []
for term in terms:
wids += self._lexicon.termToWordIds(term)
N = float(len(self._docweight))
N = float(self.document_count())
sum = 0.0
for wid in self._remove_oov_wids(wids):
wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)
Expand Down
10 changes: 8 additions & 2 deletions IIndex.py
Expand Up @@ -20,6 +20,9 @@ class IIndex(Interface.Base):
"""Interface for an Index."""

def length():
"""Return the number of words in the index."""

def document_count():
"""Return the number of documents in the index."""

def get_words(docid):
Expand Down Expand Up @@ -62,10 +65,13 @@ def query_weight(terms):
"""

def index_doc(docid, text):
"XXX"
"""Add a document with the specified id and text to the index. If a
document by that id already exists, replace its text with the new
text provided
"""

def unindex_doc(docid):
"XXX"
"""Remove the document with the specified id from the index"""

def has_doc(docid):
"""Returns true if docid is an id of a document in the index"""
27 changes: 15 additions & 12 deletions Lexicon.py
Expand Up @@ -16,6 +16,7 @@

from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree
from BTrees.Length import Length

import ZODB
from Persistence import Persistent
Expand All @@ -37,16 +38,13 @@ def __init__(self, *pipeline):
# we never saw before, and that isn't a known stopword (or otherwise
# filtered out). Returning a special wid value for OOV words is a
# way to let clients know when an OOV word appears.
self._nextwid = 1
self.length = Length()
self._pipeline = pipeline

# Keep some statistics about indexing
self._nbytes = 0 # Number of bytes indexed (at start of pipeline)
self._nwords = 0 # Number of words indexed (after pipeline)

def length(self):
"""Return the number of unique terms in the lexicon."""
return self._nextwid - 1
# Overridden in instances
return len(self._wids)

def words(self):
return self._wids.keys()
Expand All @@ -59,11 +57,15 @@ def items(self):

def sourceToWordIds(self, text):
last = _text2list(text)
for t in last:
self._nbytes += len(t)
for element in self._pipeline:
last = element.process(last)
self._nwords += len(last)
if not hasattr(self.length, 'change'):
# Make sure length is overridden with a BTrees.Length.Length
self.length = Length(self.length())
# Strategically unload the length value so that we get the most
# recent value written to the database to minimize conflicting wids
# XXX this will not work when MVCC is implemented in the ZODB...
self.length._p_deactivate()
return map(self._getWordIdCreate, last)

def termToWordIds(self, text):
Expand Down Expand Up @@ -138,9 +140,10 @@ def _getWordIdCreate(self, word):
return wid

def _new_wid(self):
wid = self._nextwid
self._nextwid += 1
return wid
self.length.change(1)
while self._words.has_key(self.length()): # just to be safe
self.length.change(1)
return self.length()

def _text2list(text):
# Helper: splitter input may be a string or a list of strings
Expand Down
36 changes: 28 additions & 8 deletions OkapiIndex.py
Expand Up @@ -18,6 +18,7 @@
# understand what's going on.

from BTrees.IIBTree import IIBucket
from BTrees.Length import Length

from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex.BaseIndex import BaseIndex, \
Expand Down Expand Up @@ -50,20 +51,29 @@ def __init__(self, lexicon):
# sum(self._docweight.values()), the total # of words in all docs
# This is a long for "better safe than sorry" reasons. It isn't
# used often enough that speed should matter.
self._totaldoclen = 0L
# Use a BTree.Length.Length object to avoid concurrent write conflicts
self._totaldoclen = Length(0L)

def index_doc(self, docid, text):
count = BaseIndex.index_doc(self, docid, text)
self._totaldoclen += count
self._change_doc_len(count)
return count

def _reindex_doc(self, docid, text):
self._totaldoclen -= self._docweight[docid]
self._change_doc_len(-self._docweight[docid])
return BaseIndex._reindex_doc(self, docid, text)

def unindex_doc(self, docid):
self._totaldoclen -= self._docweight[docid]
self._change_doc_len(-self._docweight[docid])
BaseIndex.unindex_doc(self, docid)

def _change_doc_len(self, delta):
# Change total doc length used for scoring
try:
self._totaldoclen.change(delta)
except AttributeError:
# Opportunistically upgrade _totaldoclen attribute to Length object
self._totaldoclen = Length(long(self._totaldoclen + delta))

# The workhorse. Return a list of (IIBucket, weight) pairs, one pair
# for each wid t in wids. The IIBucket, times the weight, maps D to
Expand All @@ -76,8 +86,13 @@ def unindex_doc(self, docid):
def _search_wids(self, wids):
if not wids:
return []
N = float(len(self._docweight)) # total # of docs
meandoclen = self._totaldoclen / N
N = float(self.document_count()) # total # of docs
try:
doclen = self._totaldoclen()
except TypeError:
# _totaldoclen has not yet been upgraded
doclen = self._totaldoclen
meandoclen = doclen / N
K1 = self.K1
B = self.B
K1_plus1 = K1 + 1.0
Expand Down Expand Up @@ -120,8 +135,13 @@ def _search_wids(self, wids):
def _search_wids(self, wids):
if not wids:
return []
N = float(len(self._docweight)) # total # of docs
meandoclen = self._totaldoclen / N
N = float(self.document_count()) # total # of docs
try:
doclen = self._totaldoclen()
except TypeError:
# _totaldoclen has not yet been upgraded
doclen = self._totaldoclen
meandoclen = doclen / N
#K1 = self.K1
#B = self.B
#K1_plus1 = K1 + 1.0
Expand Down
2 changes: 0 additions & 2 deletions ZCTextIndex.py
Expand Up @@ -173,13 +173,11 @@ def _index_object(self, docid, obj, threshold=None, attr=None):
if text is None:
return 0
count = self.index.index_doc(docid, text)
self._p_changed = 1 # XXX
return count

def unindex_object(self, docid):
if self.index.has_doc(docid):
self.index.unindex_doc(docid)
self._p_changed = 1 # XXX

def _apply_index(self, request, cid=''):
"""Apply query specified by request, a mapping containing the query.
Expand Down
2 changes: 0 additions & 2 deletions tests/mhindex.py
Expand Up @@ -441,8 +441,6 @@ def bulkupdate(self, args):
self.updatefolder(f, f.listmessages())
print "Total", len(self.docpaths)
self.commit()
print "Indexed", self.index.lexicon._nbytes, "bytes and",
print self.index.lexicon._nwords, "words;",
print len(self.index.lexicon._words), "unique words."

def updatefolder(self, f, msgs):
Expand Down

0 comments on commit 7105342

Please sign in to comment.