diff --git a/BaseIndex.py b/BaseIndex.py index 874b738..4781678 100644 --- a/BaseIndex.py +++ b/BaseIndex.py @@ -20,6 +20,7 @@ from BTrees.IOBTree import IOBTree from BTrees.IIBTree import IIBTree, IIBucket, IITreeSet from BTrees.IIBTree import intersection, difference +import BTrees.Length from Products.ZCTextIndex.IIndex import IIndex from Products.ZCTextIndex import WidCode @@ -52,6 +53,8 @@ def unique(L): class BaseIndex(Persistent): __implements__ = IIndex + + word_count = 0 def __init__(self, lexicon): self._lexicon = lexicon @@ -80,13 +83,18 @@ def __init__(self, lexicon): # docid -> WidCode'd list of wids # Used for un-indexing, and for phrase search. self._docwords = IOBTree() - + + # Use a BTree length for efficient length computation w/o conflicts + self.length = BTrees.Length.Length() + def length(self): """Return the number of words in the index.""" + # This is overridden per instance return len(self._wordinfo) def get_words(self, docid): """Return a list of the wordids for a given docid.""" + # Note this is overridden in the instance return WidCode.decode(self._docwords[docid]) # A subclass may wish to extend or override this. @@ -239,6 +247,7 @@ def _add_wordinfo(self, wid, f, docid): doc2score = self._wordinfo.get(wid) if doc2score is None: doc2score = {} + self.length.change(1) else: # _add_wordinfo() is called for each update. If the map # size exceeds the DICT_CUTOFF, convert to an IIBTree. @@ -262,15 +271,19 @@ def _add_wordinfo(self, wid, f, docid): def _mass_add_wordinfo(self, wid2weight, docid): dicttype = type({}) get_doc2score = self._wordinfo.get + new_word_count = 0 for wid, weight in wid2weight.items(): doc2score = get_doc2score(wid) if doc2score is None: doc2score = {} + new_word_count += 1 elif (isinstance(doc2score, dicttype) and len(doc2score) == self.DICT_CUTOFF): doc2score = IIBTree(doc2score) doc2score[docid] = weight self._wordinfo[wid] = doc2score # not redundant: Persistency! + self.length.change(new_word_count) + def _del_wordinfo(self, wid, docid): doc2score = self._wordinfo[wid] @@ -278,6 +291,7 @@ def _del_wordinfo(self, wid, docid): numdocs = len(doc2score) if numdocs == 0: del self._wordinfo[wid] + self.length.change(-1) return if numdocs == self.DICT_CUTOFF: new = {} diff --git a/tests/testIndex.py b/tests/testIndex.py index f41ad3e..722343a 100644 --- a/tests/testIndex.py +++ b/tests/testIndex.py @@ -37,6 +37,8 @@ def test_index_document(self, DOCID=1): self.assertEqual(len(self.index._wordinfo), 5) self.assertEqual(len(self.index._docwords), 1) self.assertEqual(len(self.index.get_words(DOCID)), 5) + self.assertEqual(len(self.index._wordinfo), + self.index.length()) for map in self.index._wordinfo.values(): self.assertEqual(len(map), 1) self.assert_(map.has_key(DOCID)) @@ -48,6 +50,8 @@ def test_unindex_document(self): self.assertEqual(len(self.index._docweight), 0) self.assertEqual(len(self.index._wordinfo), 0) self.assertEqual(len(self.index._docwords), 0) + self.assertEqual(len(self.index._wordinfo), + self.index.length()) def test_index_two_documents(self): self.test_index_document() @@ -59,6 +63,8 @@ def test_index_two_documents(self): self.assertEqual(len(self.index._wordinfo), 8) self.assertEqual(len(self.index._docwords), 2) self.assertEqual(len(self.index.get_words(DOCID)), 4) + self.assertEqual(len(self.index._wordinfo), + self.index.length()) wids = self.lexicon.termToWordIds("document") self.assertEqual(len(wids), 1) document_wid = wids[0] @@ -80,6 +86,8 @@ def test_index_two_unindex_one(self): self.assertEqual(len(self.index._wordinfo), 4) self.assertEqual(len(self.index._docwords), 1) self.assertEqual(len(self.index.get_words(DOCID)), 4) + self.assertEqual(len(self.index._wordinfo), + self.index.length()) for map in self.index._wordinfo.values(): self.assertEqual(len(map), 1) self.assert_(map.has_key(DOCID)) @@ -91,6 +99,8 @@ def test_index_duplicated_words(self, DOCID=1): self.assertEqual(len(self.index._wordinfo), 5) self.assertEqual(len(self.index._docwords), 1) self.assertEqual(len(self.index.get_words(DOCID)), 7) + self.assertEqual(len(self.index._wordinfo), + self.index.length()) wids = self.lexicon.termToWordIds("repeat") self.assertEqual(len(wids), 1) repititive_wid = wids[0]