Reimplemented Index.length to use a BTrees.Length. Previous dynamic computation was way too slow for big indexes.

Updated tests to include length value checks
caseman committed Jun 12, 2002
1 parent 0bf1237 commit b587692
Showing 2 changed files with 25 additions and 1 deletion.
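For context, a minimal sketch (not part of the commit) of how a BTrees.Length.Length object behaves as the counter this change relies on: change() adjusts the stored value, calling the instance returns it, and its ZODB conflict resolution merges concurrent increments, so reading the word count stays cheap instead of recomputing len(self._wordinfo) on every call.

    from BTrees.Length import Length

    # Conflict-friendly persistent counter; reads are O(1) and concurrent
    # change() calls from different transactions are merged, not conflicted.
    word_count = Length()        # starts at 0
    word_count.change(5)         # five new words entered _wordinfo
    word_count.change(-2)        # two words dropped out
    print word_count()           # -> 3
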
16 changes: 15 additions & 1 deletion BaseIndex.py
@@ -20,6 +20,7 @@
from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IIBTree, IIBucket, IITreeSet
from BTrees.IIBTree import intersection, difference
import BTrees.Length

from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode
@@ -52,6 +53,8 @@ def unique(L):
class BaseIndex(Persistent):

__implements__ = IIndex

word_count = 0

def __init__(self, lexicon):
self._lexicon = lexicon
@@ -80,13 +83,18 @@ def __init__(self, lexicon):
# docid -> WidCode'd list of wids
# Used for un-indexing, and for phrase search.
self._docwords = IOBTree()


# Use a BTree length for efficient length computation w/o conflicts
self.length = BTrees.Length.Length()

def length(self):
"""Return the number of words in the index."""
# This is overridden per instance
return len(self._wordinfo)

def get_words(self, docid):
"""Return a list of the wordids for a given docid."""
# Note this is overridden in the instance
return WidCode.decode(self._docwords[docid])

# A subclass may wish to extend or override this.
@@ -239,6 +247,7 @@ def _add_wordinfo(self, wid, f, docid):
doc2score = self._wordinfo.get(wid)
if doc2score is None:
doc2score = {}
self.length.change(1)
else:
# _add_wordinfo() is called for each update. If the map
# size exceeds the DICT_CUTOFF, convert to an IIBTree.
@@ -262,22 +271,27 @@ def _add_wordinfo(self, wid, f, docid):
def _mass_add_wordinfo(self, wid2weight, docid):
dicttype = type({})
get_doc2score = self._wordinfo.get
new_word_count = 0
for wid, weight in wid2weight.items():
doc2score = get_doc2score(wid)
if doc2score is None:
doc2score = {}
new_word_count += 1
elif (isinstance(doc2score, dicttype) and
len(doc2score) == self.DICT_CUTOFF):
doc2score = IIBTree(doc2score)
doc2score[docid] = weight
self._wordinfo[wid] = doc2score # not redundant: Persistency!
self.length.change(new_word_count)


def _del_wordinfo(self, wid, docid):
doc2score = self._wordinfo[wid]
del doc2score[docid]
numdocs = len(doc2score)
if numdocs == 0:
del self._wordinfo[wid]
self.length.change(-1)
return
if numdocs == self.DICT_CUTOFF:
new = {}
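A note on the BaseIndex.py hunks above (my reading of the diff, not something stated in the commit message): because __init__ now assigns self.length as an instance attribute, it shadows the class-level length() method on newly created indexes, while older instances that lack the attribute fall back to the method; that is what the "overridden per instance" comments refer to. A small hypothetical stand-in class, not the real BaseIndex, illustrating the lookup behaviour:

    from BTrees.Length import Length

    class FakeIndex:
        # Hypothetical stand-in for BaseIndex, only to show attribute shadowing.
        def __init__(self):
            self._wordinfo = {}
            self.length = Length()   # instance attribute wins over the method

        def length(self):
            # Fallback for instances created before the Length attribute existed.
            return len(self._wordinfo)

    idx = FakeIndex()
    idx.length.change(3)
    print idx.length()   # -> 3, via Length.__call__, not the method
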
10 changes: 10 additions & 0 deletions tests/testIndex.py
@@ -37,6 +37,8 @@ def test_index_document(self, DOCID=1):
self.assertEqual(len(self.index._wordinfo), 5)
self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 5)
self.assertEqual(len(self.index._wordinfo),
self.index.length())
for map in self.index._wordinfo.values():
self.assertEqual(len(map), 1)
self.assert_(map.has_key(DOCID))
@@ -48,6 +50,8 @@ def test_unindex_document(self):
self.assertEqual(len(self.index._docweight), 0)
self.assertEqual(len(self.index._wordinfo), 0)
self.assertEqual(len(self.index._docwords), 0)
self.assertEqual(len(self.index._wordinfo),
self.index.length())

def test_index_two_documents(self):
self.test_index_document()
@@ -59,6 +63,8 @@ def test_index_two_documents(self):
self.assertEqual(len(self.index._wordinfo), 8)
self.assertEqual(len(self.index._docwords), 2)
self.assertEqual(len(self.index.get_words(DOCID)), 4)
self.assertEqual(len(self.index._wordinfo),
self.index.length())
wids = self.lexicon.termToWordIds("document")
self.assertEqual(len(wids), 1)
document_wid = wids[0]
@@ -80,6 +86,8 @@ def test_index_two_unindex_one(self):
self.assertEqual(len(self.index._wordinfo), 4)
self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 4)
self.assertEqual(len(self.index._wordinfo),
self.index.length())
for map in self.index._wordinfo.values():
self.assertEqual(len(map), 1)
self.assert_(map.has_key(DOCID))
@@ -91,6 +99,8 @@ def test_index_duplicated_words(self, DOCID=1):
self.assertEqual(len(self.index._wordinfo), 5)
self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 7)
self.assertEqual(len(self.index._wordinfo),
self.index.length())
wids = self.lexicon.termToWordIds("repeat")
self.assertEqual(len(wids), 1)
repititive_wid = wids[0]
