diff --git a/BaseIndex.py b/BaseIndex.py index 5bdcccf..291d085 100644 --- a/BaseIndex.py +++ b/BaseIndex.py @@ -53,7 +53,7 @@ def __init__(self, lexicon): # wid -> {docid -> weight}; t -> D -> w(D, t) # Different indexers have different notions of term weight, but we - # expect all indexers to use ._wordinfo to map wids to its notion + # expect each indexer to use ._wordinfo to map wids to its notion # of a docid-to-weight map. # There are two kinds of OOV words: wid 0 is explicitly OOV, # and it's possible that the lexicon will return a non-zero wid @@ -64,6 +64,12 @@ def __init__(self, lexicon): # wid 0 must not be a key in _wordinfo. self._wordinfo = IOBTree() + # docid -> weight + # Different indexers have different notions of doc weight, but we + # expect each indexer to use ._docweight to map docids to its + # notion of what a doc weight is. + self._docweight = IIBTree() + # docid -> WidCode'd list of wids # Used for un-indexing, and for phrase search. self._docwords = IOBTree() diff --git a/CosineIndex.py b/CosineIndex.py index 58e1a92..a1acece 100644 --- a/CosineIndex.py +++ b/CosineIndex.py @@ -54,8 +54,8 @@ def __init__(self, lexicon): # ._wordinfo for cosine is wid -> {docid -> weight}; # t -> D -> w(d, t)/W(d) + # ._docweight for cosine is # docid -> W(docid) - self._docweight = IIBTree() # Most of the computation for computing a relevance score for the # document occurs in the search() method. The code currently diff --git a/OkapiIndex.py b/OkapiIndex.py index c4eb928..13b22ba 100644 --- a/OkapiIndex.py +++ b/OkapiIndex.py @@ -63,20 +63,20 @@ def __init__(self, lexicon): # ._wordinfo for Okapi is # wid -> {docid -> frequency}; t -> D -> f(D, t) + # ._docweight for Okapi is # docid -> # of words in the doc # This is just len(self._docwords[docid]), but _docwords is stored # in compressed form, so uncompressing it just to count the list # length would be ridiculously expensive. 
- self._doclen = IIBTree() - # sum(self._doclen.values()), the total # of words in all docs + # sum(self._docweight.values()), the total # of words in all docs # This is a long for "better safe than sorry" reasons. It isn't # used often enough that speed should matter. self._totaldoclen = 0L def index_doc(self, docid, text): wids = self._lexicon.sourceToWordIds(text) - self._doclen[docid] = len(wids) + self._docweight[docid] = len(wids) self._totaldoclen += len(wids) wid2count = self._get_frequencies(wids) @@ -92,8 +92,8 @@ def unindex_doc(self, docid): del self._docwords[docid] - count = self._doclen[docid] - del self._doclen[docid] + count = self._docweight[docid] + del self._docweight[docid] self._totaldoclen -= count # The workhorse. Return a list of (IIBucket, weight) pairs, one pair @@ -105,7 +105,7 @@ def unindex_doc(self, docid): def _search_wids(self, wids): if not wids: return [] - N = float(len(self._doclen)) # total # of docs + N = float(len(self._docweight)) # total # of docs meandoclen = self._totaldoclen / N K1 = self.K1 B = self.B @@ -117,7 +117,7 @@ def _search_wids(self, wids): # f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D))) L = [] - docid2len = self._doclen + docid2len = self._docweight for t in wids: assert self._wordinfo.has_key(t) # caller responsible for OOV d2f = self._wordinfo[t] # map {docid -> f(docid, t)} diff --git a/tests/testIndex.py b/tests/testIndex.py index d76c5f2..ac15745 100644 --- a/tests/testIndex.py +++ b/tests/testIndex.py @@ -18,25 +18,11 @@ from Products.ZCTextIndex.CosineIndex import CosineIndex from Products.ZCTextIndex.OkapiIndex import OkapiIndex -# The cosine and Okapi indices have the same public interfaces, but these -# tests access internal attributes, and those aren't identical. -# The IndexTest class is abstract, and subclasses must implement the -# check_docid_known and num_docs_known methods. 
CosineIndexTest (later in -# this file) does those in terms of ._docweight, while OkapiIndexTest -# (later in this file) does them in terms of ._doclen. +# Subclasses must set a class variable IndexFactory to the appropriate +# index object constructor. class IndexTest(TestCase): - # Subclasses must implement these methods, and set a class variable - # IndexFactory to the appropriate index object constructor. - - def check_docid_known(self, DOCID): - raise NotImplementedError - - def num_docs_known(self): - raise NotImplementedError - - def setUp(self): self.lexicon = Lexicon(Splitter()) self.index = self.IndexFactory(self.lexicon) @@ -44,8 +30,8 @@ def setUp(self): def test_index_document(self, DOCID=1): doc = "simple document contains five words" self.index.index_doc(DOCID, doc) - self.check_docid_known(DOCID) - self.assertEqual(self.num_docs_known(), 1) + self.assert_(self.index._docweight[DOCID]) + self.assertEqual(len(self.index._docweight), 1) self.assertEqual(len(self.index._wordinfo), 5) self.assertEqual(len(self.index._docwords), 1) self.assertEqual(len(self.index.get_words(DOCID)), 5) @@ -57,7 +43,7 @@ def test_unindex_document(self): DOCID = 1 self.test_index_document(DOCID) self.index.unindex_doc(DOCID) - self.assertEqual(self.num_docs_known(), 0) + self.assertEqual(len(self.index._docweight), 0) self.assertEqual(len(self.index._wordinfo), 0) self.assertEqual(len(self.index._docwords), 0) @@ -66,8 +52,8 @@ def test_index_two_documents(self): doc = "another document just four" DOCID = 2 self.index.index_doc(DOCID, doc) - self.check_docid_known(DOCID) - self.assertEqual(self.num_docs_known(), 2) + self.assert_(self.index._docweight[DOCID]) + self.assertEqual(len(self.index._docweight), 2) self.assertEqual(len(self.index._wordinfo), 8) self.assertEqual(len(self.index._docwords), 2) self.assertEqual(len(self.index.get_words(DOCID)), 4) @@ -87,8 +73,8 @@ def test_index_two_unindex_one(self): self.test_index_two_documents() self.index.unindex_doc(1) DOCID = 2 
- self.assertEqual(self.num_docs_known(), 1) - self.check_docid_known(DOCID) + self.assertEqual(len(self.index._docweight), 1) + self.assert_(self.index._docweight[DOCID]) self.assertEqual(len(self.index._wordinfo), 4) self.assertEqual(len(self.index._docwords), 1) self.assertEqual(len(self.index.get_words(DOCID)), 4) @@ -99,7 +85,7 @@ def test_index_two_unindex_one(self): def test_index_duplicated_words(self, DOCID=1): doc = "very simple repeat repeat repeat document test" self.index.index_doc(DOCID, doc) - self.check_docid_known(DOCID) + self.assert_(self.index._docweight[DOCID]) self.assertEqual(len(self.index._wordinfo), 5) self.assertEqual(len(self.index._docwords), 1) self.assertEqual(len(self.index.get_words(DOCID)), 7) @@ -144,23 +130,9 @@ def test_search_glob(self): class CosineIndexTest(IndexTest): IndexFactory = CosineIndex - def check_docid_known(self, docid): - self.assert_(self.index._docweight.has_key(docid)) - self.assert_(self.index._docweight[docid] > 0) - - def num_docs_known(self): - return len(self.index._docweight) - class OkapiIndexTest(IndexTest): IndexFactory = OkapiIndex - def check_docid_known(self, docid): - self.assert_(self.index._doclen.has_key(docid)) - self.assert_(self.index._doclen[docid] > 0) - - def num_docs_known(self): - return len(self.index._doclen) - def test_suite(): return TestSuite((makeSuite(CosineIndexTest), makeSuite(OkapiIndexTest),