
Refactor/combine _docweight/_doclen.
Tim Peters committed May 17, 2002
1 parent 53f93fe commit 46df9cc
Showing 4 changed files with 25 additions and 47 deletions.
BaseIndex.py: 7 additions & 1 deletion
@@ -53,7 +53,7 @@ def __init__(self, lexicon):

# wid -> {docid -> weight}; t -> D -> w(D, t)
# Different indexers have different notions of term weight, but we
# expect all indexers to use ._wordinfo to map wids to its notion
# expect each indexer to use ._wordinfo to map wids to its notion
# of a docid-to-weight map.
# There are two kinds of OOV words: wid 0 is explicitly OOV,
# and it's possible that the lexicon will return a non-zero wid
@@ -64,6 +64,12 @@ def __init__(self, lexicon):
# wid 0 must not be a key in _wordinfo.
self._wordinfo = IOBTree()

# docid -> weight
# Different indexers have different notions of doc weight, but we
# expect each indexer to use ._docweight to map docids to its
# notion of what a doc weight is.
self._docweight = IIBTree()

# docid -> WidCode'd list of wids
# Used for un-indexing, and for phrase search.
self._docwords = IOBTree()
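As a side note on the three mappings gathered in BaseIndex above: the following is a minimal sketch of how an indexer fills them in, not the Zope code itself. Plain dicts stand in for the IOBTree/IIBTree containers, a whitespace split stands in for the lexicon's word-id assignment, and raw term frequency stands in for whatever notion of weight a concrete indexer uses.

    _wordinfo = {}    # wid -> {docid -> weight}; here weight is raw frequency
    _docweight = {}   # docid -> the indexer's per-document weight; here word count
    _docwords = {}    # docid -> list of wids (the real index stores these WidCode'd)

    _vocab = {}       # toy lexicon: word -> wid, with wid 0 reserved as OOV

    def _to_wids(text):
        # Assign wids starting at 1 so wid 0 stays the explicit OOV marker.
        return [_vocab.setdefault(w, len(_vocab) + 1) for w in text.split()]

    def index_doc(docid, text):
        wids = _to_wids(text)
        _docwords[docid] = wids
        _docweight[docid] = len(wids)
        for wid in wids:
            _wordinfo.setdefault(wid, {})
            _wordinfo[wid][docid] = _wordinfo[wid].get(docid, 0) + 1

    index_doc(1, "simple document contains five words")
    assert _docweight[1] == 5 and len(_wordinfo) == 5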
CosineIndex.py: 1 addition & 1 deletion
@@ -54,8 +54,8 @@ def __init__(self, lexicon):
# ._wordinfo for cosine is wid -> {docid -> weight};
# t -> D -> w(d, t)/W(d)

# ._docweight for Okapi is
# docid -> W(docid)
self._docweight = IIBTree()

# Most of the computation for computing a relevance score for the
# document occurs in the search() method. The code currently
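A brief aside on the W(d) notation used in the cosine comments: W(d) is the Euclidean length of the document's term-weight vector, so the stored per-term values w(d, t)/W(d) are length-normalized. The real CosineIndex has its own way of deriving the per-term weights; this sketch only illustrates the normalization step.

    import math

    def doc_weight(term_weights):
        # W(d): Euclidean length of the term-weight vector for one document.
        return math.sqrt(sum(w * w for w in term_weights))

    weights = [1.0, 2.0, 2.0]
    W = doc_weight(weights)               # 3.0
    normalized = [w / W for w in weights]  # the w(d, t)/W(d) values kept per wid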
OkapiIndex.py: 7 additions & 7 deletions
@@ -63,20 +63,20 @@ def __init__(self, lexicon):
# ._wordinfo for Okapi is
# wid -> {docid -> frequency}; t -> D -> f(D, t)

# ._docweight for Okapi is
# docid -> # of words in the doc
# This is just len(self._docwords[docid]), but _docwords is stored
# in compressed form, so uncompressing it just to count the list
# length would be ridiculously expensive.
self._doclen = IIBTree()

# sum(self._doclen.values()), the total # of words in all docs
# sum(self._docweight.values()), the total # of words in all docs
# This is a long for "better safe than sorry" reasons. It isn't
# used often enough that speed should matter.
self._totaldoclen = 0L

def index_doc(self, docid, text):
wids = self._lexicon.sourceToWordIds(text)
self._doclen[docid] = len(wids)
self._docweight[docid] = len(wids)
self._totaldoclen += len(wids)

wid2count = self._get_frequencies(wids)
@@ -92,8 +92,8 @@ def unindex_doc(self, docid):

del self._docwords[docid]

count = self._doclen[docid]
del self._doclen[docid]
count = self._docweight[docid]
del self._docweight[docid]
self._totaldoclen -= count

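The paired updates in index_doc and unindex_doc keep a simple invariant: _totaldoclen always equals the sum of the per-document lengths now held in _docweight, which is what makes the mean-document-length computation in _search_wids cheap. A hypothetical sanity check (not part of the module) would be:

    def check_length_invariant(index):
        # Hypothetical helper: the running total of word counts must match
        # the per-document lengths recorded in _docweight.
        assert index._totaldoclen == sum(index._docweight.values())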
# The workhorse. Return a list of (IIBucket, weight) pairs, one pair
@@ -105,7 +105,7 @@ def unindex_doc(self, docid):
def _search_wids(self, wids):
if not wids:
return []
N = float(len(self._doclen)) # total # of docs
N = float(len(self._docweight)) # total # of docs
meandoclen = self._totaldoclen / N
K1 = self.K1
B = self.B
@@ -117,7 +117,7 @@ def _search_wids(self, wids):
# f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

L = []
docid2len = self._doclen
docid2len = self._docweight
for t in wids:
assert self._wordinfo.has_key(t) # caller responsible for OOV
d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
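For reference, the comment above quotes only the denominator of the Okapi term-frequency factor. Built from the quantities the code keeps around, namely f(D, t) from _wordinfo, len(D) from _docweight, the mean document length, and the document count N, the textbook BM25 contribution of one term to one document's score looks roughly like the sketch below. This is the standard formula stated for orientation, not a verbatim copy of the elided _search_wids body, and the K1/B defaults shown are just common choices.

    from math import log

    def bm25_term_score(f, doclen, meandoclen, n_docs, n_docs_with_term,
                        K1=1.2, B=0.75):
        # Inverse document frequency of the term.
        idf = log(1.0 + (n_docs - n_docs_with_term + 0.5)
                        / (n_docs_with_term + 0.5))
        # Term-frequency factor; its denominator is the expression quoted in
        # the comment above: f(D, t) + K1*((1-B) + B*len(D)/E(len(D))).
        tf = f * (K1 + 1.0) / (f + K1 * ((1.0 - B) + B * doclen / meandoclen))
        return idf * tf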
tests/testIndex.py: 10 additions & 38 deletions
@@ -18,34 +18,20 @@
from Products.ZCTextIndex.CosineIndex import CosineIndex
from Products.ZCTextIndex.OkapiIndex import OkapiIndex

# The cosine and Okapi indices have the same public interfaces, but these
# tests access internal attributes, and those aren't identical.
# The IndexTest class is abstract, and subclasses must implement the
# check_docid_known and num_docs_known methods. CosineIndexTest (later in
# this file) does those in terms of ._docweight, while OkapiIndexTest
# (later in this file) does them in terms of ._doclen.
# Subclasses must set a class variable IndexFactory to the appropriate
# index object constructor.

class IndexTest(TestCase):

# Subclasses must implement these methods, and set a class variable
# IndexFactory to the appropriate index object constructor.

def check_docid_known(self, DOCID):
raise NotImplementedError

def num_docs_known(self):
raise NotImplementedError


def setUp(self):
self.lexicon = Lexicon(Splitter())
self.index = self.IndexFactory(self.lexicon)

def test_index_document(self, DOCID=1):
doc = "simple document contains five words"
self.index.index_doc(DOCID, doc)
self.check_docid_known(DOCID)
self.assertEqual(self.num_docs_known(), 1)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._docweight), 1)
self.assertEqual(len(self.index._wordinfo), 5)
self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 5)
@@ -57,7 +57,7 @@ def test_unindex_document(self):
DOCID = 1
self.test_index_document(DOCID)
self.index.unindex_doc(DOCID)
self.assertEqual(self.num_docs_known(), 0)
self.assertEqual(len(self.index._docweight), 0)
self.assertEqual(len(self.index._wordinfo), 0)
self.assertEqual(len(self.index._docwords), 0)

@@ -66,8 +66,8 @@ def test_index_two_documents(self):
doc = "another document just four"
DOCID = 2
self.index.index_doc(DOCID, doc)
self.check_docid_known(DOCID)
self.assertEqual(self.num_docs_known(), 2)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._docweight), 2)
self.assertEqual(len(self.index._wordinfo), 8)
self.assertEqual(len(self.index._docwords), 2)
self.assertEqual(len(self.index.get_words(DOCID)), 4)
@@ -87,8 +73,8 @@ def test_index_two_unindex_one(self):
self.test_index_two_documents()
self.index.unindex_doc(1)
DOCID = 2
self.assertEqual(self.num_docs_known(), 1)
self.check_docid_known(DOCID)
self.assertEqual(len(self.index._docweight), 1)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._wordinfo), 4)
self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 4)
@@ -99,7 +85,7 @@ def test_index_duplicated_words(self, DOCID=1):
def test_index_duplicated_words(self, DOCID=1):
doc = "very simple repeat repeat repeat document test"
self.index.index_doc(DOCID, doc)
self.check_docid_known(DOCID)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._wordinfo), 5)
self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 7)
@@ -144,23 +130,9 @@ def test_search_glob(self):
class CosineIndexTest(IndexTest):
IndexFactory = CosineIndex

def check_docid_known(self, docid):
self.assert_(self.index._docweight.has_key(docid))
self.assert_(self.index._docweight[docid] > 0)

def num_docs_known(self):
return len(self.index._docweight)

class OkapiIndexTest(IndexTest):
IndexFactory = OkapiIndex

def check_docid_known(self, docid):
self.assert_(self.index._doclen.has_key(docid))
self.assert_(self.index._doclen[docid] > 0)

def num_docs_known(self):
return len(self.index._doclen)

def test_suite():
return TestSuite((makeSuite(CosineIndexTest),
makeSuite(OkapiIndexTest),
