
Refactor/combine _docweight/_doclen.
Tim Peters committed May 17, 2002
1 parent 53f93fe commit 46df9cc
Showing 4 changed files with 25 additions and 47 deletions.
BaseIndex.py: 7 additions & 1 deletion
@@ -53,7 +53,7 @@ def __init__(self, lexicon):

# wid -> {docid -> weight}; t -> D -> w(D, t)
# Different indexers have different notions of term weight, but we
# expect all indexers to use ._wordinfo to map wids to its notion
# expect each indexer to use ._wordinfo to map wids to its notion
# of a docid-to-weight map.
# There are two kinds of OOV words: wid 0 is explicitly OOV,
# and it's possible that the lexicon will return a non-zero wid
@@ -64,6 +64,12 @@ def __init__(self, lexicon):
# wid 0 must not be a key in _wordinfo.
self._wordinfo = IOBTree()

# docid -> weight
# Different indexers have different notions of doc weight, but we
# expect each indexer to use ._docweight to map docids to its
# notion of what a doc weight is.
self._docweight = IIBTree()

# docid -> WidCode'd list of wids
# Used for un-indexing, and for phrase search.
self._docwords = IOBTree()
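As a side note on the three mappings gathered in BaseIndex above: the following is a minimal sketch of how an indexer fills them in, not the Zope code itself. Plain dicts stand in for the IOBTree/IIBTree containers, a whitespace split stands in for the lexicon's word-id assignment, and raw term frequency stands in for whatever notion of weight a concrete indexer uses.

    _wordinfo = {}    # wid -> {docid -> weight}; here weight is raw frequency
    _docweight = {}   # docid -> the indexer's per-document weight; here word count
    _docwords = {}    # docid -> list of wids (the real index stores these WidCode'd)

    _vocab = {}       # toy lexicon: word -> wid, with wid 0 reserved as OOV

    def _to_wids(text):
        # Assign wids starting at 1 so wid 0 stays the explicit OOV marker.
        return [_vocab.setdefault(w, len(_vocab) + 1) for w in text.split()]

    def index_doc(docid, text):
        wids = _to_wids(text)
        _docwords[docid] = wids
        _docweight[docid] = len(wids)
        for wid in wids:
            _wordinfo.setdefault(wid, {})
            _wordinfo[wid][docid] = _wordinfo[wid].get(docid, 0) + 1

    index_doc(1, "simple document contains five words")
    assert _docweight[1] == 5 and len(_wordinfo) == 5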
CosineIndex.py: 1 addition & 1 deletion
@@ -54,8 +54,8 @@ def __init__(self, lexicon):
# ._wordinfo for cosine is wid -> {docid -> weight};
# t -> D -> w(d, t)/W(d)

# ._docweight for Okapi is
# docid -> W(docid)
self._docweight = IIBTree()

# Most of the computation for computing a relevance score for the
# document occurs in the search() method. The code currently
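A brief aside on the W(d) notation used in the cosine comments: W(d) is the Euclidean length of the document's term-weight vector, so the stored per-term values w(d, t)/W(d) are length-normalized. The real CosineIndex has its own way of deriving the per-term weights; this sketch only illustrates the normalization step.

    import math

    def doc_weight(term_weights):
        # W(d): Euclidean length of the term-weight vector for one document.
        return math.sqrt(sum(w * w for w in term_weights))

    weights = [1.0, 2.0, 2.0]
    W = doc_weight(weights)               # 3.0
    normalized = [w / W for w in weights]  # the w(d, t)/W(d) values kept per wid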
OkapiIndex.py: 7 additions & 7 deletions
@@ -63,20 +63,20 @@ def __init__(self, lexicon):
# ._wordinfo for Okapi is
# wid -> {docid -> frequency}; t -> D -> f(D, t)

# ._docweight for Okapi is
# docid -> # of words in the doc
# This is just len(self._docwords[docid]), but _docwords is stored
# in compressed form, so uncompressing it just to count the list
# length would be ridiculously expensive.
self._doclen = IIBTree()

# sum(self._doclen.values()), the total # of words in all docs
# sum(self._docweight.values()), the total # of words in all docs
# This is a long for "better safe than sorry" reasons. It isn't
# used often enough that speed should matter.
self._totaldoclen = 0L

def index_doc(self, docid, text):
wids = self._lexicon.sourceToWordIds(text)
self._doclen[docid] = len(wids)
self._docweight[docid] = len(wids)
self._totaldoclen += len(wids)

wid2count = self._get_frequencies(wids)
@@ -92,8 +92,8 @@ def unindex_doc(self, docid):

del self._docwords[docid]

count = self._doclen[docid]
del self._doclen[docid]
count = self._docweight[docid]
del self._docweight[docid]
self._totaldoclen -= count

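The paired updates in index_doc and unindex_doc keep a simple invariant: _totaldoclen always equals the sum of the per-document lengths now held in _docweight, which is what makes the mean-document-length computation in _search_wids cheap. A hypothetical sanity check (not part of the module) would be:

    def check_length_invariant(index):
        # Hypothetical helper: the running total of word counts must match
        # the per-document lengths recorded in _docweight.
        assert index._totaldoclen == sum(index._docweight.values())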
# The workhorse. Return a list of (IIBucket, weight) pairs, one pair
@@ -105,7 +105,7 @@ def unindex_doc(self, docid):
def _search_wids(self, wids):
if not wids:
return []
N = float(len(self._doclen)) # total # of docs
N = float(len(self._docweight)) # total # of docs
meandoclen = self._totaldoclen / N
K1 = self.K1
B = self.B
@@ -117,7 +117,7 @@ def _search_wids(self, wids):
# f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

L = []
docid2len = self._doclen
docid2len = self._docweight
for t in wids:
assert self._wordinfo.has_key(t) # caller responsible for OOV
d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
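For reference, the comment above quotes only the denominator of the Okapi term-frequency factor. Built from the quantities the code keeps around, namely f(D, t) from _wordinfo, len(D) from _docweight, the mean document length, and the document count N, the textbook BM25 contribution of one term to one document's score looks roughly like the sketch below. This is the standard formula stated for orientation, not a verbatim copy of the elided _search_wids body, and the K1/B defaults shown are just common choices.

    from math import log

    def bm25_term_score(f, doclen, meandoclen, n_docs, n_docs_with_term,
                        K1=1.2, B=0.75):
        # Inverse document frequency of the term.
        idf = log(1.0 + (n_docs - n_docs_with_term + 0.5)
                        / (n_docs_with_term + 0.5))
        # Term-frequency factor; its denominator is the expression quoted in
        # the comment above: f(D, t) + K1*((1-B) + B*len(D)/E(len(D))).
        tf = f * (K1 + 1.0) / (f + K1 * ((1.0 - B) + B * doclen / meandoclen))
        return idf * tf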
tests/testIndex.py: 10 additions & 38 deletions
@@ -18,34 +18,20 @@
from Products.ZCTextIndex.CosineIndex import CosineIndex
from Products.ZCTextIndex.OkapiIndex import OkapiIndex

# The cosine and Okapi indices have the same public interfaces, but these
# tests access internal attributes, and those aren't identical.
# The IndexTest class is abstract, and subclasses must implement the
# check_docid_known and num_docs_known methods. CosineIndexTest (later in
# this file) does those in terms of ._docweight, while OkapiIndexTest
# (later in this file) does them in terms of ._doclen.
# Subclasses must set a class variable IndexFactory to the appropriate
# index object constructor.

class IndexTest(TestCase):

# Subclasses must implement these methods, and set a class variable
# IndexFactory to the appropriate index object constructor.

def check_docid_known(self, DOCID):
raise NotImplementedError

def num_docs_known(self):
raise NotImplementedError


def setUp(self):
self.lexicon = Lexicon(Splitter())
self.index = self.IndexFactory(self.lexicon)

def test_index_document(self, DOCID=1):
doc = "simple document contains five words"
self.index.index_doc(DOCID, doc)
self.check_docid_known(DOCID)
self.assertEqual(self.num_docs_known(), 1)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._docweight), 1)
self.assertEqual(len(self.index._wordinfo), 5)
self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 5)
@@ -57,7 +57,7 @@ def test_unindex_document(self):
DOCID = 1
self.test_index_document(DOCID)
self.index.unindex_doc(DOCID)
self.assertEqual(self.num_docs_known(), 0)
self.assertEqual(len(self.index._docweight), 0)
self.assertEqual(len(self.index._wordinfo), 0)
self.assertEqual(len(self.index._docwords), 0)

@@ -66,8 +66,8 @@ def test_index_two_documents(self):
doc = "another document just four"
DOCID = 2
self.index.index_doc(DOCID, doc)
self.check_docid_known(DOCID)
self.assertEqual(self.num_docs_known(), 2)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._docweight), 2)
self.assertEqual(len(self.index._wordinfo), 8)
self.assertEqual(len(self.index._docwords), 2)
self.assertEqual(len(self.index.get_words(DOCID)), 4)
@@ -87,8 +73,8 @@ def test_index_two_unindex_one(self):
self.test_index_two_documents()
self.index.unindex_doc(1)
DOCID = 2
self.assertEqual(self.num_docs_known(), 1)
self.check_docid_known(DOCID)
self.assertEqual(len(self.index._docweight), 1)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._wordinfo), 4)
self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 4)
@@ -99,7 +85,7 @@ def test_index_duplicated_words(self, DOCID=1):
def test_index_duplicated_words(self, DOCID=1):
doc = "very simple repeat repeat repeat document test"
self.index.index_doc(DOCID, doc)
self.check_docid_known(DOCID)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._wordinfo), 5)
self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 7)
@@ -144,23 +130,9 @@ def test_search_glob(self):
class CosineIndexTest(IndexTest):
IndexFactory = CosineIndex

def check_docid_known(self, docid):
self.assert_(self.index._docweight.has_key(docid))
self.assert_(self.index._docweight[docid] > 0)

def num_docs_known(self):
return len(self.index._docweight)

class OkapiIndexTest(IndexTest):
IndexFactory = OkapiIndex

def check_docid_known(self, docid):
self.assert_(self.index._doclen.has_key(docid))
self.assert_(self.index._doclen[docid] > 0)

def num_docs_known(self):
return len(self.index._doclen)

def test_suite():
return TestSuite((makeSuite(CosineIndexTest),
makeSuite(OkapiIndexTest),
