Skip to content
This repository has been archived by the owner on May 13, 2020. It is now read-only.

Commit

Permalink
Reindex docs touching as few docid->w(docid, w) maps as possible.
Browse files Browse the repository at this point in the history
  • Loading branch information
Tim Peters committed May 17, 2002
1 parent 8bfe9b7 commit 2e56b82
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 8 deletions.
43 changes: 41 additions & 2 deletions BaseIndex.py
Expand Up @@ -19,6 +19,7 @@

from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IIBTree, IIBucket, IITreeSet
from BTrees.IIBTree import intersection, difference

from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode
Expand Down Expand Up @@ -91,8 +92,7 @@ def get_words(self, docid):
# A subclass may wish to extend or override this.
def index_doc(self, docid, text):
    """Index docid against text; return the number of words indexed.

    If docid is already present, delegate to _reindex_doc, which
    touches only the docid->w(docid, w) maps that actually change
    instead of unindexing the old version wholesale.
    """
    if self._docwords.has_key(docid):
        return self._reindex_doc(docid, text)
    wids = self._lexicon.sourceToWordIds(text)
    wid2weight, docweight = self._get_frequencies(wids)
    # Record this doc's score under each word it contains.
    for wid, weight in wid2weight.items():
        self._add_wordinfo(wid, weight, docid)
    self._docweight[docid] = docweight
    self._docwords[docid] = WidCode.encode(wids)
    return len(wids)

# A subclass may wish to extend or override this.  Replaces the indexed
# form of a document that is already indexed.  The point is to be
# cheaper than unindexing the old version in full and then indexing the
# new version in full.
def _reindex_doc(self, docid, text):
    # Strategy: build wid->score maps for the old and the new text,
    # then update only the docid->w(docid, score) maps in ._wordinfo
    # whose membership or score actually changes.
    prev_wids = self.get_words(docid)
    prev_weights, prev_docw = self._get_frequencies(prev_wids)

    fresh_wids = self._lexicon.sourceToWordIds(text)
    fresh_weights, fresh_docw = self._get_frequencies(fresh_wids)

    prev_set = IITreeSet(prev_weights.keys())
    fresh_set = IITreeSet(fresh_weights.keys())

    surviving = intersection(prev_set, fresh_set)
    dropped = difference(prev_set, surviving)
    added = difference(fresh_set, surviving)
    del prev_set, fresh_set

    # Words that vanished: remove this doc from their maps.
    for wid in dropped.keys():
        self._del_wordinfo(wid, docid)

    # Brand-new words: enter this doc into their maps.
    for wid in added.keys():
        self._add_wordinfo(wid, fresh_weights[wid], docid)

    # Surviving words: rewrite the score only when it changed.  For the
    # Okapi indexer that triggers only for words whose counts changed.
    # For the cosine indexer it may trigger for every wid, since W(d)
    # probably changed and W(d) is divided into every score.
    for wid in surviving.keys():
        score = fresh_weights[wid]
        if prev_weights[wid] != score:
            self._add_wordinfo(wid, score, docid)

    self._docweight[docid] = fresh_docw
    self._docwords[docid] = WidCode.encode(fresh_wids)
    return len(fresh_wids)

# Subclass must override.
def _get_frequencies(self, wids):
# Compute term frequencies and a doc weight, whatever those mean
Expand Down
5 changes: 5 additions & 0 deletions OkapiIndex.py
Expand Up @@ -54,6 +54,11 @@ def __init__(self, lexicon):
def index_doc(self, docid, text):
    # Index via the shared base implementation, then fold this doc's
    # word count into the running total that Okapi's length
    # normalization divides by.
    wordcount = BaseIndex.index_doc(self, docid, text)
    self._totaldoclen = self._totaldoclen + wordcount
    return wordcount

def _reindex_doc(self, docid, text):
    # Back the old version's length out of the running total.  The new
    # version's length is NOT added here: on the reindex path this is
    # reached from index_doc (BaseIndex.index_doc dispatches to it),
    # and index_doc adds the returned count to _totaldoclen afterwards.
    self._totaldoclen = self._totaldoclen - self._docweight[docid]
    return BaseIndex._reindex_doc(self, docid, text)

def unindex_doc(self, docid):
self._totaldoclen -= self._docweight[docid]
Expand Down
29 changes: 23 additions & 6 deletions tests/testZCTextIndex.py
Expand Up @@ -142,18 +142,29 @@ class CosineIndexTests(ZCIndexTestsBase, testIndex.CosineIndexTest):
def testRanking(self):
    self.words = ["cold", "days", "eat", "hot", "lot", "nine", "old",
                  "pease", "porridge", "pot"]
    self.docs = ["Pease porridge hot, pease porridge cold,",
                 "Pease porridge in the pot,",
                 "Nine days old.",
                 "In the pot cold, in the pot hot,",
                 "Pease porridge, pease porridge,",
                 "Eat the lot."]
    self._ranking_index()
    self._ranking_tf()
    self._ranking_idf()
    self._ranking_queries()

    # Digression to exercise re-indexing: replace the last docid's
    # text a couple of times, ending with its original text.  That
    # must leave the index exactly as it was, so the ranking checks
    # have to pass again.
    docs = self.docs
    variants = ("hot cold porridge python", "pease hot pithy ", docs[-1])
    for variant in variants:
        self.zc_index.index_object(len(docs), Indexable(variant))
    self._ranking_tf()
    self._ranking_idf()
    self._ranking_queries()

def _ranking_index(self):
    """Index the shared ranking corpus, assigning docids 1..len(docs).

    The documents come from self.docs, which the caller (testRanking)
    sets up; the literal list formerly duplicated here was dead code,
    immediately overwritten by this assignment.
    """
    docs = self.docs
    for i in range(len(docs)):
        self.zc_index.index_object(i + 1, Indexable(docs[i]))

Expand Down Expand Up @@ -220,6 +231,12 @@ def testAbsoluteScores(self):
"one two three"]
for i in range(len(docs)):
self.zc_index.index_object(i + 1, Indexable(docs[i]))

# A brief digression to exercise re-indexing. This should leave
# things exactly as they were.
for variant in "one xyz", "xyz two three", "abc def", docs[-1]:
self.zc_index.index_object(len(docs), Indexable(variant))

self.assertEqual(self.index._totaldoclen, 6)
# So the mean doc length is 2. We use that later.

Expand Down

0 comments on commit 2e56b82

Please sign in to comment.