Skip to content
This repository has been archived by the owner on May 13, 2020. It is now read-only.

Commit

Permalink
Merge casey-zctextindex-fewer-conflicts-branch:
Browse files Browse the repository at this point in the history
  - Indexes and Lexicon now much less likely to generate write conflicts.
    Previously *any* concurrent index/unindex operation would conflict

  - Performance and scalability fix for queries
  • Loading branch information
caseman committed Jun 5, 2003
1 parent ef5bf87 commit 7105342
Show file tree
Hide file tree
Showing 10 changed files with 259 additions and 33 deletions.
20 changes: 18 additions & 2 deletions BaseIndex.py
Expand Up @@ -20,7 +20,7 @@
from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IIBTree, IIBucket, IITreeSet
from BTrees.IIBTree import intersection, difference
import BTrees.Length
from BTrees.Length import Length

from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode
Expand Down Expand Up @@ -83,12 +83,18 @@ def __init__(self, lexicon):
self._docwords = IOBTree()

# Use a BTree length for efficient length computation w/o conflicts
self.length = BTrees.Length.Length()
self.length = Length()
self.document_count = Length()

def length(self):
    """Return the number of words in the index.

    Fallback for old instances: new instances shadow this method with a
    BTrees Length object assigned in __init__, which avoids both the
    linear scan and ZODB write conflicts.
    """
    word_count = len(self._wordinfo)
    return word_count

def document_count(self):
    """Return the number of documents in the index.

    Fallback for old instances: new instances shadow this method with a
    BTrees Length object assigned in __init__, so the count is tracked
    incrementally instead of recomputed here.
    """
    doc_count = len(self._docweight)
    return doc_count

def get_words(self, docid):
"""Return a list of the wordids for a given docid."""
Expand All @@ -104,6 +110,11 @@ def index_doc(self, docid, text):
self._mass_add_wordinfo(wid2weight, docid)
self._docweight[docid] = docweight
self._docwords[docid] = WidCode.encode(wids)
try:
self.document_count.change(1)
except AttributeError:
# Upgrade document_count to Length object
self.document_count = Length(self.document_count())
return len(wids)

# A subclass may wish to extend or override this. This is for adjusting
Expand Down Expand Up @@ -165,6 +176,11 @@ def unindex_doc(self, docid):
self._del_wordinfo(wid, docid)
del self._docwords[docid]
del self._docweight[docid]
try:
self.document_count.change(-1)
except AttributeError:
# Upgrade document_count to Length object
self.document_count = Length(self.document_count())

def search(self, term):
wids = self._lexicon.termToWordIds(term)
Expand Down
4 changes: 2 additions & 2 deletions CosineIndex.py
Expand Up @@ -69,7 +69,7 @@ def __init__(self, lexicon):
def _search_wids(self, wids):
if not wids:
return []
N = float(len(self._docweight))
N = float(self.document_count())
L = []
DictType = type({})
for wid in wids:
Expand All @@ -86,7 +86,7 @@ def query_weight(self, terms):
wids = []
for term in terms:
wids += self._lexicon.termToWordIds(term)
N = float(len(self._docweight))
N = float(self.document_count())
sum = 0.0
for wid in self._remove_oov_wids(wids):
wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)
Expand Down
10 changes: 8 additions & 2 deletions IIndex.py
Expand Up @@ -20,6 +20,9 @@ class IIndex(Interface.Base):
"""Interface for an Index."""

def length():
"""Return the number of words in the index."""

def document_count():
"""Return the number of documents in the index."""

def get_words(docid):
Expand Down Expand Up @@ -62,10 +65,13 @@ def query_weight(terms):
"""

def index_doc(docid, text):
"XXX"
"""Add a document with the specified id and text to the index. If a
document by that id already exists, replace its text with the new
text provided
"""

def unindex_doc(docid):
"XXX"
"""Remove the document with the specified id from the index"""

def has_doc(docid):
"""Returns true if docid is an id of a document in the index"""
27 changes: 15 additions & 12 deletions Lexicon.py
Expand Up @@ -16,6 +16,7 @@

from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree
from BTrees.Length import Length

import ZODB
from Persistence import Persistent
Expand All @@ -37,16 +38,13 @@ def __init__(self, *pipeline):
# we never saw before, and that isn't a known stopword (or otherwise
# filtered out). Returning a special wid value for OOV words is a
# way to let clients know when an OOV word appears.
self._nextwid = 1
self.length = Length()
self._pipeline = pipeline

# Keep some statistics about indexing
self._nbytes = 0 # Number of bytes indexed (at start of pipeline)
self._nwords = 0 # Number of words indexed (after pipeline)

def length(self):
"""Return the number of unique terms in the lexicon."""
return self._nextwid - 1
# Overridden in instances
return len(self._wids)

def words(self):
return self._wids.keys()
Expand All @@ -59,11 +57,15 @@ def items(self):

def sourceToWordIds(self, text):
last = _text2list(text)
for t in last:
self._nbytes += len(t)
for element in self._pipeline:
last = element.process(last)
self._nwords += len(last)
if not hasattr(self.length, 'change'):
# Make sure length is overridden with a BTrees.Length.Length
self.length = Length(self.length())
# Strategically unload the length value so that we get the most
# recent value written to the database to minimize conflicting wids
# XXX this will not work when MVCC is implemented in the ZODB...
self.length._p_deactivate()
return map(self._getWordIdCreate, last)

def termToWordIds(self, text):
Expand Down Expand Up @@ -138,9 +140,10 @@ def _getWordIdCreate(self, word):
return wid

def _new_wid(self):
wid = self._nextwid
self._nextwid += 1
return wid
self.length.change(1)
while self._words.has_key(self.length()): # just to be safe
self.length.change(1)
return self.length()

def _text2list(text):
# Helper: splitter input may be a string or a list of strings
Expand Down
36 changes: 28 additions & 8 deletions OkapiIndex.py
Expand Up @@ -18,6 +18,7 @@
# understand what's going on.

from BTrees.IIBTree import IIBucket
from BTrees.Length import Length

from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex.BaseIndex import BaseIndex, \
Expand Down Expand Up @@ -50,20 +51,29 @@ def __init__(self, lexicon):
# sum(self._docweight.values()), the total # of words in all docs
# This is a long for "better safe than sorry" reasons. It isn't
# used often enough that speed should matter.
self._totaldoclen = 0L
# Use a BTree.Length.Length object to avoid concurrent write conflicts
self._totaldoclen = Length(0L)

def index_doc(self, docid, text):
count = BaseIndex.index_doc(self, docid, text)
self._totaldoclen += count
self._change_doc_len(count)
return count

def _reindex_doc(self, docid, text):
self._totaldoclen -= self._docweight[docid]
self._change_doc_len(-self._docweight[docid])
return BaseIndex._reindex_doc(self, docid, text)

def unindex_doc(self, docid):
self._totaldoclen -= self._docweight[docid]
self._change_doc_len(-self._docweight[docid])
BaseIndex.unindex_doc(self, docid)

def _change_doc_len(self, delta):
# Change total doc length used for scoring
try:
self._totaldoclen.change(delta)
except AttributeError:
# Opportunistically upgrade _totaldoclen attribute to Length object
self._totaldoclen = Length(long(self._totaldoclen + delta))

# The workhorse. Return a list of (IIBucket, weight) pairs, one pair
# for each wid t in wids. The IIBucket, times the weight, maps D to
Expand All @@ -76,8 +86,13 @@ def unindex_doc(self, docid):
def _search_wids(self, wids):
if not wids:
return []
N = float(len(self._docweight)) # total # of docs
meandoclen = self._totaldoclen / N
N = float(self.document_count()) # total # of docs
try:
doclen = self._totaldoclen()
except TypeError:
# _totaldoclen has not yet been upgraded
doclen = self._totaldoclen
meandoclen = doclen / N
K1 = self.K1
B = self.B
K1_plus1 = K1 + 1.0
Expand Down Expand Up @@ -120,8 +135,13 @@ def _search_wids(self, wids):
def _search_wids(self, wids):
if not wids:
return []
N = float(len(self._docweight)) # total # of docs
meandoclen = self._totaldoclen / N
N = float(self.document_count()) # total # of docs
try:
doclen = self._totaldoclen()
except TypeError:
# _totaldoclen has not yet been upgraded
doclen = self._totaldoclen
meandoclen = doclen / N
#K1 = self.K1
#B = self.B
#K1_plus1 = K1 + 1.0
Expand Down
2 changes: 0 additions & 2 deletions ZCTextIndex.py
Expand Up @@ -173,13 +173,11 @@ def _index_object(self, docid, obj, threshold=None, attr=None):
if text is None:
return 0
count = self.index.index_doc(docid, text)
self._p_changed = 1 # XXX
return count

def unindex_object(self, docid):
if self.index.has_doc(docid):
self.index.unindex_doc(docid)
self._p_changed = 1 # XXX

def _apply_index(self, request, cid=''):
"""Apply query specified by request, a mapping containing the query.
Expand Down
2 changes: 0 additions & 2 deletions tests/mhindex.py
Expand Up @@ -441,8 +441,6 @@ def bulkupdate(self, args):
self.updatefolder(f, f.listmessages())
print "Total", len(self.docpaths)
self.commit()
print "Indexed", self.index.lexicon._nbytes, "bytes and",
print self.index.lexicon._nwords, "words;",
print len(self.index.lexicon._words), "unique words."

def updatefolder(self, f, msgs):
Expand Down

0 comments on commit 7105342

Please sign in to comment.