Skip to content
This repository has been archived by the owner on May 13, 2020. It is now read-only.

Commit

Permalink
Changed word id creation algorithm in Lexicon. Instead of relying on …
Browse files Browse the repository at this point in the history
…an increasing length counter, we use a number from a randomized range. This avoids conflict errors while adding new words in multiple parallel transactions. Inspired by code from ``enfold.fixes``.
  • Loading branch information
hannosch committed Oct 2, 2010
1 parent b6bc1c4 commit cb548d6
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 34 deletions.
5 changes: 5 additions & 0 deletions CHANGES.txt
Expand Up @@ -4,6 +4,11 @@ Changelog
2.13.1 (unreleased)
-------------------

- Changed word id creation algorithm in Lexicon. Instead of relying on an
increasing length counter, we use a number from a randomized range. This
avoids conflict errors while adding new words in multiple parallel
transactions. Inspired by code from ``enfold.fixes``.

- Lexicon: Added clear method.

- Lexicon: Removed BBB code for instances created with Zope < 2.6.2.
Expand Down
39 changes: 27 additions & 12 deletions src/Products/ZCTextIndex/Lexicon.py
Expand Up @@ -14,6 +14,7 @@
"""Lexicon.
"""

from random import randrange
import re

from BTrees.IOBTree import IOBTree
Expand All @@ -32,6 +33,9 @@ class Lexicon(Persistent):

implements(ILexicon)

_v_nextid = None
_wid_length_based = True # Flag to distinguish new and old lexica

def __init__(self, *pipeline):
    # Build an empty lexicon (word<->wid BTrees, Length counter) via
    # clear(), then remember the splitter/normalizer pipeline elements
    # in the order they should process incoming text.
    self.clear()
    self._pipeline = pipeline
Expand All @@ -40,6 +44,7 @@ def clear(self):
"""Empty the lexicon.
"""
self.length = Length()
self._wid_length_based = False
self._wids = OIBTree() # word -> wid
self._words = IOBTree() # wid -> word
# wid 0 is reserved for words that aren't in the lexicon (OOV -- out
Expand Down Expand Up @@ -67,11 +72,6 @@ def sourceToWordIds(self, text):
last = _text2list(text)
for element in self._pipeline:
last = element.process(last)
# Strategically unload the length value so that we get the most
# recent value written to the database to minimize conflicting wids
# Because length is independent, this will load the most
# recent value stored, regardless of whether MVCC is enabled
self.length._p_deactivate()
return map(self._getWordIdCreate, last)

def termToWordIds(self, text):
Expand Down Expand Up @@ -141,22 +141,37 @@ def globToWordIds(self, pattern):
def _getWordIdCreate(self, word):
    """Return the wid for ``word``, allocating a new one if needed.

    New wids are drawn from a randomized range (instead of the old
    length-based counter) so that parallel transactions adding
    different words are unlikely to pick the same wid, which would
    raise ZODB conflict errors.
    """
    wid = self._wids.get(word)
    if wid is None:
        # WidCode requires us to use at least 0x4000 as a base number.
        # The algorithm in versions before 2.13 used the length as a base
        # number. So we don't even try to generate numbers below the
        # length as they are likely all taken
        minimum = 0x4000
        if self._wid_length_based:
            minimum = max(self.length(), 0x4000)

        while True:
            if self._v_nextid is None:
                # Volatile per-connection cursor: pick a fresh random
                # starting point, then hand out consecutive ids from it.
                self._v_nextid = randrange(minimum, 0x10000000)

            wid = self._v_nextid
            self._v_nextid += 1

            if wid not in self._words:
                break

            # Candidate already taken: drop the cursor so the next
            # iteration restarts from a new random position.
            self._v_nextid = None

        self.length.change(1)
        self._wids[word] = wid
        self._words[wid] = word
    return wid

def _new_wid(self):
    """Allocate a new wid from the length counter (pre-2.13 algorithm).

    Superseded by the randomized scheme in _getWordIdCreate; kept for
    reference. Increments the Length counter until it lands on an id
    not already present in the wid->word mapping.
    """
    self.length.change(1)
    # `in` instead of the deprecated dict/BTree has_key(); identical
    # behavior on Python 2, and forward-compatible.
    while self.length() in self._words:  # just to be safe
        self.length.change(1)
    return self.length()

def _text2list(text):
# Helper: splitter input may be a string or a list of strings
try:
text + ""
except:
except Exception:
return text
else:
return [text]
Expand Down
10 changes: 0 additions & 10 deletions src/Products/ZCTextIndex/WidCode.py
Expand Up @@ -119,13 +119,3 @@ def _fill():
_encoding = tuple(_encoding)

_fill()

def test():
    # Exhaustive round-trip self-check: every wid up to 2**20 must
    # survive encode() / decode() unchanged. Prints progress every
    # 1000 ids (Python 2 print statement).
    for i in range(2**20):
        if i % 1000 == 0: print i
        wids = [i]
        code = encode(wids)
        assert decode(code) == wids, (wids, code, decode(code))

# Manual smoke test entry point; not run on import.
if __name__ == "__main__":
    test()
32 changes: 20 additions & 12 deletions src/Products/ZCTextIndex/tests/testLexicon.py
Expand Up @@ -20,7 +20,6 @@
import sys

import transaction
import ZODB


class StupidPipelineElement:
Expand Down Expand Up @@ -87,10 +86,8 @@ def test_interfaces(self):
verifyClass(ILexicon, self._getTargetClass())

def test_clear(self):
from Products.ZCTextIndex.Lexicon import Splitter

lexicon = self._makeOne()
wids = lexicon.sourceToWordIds('foo')
lexicon.sourceToWordIds('foo')
self.assertEqual(len(lexicon._wids), 1)
self.assertEqual(len(lexicon._words), 1)
self.assertEqual(lexicon.length(), 1)
Expand All @@ -105,15 +102,18 @@ def testSourceToWordIds(self):

lexicon = self._makeOne(Splitter())
wids = lexicon.sourceToWordIds('cats and dogs')
self.assertEqual(wids, [1, 2, 3])
self.assertEqual(len(wids), 3)
first = wids[0]
self.assertEqual(wids, [first, first+1, first+2])

def testTermToWordIds(self):
    from Products.ZCTextIndex.Lexicon import Splitter

    lexicon = self._makeOne(Splitter())
    lexicon.sourceToWordIds('cats and dogs')
    wids = lexicon.termToWordIds('dogs')
    # Wids are drawn from a randomized range now, so we cannot assert
    # a fixed value like [3]; only that exactly one known (non-OOV,
    # i.e. positive) wid came back.
    self.assertEqual(len(wids), 1)
    self.assert_(wids[0] > 0)

def testMissingTermToWordIds(self):
from Products.ZCTextIndex.Lexicon import Splitter
Expand All @@ -134,7 +134,8 @@ def process_post_glob(self, lst):
lexicon = self._makeOne(AddedSplitter())
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('dogs')
self.assertEqual(wids, [3])
self.assertEqual(len(wids), 1)
self.assert_(wids[0] > 0)

def testMissingTermToWordIdsWithProcess_post_glob(self):
"""This test is for added process_post_glob"""
Expand All @@ -156,7 +157,8 @@ def testOnePipelineElement(self):
StupidPipelineElement('dogs', 'fish'))
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('fish')
self.assertEqual(wids, [3])
self.assertEqual(len(wids), 1)
self.assert_(wids[0] > 0)

def testSplitterAdaptorFold(self):
from Products.ZCTextIndex.Lexicon import CaseNormalizer
Expand All @@ -165,15 +167,19 @@ def testSplitterAdaptorFold(self):
lexicon = self._makeOne(Splitter(), CaseNormalizer())
wids = lexicon.sourceToWordIds('CATS and dogs')
wids = lexicon.termToWordIds('cats and dogs')
self.assertEqual(wids, [1, 2, 3])
self.assertEqual(len(wids), 3)
first = wids[0]
self.assertEqual(wids, [first, first+1, first+2])

def testSplitterAdaptorNofold(self):
    from Products.ZCTextIndex.Lexicon import Splitter

    lexicon = self._makeOne(Splitter())
    lexicon.sourceToWordIds('CATS and dogs')
    wids = lexicon.termToWordIds('cats and dogs')
    # Without case folding 'cats' is unknown (OOV wid 0); 'and' and
    # 'dogs' got consecutive wids from the randomized base, so assert
    # relative to the second wid rather than fixed values.
    self.assertEqual(len(wids), 3)
    second = wids[1]
    self.assertEqual(wids, [0, second, second + 1])

def testTwoElementPipeline(self):
from Products.ZCTextIndex.Lexicon import Splitter
Expand All @@ -183,7 +189,8 @@ def testTwoElementPipeline(self):
WackyReversePipelineElement('fish'))
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('hsif')
self.assertEqual(wids, [1])
self.assertEqual(len(wids), 1)
self.assert_(wids[0] > 0)

def testThreeElementPipeline(self):
from Products.ZCTextIndex.Lexicon import Splitter
Expand All @@ -194,7 +201,8 @@ def testThreeElementPipeline(self):
WackyReversePipelineElement('fish'))
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('hsif')
self.assertEqual(wids, [2])
self.assertEqual(len(wids), 1)
self.assert_(wids[0] > 0)

def testSplitterLocaleAwareness(self):
import locale
Expand Down

0 comments on commit cb548d6

Please sign in to comment.