Skip to content
This repository has been archived by the owner on May 13, 2020. It is now read-only.

Commit

Permalink
- converted ILexicon to z3 and bridged it back
Browse files Browse the repository at this point in the history
- ZCTextIndex now accepts lexicons with the z3 interface
  • Loading branch information
Unknown committed Oct 31, 2005
1 parent 5708522 commit af127c8
Show file tree
Hide file tree
Showing 6 changed files with 547 additions and 9 deletions.
28 changes: 28 additions & 0 deletions ILexicon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Lexicon z2 interfaces.
$Id$
"""


# Create a Zope 2 ILexicon interface by bridging the Zope 3 ILexicon
# declared in interfaces.py, and install it into this module (ILexicon.py)
# under the name 'ILexicon'.
from Interface.bridge import createZope3Bridge
from interfaces import ILexicon as z3ILexicon
import ILexicon

createZope3Bridge(z3ILexicon, ILexicon, 'ILexicon')

# Clean up the module namespace so only the bridged interface is exported.
del createZope3Bridge
del z3ILexicon
229 changes: 229 additions & 0 deletions Lexicon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Lexicon.
$Id$
"""

import re

from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree
from BTrees.Length import Length

import ZODB
from Persistence import Persistent
from zope.interface import implements

from Products.ZCTextIndex.StopDict import get_stopdict
from Products.ZCTextIndex.ParseTree import QueryError
from Products.ZCTextIndex.PipelineFactory import element_factory
from ILexicon import ILexicon as z2ILexicon
from interfaces import ILexicon


class Lexicon(Persistent):
    """Map words to integer word ids (wids) and back.

    Text is run through the pipeline elements given to the constructor
    (splitter, case normalizer, stop-word remover, ...) before words are
    looked up or assigned wids.  Declares both the Zope 2 and Zope 3
    flavors of the lexicon interface so either kind of consumer works.
    """

    __implements__ = z2ILexicon  # Zope 2 style interface declaration
    implements(ILexicon)         # Zope 3 style interface declaration

    def __init__(self, *pipeline):
        self._wids = OIBTree()   # word -> wid
        self._words = IOBTree()  # wid -> word
        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
        # of vocabulary). This can happen, e.g., if a query contains a word
        # we never saw before, and that isn't a known stopword (or otherwise
        # filtered out). Returning a special wid value for OOV words is a
        # way to let clients know when an OOV word appears.
        # NOTE: this Length instance shadows the length() method below on
        # new instances; the method remains the fallback for old instances.
        self.length = Length()
        self._pipeline = pipeline

    def length(self):
        """Return the number of unique terms in the lexicon."""
        # Overridden in instances by the BTrees.Length.Length object
        # assigned in __init__ (or lazily in sourceToWordIds).
        return len(self._wids)

    def words(self):
        # All known words (keys of the word -> wid mapping).
        return self._wids.keys()

    def wids(self):
        # All assigned word ids (keys of the wid -> word mapping).
        return self._words.keys()

    def items(self):
        # (word, wid) pairs for every word in the lexicon.
        return self._wids.items()

    def sourceToWordIds(self, text):
        """Return wids for the words in text, creating new wids as needed."""
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        if not hasattr(self.length, 'change'):
            # Make sure length is overridden with a BTrees.Length.Length
            self.length = Length(self.length())
        # Strategically unload the length value so that we get the most
        # recent value written to the database to minimize conflicting wids
        # Because length is independent, this will load the most
        # recent value stored, regardless of whether MVCC is enabled
        self.length._p_deactivate()
        return map(self._getWordIdCreate, last)

    def termToWordIds(self, text):
        """Return wids for the words in text; 0 marks out-of-vocabulary."""
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        wids = []
        for word in last:
            wids.append(self._wids.get(word, 0))
        return wids

    def parseTerms(self, text):
        """Run text through the pipeline, honoring glob-aware elements."""
        last = _text2list(text)
        for element in self._pipeline:
            # Prefer the glob-preserving hook so * and ? survive parsing.
            process = getattr(element, "processGlob", element.process)
            last = process(last)
        return last

    def isGlob(self, word):
        # A word is a glob pattern if it contains a shell wildcard.
        return "*" in word or "?" in word

    def get_word(self, wid):
        """Return the word for wid; raises KeyError if wid is unknown."""
        return self._words[wid]

    def get_wid(self, word):
        """Return the wid for word, or 0 if the word is not in the lexicon."""
        return self._wids.get(word, 0)

    def globToWordIds(self, pattern):
        """Return the wids of all words matching a shell-style glob pattern."""
        # Implement * and ? just as in the shell, except the pattern
        # must not start with either of these
        prefix = ""
        while pattern and pattern[0] not in "*?":
            prefix += pattern[0]
            pattern = pattern[1:]
        if not pattern:
            # There were no globbing characters in the pattern
            wid = self._wids.get(prefix, 0)
            if wid:
                return [wid]
            else:
                return []
        if not prefix:
            # The pattern starts with a globbing character.
            # Matching it would require scanning the whole lexicon, which
            # is too inefficient, so we raise an exception.
            raise QueryError(
                "pattern %r shouldn't start with glob character" % pattern)
        # Translate the glob pattern into an anchored regular expression.
        pat = prefix
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        pat += "$"
        prog = re.compile(pat)
        keys = self._wids.keys(prefix) # Keys starting at prefix
        wids = []
        for key in keys:
            if not key.startswith(prefix):
                # Past the last key sharing the literal prefix; stop early.
                break
            if prog.match(key):
                wids.append(self._wids[key])
        return wids

    def _getWordIdCreate(self, word):
        # Return the wid for word, assigning a fresh wid on first sight.
        wid = self._wids.get(word)
        if wid is None:
            wid = self._new_wid()
            self._wids[word] = wid
            self._words[wid] = word
        return wid

    def _new_wid(self):
        # Advance the conflict-reducing counter; skip any wid already in
        # use (possible after concurrent commits).
        self.length.change(1)
        while self._words.has_key(self.length()): # just to be safe
            self.length.change(1)
        return self.length()

def _text2list(text):
# Helper: splitter input may be a string or a list of strings
try:
text + ""
except:
return text
else:
return [text]

# Sample pipeline elements

class Splitter:
    """Pipeline element: split input strings into individual words.

    NOTE(review): the ``(?L)`` inline locale flag is only valid for str
    patterns on Python 2; Python 3 restricts re.LOCALE to bytes patterns,
    so this class would fail at import time there -- confirm before porting.
    """

    import re  # class-level import keeps ``re`` reachable as a class attr
    rx = re.compile(r"(?L)\w+")
    rxGlob = re.compile(r"(?L)\w+[\w*?]*") # See globToWordIds() above

    def process(self, lst):
        # Concatenate the words found in each input string, in order.
        result = []
        for s in lst:
            result += self.rx.findall(s)
        return result

    def processGlob(self, lst):
        # Like process(), but keeps trailing * and ? glob characters.
        result = []
        for s in lst:
            result += self.rxGlob.findall(s)
        return result

# Register so the splitter can be selected when constructing a lexicon.
element_factory.registerFactory('Word Splitter',
                                'Whitespace splitter',
                                Splitter)

class CaseNormalizer:
    """Pipeline element that lower-cases every word it receives."""

    def process(self, lst):
        # Build the normalized list explicitly rather than with a
        # comprehension; one word in, one lower-cased word out.
        lowered = []
        for word in lst:
            lowered.append(word.lower())
        return lowered

element_factory.registerFactory('Case Normalizer',
                                'Case Normalizer',
                                CaseNormalizer)

# A None factory registers the option of *not* removing stop words.
element_factory.registerFactory('Stop Words',
                                ' Don\'t remove stop words',
                                None)

class StopWordRemover:
    """Pipeline element that removes known stop words."""

    # word -> ignored mapping; copied so subclasses can extend it safely.
    dict = get_stopdict().copy()

    try:
        # Prefer the C implementation when the compiled extension exists.
        from Products.ZCTextIndex.stopper import process as _process
    except ImportError:
        # Pure-Python fallback: keep only words absent from the stop dict.
        def process(self, lst):
            has_key = self.dict.has_key
            return [w for w in lst if not has_key(w)]
    else:
        def process(self, lst):
            return self._process(self.dict, lst)

element_factory.registerFactory('Stop Words',
                                'Remove listed stop words only',
                                StopWordRemover)

class StopWordAndSingleCharRemover(StopWordRemover):
    """Stop-word remover that also drops all single-character words."""

    # Extend the stop dict with every single 8-bit character.
    # NOTE(review): range(255) stops at chr(254), so chr(255) is not
    # treated as a stop word -- looks like an off-by-one; confirm intent.
    dict = get_stopdict().copy()
    for c in range(255):
        dict[chr(c)] = None

element_factory.registerFactory('Stop Words',
                                'Remove listed and single char words',
                                StopWordAndSingleCharRemover)
15 changes: 9 additions & 6 deletions ZCTextIndex.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,18 @@
from Products.PluginIndexes.common import safe_callable
from Products.PluginIndexes.interfaces import IPluggableIndex

from Products.ZCTextIndex.ILexicon import ILexicon
from Products.ZCTextIndex.Lexicon import \
Lexicon, Splitter, CaseNormalizer, StopWordRemover
from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.QueryParser import QueryParser
from PipelineFactory import element_factory
from CosineIndex import CosineIndex
from ILexicon import ILexicon as z2ILexicon
from interfaces import ILexicon
from interfaces import IZCLexicon
from interfaces import IZCTextIndex
from OkapiIndex import OkapiIndex
from PipelineFactory import element_factory

from Products.ZCTextIndex.CosineIndex import CosineIndex
from Products.ZCTextIndex.OkapiIndex import OkapiIndex

index_types = {'Okapi BM25 Rank':OkapiIndex,
'Cosine Measure':CosineIndex}
Expand Down Expand Up @@ -89,7 +90,8 @@ def __init__(self, id, extra=None, caller=None, index_factory=None,
if lexicon is None:
raise LookupError, 'Lexicon "%s" not found' % escape(lexicon_id)

if not ILexicon.isImplementedBy(lexicon):
if not (ILexicon.providedBy(lexicon) or
z2ILexicon.isImplementedBy(lexicon)):
raise ValueError('Object "%s" does not implement '
'ZCTextIndex Lexicon interface'
% lexicon.getId())
Expand Down Expand Up @@ -134,7 +136,8 @@ def getLexicon(self):
return self._v_lexicon
except AttributeError:
lexicon = getattr(aq_parent(aq_inner(self)), self.lexicon_id)
if not ILexicon.isImplementedBy(lexicon):
if not (ILexicon.providedBy(lexicon) or
z2ILexicon.isImplementedBy(lexicon)):
raise TypeError('Object "%s" is not a ZCTextIndex Lexicon'
% repr(lexicon))
self._v_lexicon = lexicon
Expand Down
64 changes: 64 additions & 0 deletions interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,70 @@ class IZCTextIndex(Interface):
"""


class ILexicon(Interface):

    """Object responsible for converting text to word identifiers."""

    def termToWordIds(text):
        """Return a sequence of ids of the words parsed from the text.

        The input text may be either a string or a list of strings.

        Parse the text as if the strings are search terms, and skip
        words that aren't in the lexicon.
        """

    def sourceToWordIds(text):
        """Return a sequence of ids of the words parsed from the text.

        The input text may be either a string or a list of strings.

        Parse the text as if it comes from a source document, and
        create new word ids for words that aren't (yet) in the
        lexicon.
        """

    def globToWordIds(pattern):
        """Return a sequence of ids of words matching the pattern.

        The argument should be a single word using globbing syntax,
        e.g. 'foo*' meaning anything starting with 'foo'.

        Return the wids for all words in the lexicon that match the
        pattern.
        """

    def length():
        """Return the number of unique terms in the lexicon.
        """

    def get_word(wid):
        """Return the word for the given word id.

        Raise KeyError if the word id is not in the lexicon.
        """

    def get_wid(word):
        """Return the word id for the given word.

        Return 0 if the word is not in the lexicon.
        """

    def parseTerms(text):
        """Pass the text through the pipeline.

        Return a list of words, normalized by the pipeline
        (e.g. stopwords removed, case normalized etc.).
        """

    def isGlob(word):
        """Return true if the word is a globbing pattern.

        The word should be one of the words returned by parseTerms().
        """


class IZCLexicon(Interface):

"""Lexicon for ZCTextIndex.
Expand Down

0 comments on commit af127c8

Please sign in to comment.