Skip to content
This repository has been archived by the owner on May 13, 2020. It is now read-only.

Commit

Permalink
- converted ILexicon to z3 and bridged it back
Browse files Browse the repository at this point in the history
- ZCTextIndex now accepts lexicons with the z3 interface
  • Loading branch information
Unknown committed Oct 31, 2005
1 parent 5708522 commit af127c8
Show file tree
Hide file tree
Showing 6 changed files with 547 additions and 9 deletions.
28 changes: 28 additions & 0 deletions ILexicon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Lexicon z2 interfaces.
$Id$
"""


# Create a Zope 2 ILexicon interface by bridging the Zope 3 ILexicon
# declared in interfaces.py, and install it into this module (ILexicon.py)
# under the name 'ILexicon'.
from Interface.bridge import createZope3Bridge
from interfaces import ILexicon as z3ILexicon
import ILexicon

createZope3Bridge(z3ILexicon, ILexicon, 'ILexicon')

# Clean up the module namespace so only the bridged interface is exported.
del createZope3Bridge
del z3ILexicon
229 changes: 229 additions & 0 deletions Lexicon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Lexicon.
$Id$
"""

import re

from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree
from BTrees.Length import Length

import ZODB
from Persistence import Persistent
from zope.interface import implements

from Products.ZCTextIndex.StopDict import get_stopdict
from Products.ZCTextIndex.ParseTree import QueryError
from Products.ZCTextIndex.PipelineFactory import element_factory
from ILexicon import ILexicon as z2ILexicon
from interfaces import ILexicon


class Lexicon(Persistent):
    """Map words to integer word ids (wids) and back.

    Text is run through the pipeline elements given to the constructor
    (splitter, case normalizer, stop-word remover, ...) before words are
    looked up or assigned wids.  Declares both the Zope 2 and Zope 3
    flavors of the lexicon interface so either kind of consumer works.
    """

    __implements__ = z2ILexicon  # Zope 2 style interface declaration
    implements(ILexicon)         # Zope 3 style interface declaration

    def __init__(self, *pipeline):
        self._wids = OIBTree()   # word -> wid
        self._words = IOBTree()  # wid -> word
        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
        # of vocabulary). This can happen, e.g., if a query contains a word
        # we never saw before, and that isn't a known stopword (or otherwise
        # filtered out). Returning a special wid value for OOV words is a
        # way to let clients know when an OOV word appears.
        # NOTE: this Length instance shadows the length() method below on
        # new instances; the method remains the fallback for old instances.
        self.length = Length()
        self._pipeline = pipeline

    def length(self):
        """Return the number of unique terms in the lexicon."""
        # Overridden in instances by the BTrees.Length.Length object
        # assigned in __init__ (or lazily in sourceToWordIds).
        return len(self._wids)

    def words(self):
        # All known words (keys of the word -> wid mapping).
        return self._wids.keys()

    def wids(self):
        # All assigned word ids (keys of the wid -> word mapping).
        return self._words.keys()

    def items(self):
        # (word, wid) pairs for every word in the lexicon.
        return self._wids.items()

    def sourceToWordIds(self, text):
        """Return wids for the words in text, creating new wids as needed."""
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        if not hasattr(self.length, 'change'):
            # Make sure length is overridden with a BTrees.Length.Length
            self.length = Length(self.length())
        # Strategically unload the length value so that we get the most
        # recent value written to the database to minimize conflicting wids
        # Because length is independent, this will load the most
        # recent value stored, regardless of whether MVCC is enabled
        self.length._p_deactivate()
        return map(self._getWordIdCreate, last)

    def termToWordIds(self, text):
        """Return wids for the words in text; 0 marks out-of-vocabulary."""
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        wids = []
        for word in last:
            wids.append(self._wids.get(word, 0))
        return wids

    def parseTerms(self, text):
        """Run text through the pipeline, honoring glob-aware elements."""
        last = _text2list(text)
        for element in self._pipeline:
            # Prefer the glob-preserving hook so * and ? survive parsing.
            process = getattr(element, "processGlob", element.process)
            last = process(last)
        return last

    def isGlob(self, word):
        # A word is a glob pattern if it contains a shell wildcard.
        return "*" in word or "?" in word

    def get_word(self, wid):
        """Return the word for wid; raises KeyError if wid is unknown."""
        return self._words[wid]

    def get_wid(self, word):
        """Return the wid for word, or 0 if the word is not in the lexicon."""
        return self._wids.get(word, 0)

    def globToWordIds(self, pattern):
        """Return the wids of all words matching a shell-style glob pattern."""
        # Implement * and ? just as in the shell, except the pattern
        # must not start with either of these
        prefix = ""
        while pattern and pattern[0] not in "*?":
            prefix += pattern[0]
            pattern = pattern[1:]
        if not pattern:
            # There were no globbing characters in the pattern
            wid = self._wids.get(prefix, 0)
            if wid:
                return [wid]
            else:
                return []
        if not prefix:
            # The pattern starts with a globbing character.
            # Matching it would require scanning the whole lexicon, which
            # is too inefficient, so we raise an exception.
            raise QueryError(
                "pattern %r shouldn't start with glob character" % pattern)
        # Translate the glob pattern into an anchored regular expression.
        pat = prefix
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        pat += "$"
        prog = re.compile(pat)
        keys = self._wids.keys(prefix) # Keys starting at prefix
        wids = []
        for key in keys:
            if not key.startswith(prefix):
                # Past the last key sharing the literal prefix; stop early.
                break
            if prog.match(key):
                wids.append(self._wids[key])
        return wids

    def _getWordIdCreate(self, word):
        # Return the wid for word, assigning a fresh wid on first sight.
        wid = self._wids.get(word)
        if wid is None:
            wid = self._new_wid()
            self._wids[word] = wid
            self._words[wid] = word
        return wid

    def _new_wid(self):
        # Advance the conflict-reducing counter; skip any wid already in
        # use (possible after concurrent commits).
        self.length.change(1)
        while self._words.has_key(self.length()): # just to be safe
            self.length.change(1)
        return self.length()

def _text2list(text):
# Helper: splitter input may be a string or a list of strings
try:
text + ""
except:
return text
else:
return [text]

# Sample pipeline elements

class Splitter:
    """Pipeline element: split input strings into individual words.

    NOTE(review): the ``(?L)`` inline locale flag is only valid for str
    patterns on Python 2; Python 3 restricts re.LOCALE to bytes patterns,
    so this class would fail at import time there -- confirm before porting.
    """

    import re  # class-level import keeps ``re`` reachable as a class attr
    rx = re.compile(r"(?L)\w+")
    rxGlob = re.compile(r"(?L)\w+[\w*?]*") # See globToWordIds() above

    def process(self, lst):
        # Concatenate the words found in each input string, in order.
        result = []
        for s in lst:
            result += self.rx.findall(s)
        return result

    def processGlob(self, lst):
        # Like process(), but keeps trailing * and ? glob characters.
        result = []
        for s in lst:
            result += self.rxGlob.findall(s)
        return result

# Register so the splitter can be selected when constructing a lexicon.
element_factory.registerFactory('Word Splitter',
                                'Whitespace splitter',
                                Splitter)

class CaseNormalizer:
    """Pipeline element that lower-cases every word it receives."""

    def process(self, lst):
        # Build the normalized list explicitly rather than with a
        # comprehension; one word in, one lower-cased word out.
        lowered = []
        for word in lst:
            lowered.append(word.lower())
        return lowered

element_factory.registerFactory('Case Normalizer',
                                'Case Normalizer',
                                CaseNormalizer)

# A None factory registers the option of *not* removing stop words.
element_factory.registerFactory('Stop Words',
                                ' Don\'t remove stop words',
                                None)

class StopWordRemover:
    """Pipeline element that removes known stop words."""

    # word -> ignored mapping; copied so subclasses can extend it safely.
    dict = get_stopdict().copy()

    try:
        # Prefer the C implementation when the compiled extension exists.
        from Products.ZCTextIndex.stopper import process as _process
    except ImportError:
        # Pure-Python fallback: keep only words absent from the stop dict.
        def process(self, lst):
            has_key = self.dict.has_key
            return [w for w in lst if not has_key(w)]
    else:
        def process(self, lst):
            return self._process(self.dict, lst)

element_factory.registerFactory('Stop Words',
                                'Remove listed stop words only',
                                StopWordRemover)

class StopWordAndSingleCharRemover(StopWordRemover):
    """Stop-word remover that also drops all single-character words."""

    # Extend the stop dict with every single 8-bit character.
    # NOTE(review): range(255) stops at chr(254), so chr(255) is not
    # treated as a stop word -- looks like an off-by-one; confirm intent.
    dict = get_stopdict().copy()
    for c in range(255):
        dict[chr(c)] = None

element_factory.registerFactory('Stop Words',
                                'Remove listed and single char words',
                                StopWordAndSingleCharRemover)
15 changes: 9 additions & 6 deletions ZCTextIndex.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,18 @@
from Products.PluginIndexes.common import safe_callable
from Products.PluginIndexes.interfaces import IPluggableIndex

from Products.ZCTextIndex.ILexicon import ILexicon
from Products.ZCTextIndex.Lexicon import \
Lexicon, Splitter, CaseNormalizer, StopWordRemover
from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.QueryParser import QueryParser
from PipelineFactory import element_factory
from CosineIndex import CosineIndex
from ILexicon import ILexicon as z2ILexicon
from interfaces import ILexicon
from interfaces import IZCLexicon
from interfaces import IZCTextIndex
from OkapiIndex import OkapiIndex
from PipelineFactory import element_factory

from Products.ZCTextIndex.CosineIndex import CosineIndex
from Products.ZCTextIndex.OkapiIndex import OkapiIndex

index_types = {'Okapi BM25 Rank':OkapiIndex,
'Cosine Measure':CosineIndex}
Expand Down Expand Up @@ -89,7 +90,8 @@ def __init__(self, id, extra=None, caller=None, index_factory=None,
if lexicon is None:
raise LookupError, 'Lexicon "%s" not found' % escape(lexicon_id)

if not ILexicon.isImplementedBy(lexicon):
if not (ILexicon.providedBy(lexicon) or
z2ILexicon.isImplementedBy(lexicon)):
raise ValueError('Object "%s" does not implement '
'ZCTextIndex Lexicon interface'
% lexicon.getId())
Expand Down Expand Up @@ -134,7 +136,8 @@ def getLexicon(self):
return self._v_lexicon
except AttributeError:
lexicon = getattr(aq_parent(aq_inner(self)), self.lexicon_id)
if not ILexicon.isImplementedBy(lexicon):
if not (ILexicon.providedBy(lexicon) or
z2ILexicon.isImplementedBy(lexicon)):
raise TypeError('Object "%s" is not a ZCTextIndex Lexicon'
% repr(lexicon))
self._v_lexicon = lexicon
Expand Down
64 changes: 64 additions & 0 deletions interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,70 @@ class IZCTextIndex(Interface):
"""


class ILexicon(Interface):

    """Object responsible for converting text to word identifiers."""

    def termToWordIds(text):
        """Return a sequence of ids of the words parsed from the text.

        The input text may be either a string or a list of strings.

        Parse the text as if the strings are search terms, and skip
        words that aren't in the lexicon.
        """

    def sourceToWordIds(text):
        """Return a sequence of ids of the words parsed from the text.

        The input text may be either a string or a list of strings.

        Parse the text as if it comes from a source document, and
        create new word ids for words that aren't (yet) in the
        lexicon.
        """

    def globToWordIds(pattern):
        """Return a sequence of ids of words matching the pattern.

        The argument should be a single word using globbing syntax,
        e.g. 'foo*' meaning anything starting with 'foo'.

        Return the wids for all words in the lexicon that match the
        pattern.
        """

    def length():
        """Return the number of unique terms in the lexicon.
        """

    def get_word(wid):
        """Return the word for the given word id.

        Raise KeyError if the word id is not in the lexicon.
        """

    def get_wid(word):
        """Return the word id for the given word.

        Return 0 if the word is not in the lexicon.
        """

    def parseTerms(text):
        """Pass the text through the pipeline.

        Return a list of words, normalized by the pipeline
        (e.g. stopwords removed, case normalized etc.).
        """

    def isGlob(word):
        """Return true if the word is a globbing pattern.

        The word should be one of the words returned by parseTerms().
        """


class IZCLexicon(Interface):

"""Lexicon for ZCTextIndex.
Expand Down

0 comments on commit af127c8

Please sign in to comment.