Merged TextIndexDS9-branch into trunk.

zopefoundation · May 14, 2002 · 232ee1a · 232ee1a
commit 232ee1a
Show file tree

Hide file tree

Showing 35 changed files with 4,054 additions and 0 deletions.
diff --git a/HTMLSplitter.py b/HTMLSplitter.py
@@ -0,0 +1,41 @@
+from Products.ZCTextIndex.ISplitter import ISplitter
+
+import re
+
+class HTMLSplitter:
+
+    __implements__ = ISplitter
+
+    def process(self, text):
+        return re.sub('<[^>]*>', ' ', text).split()
+
+class HTMLWordSplitter:
+
+    __implements__ = ISplitter
+
+    def process(self, text):
+        splat = []
+        for t in text:
+            splat += self.split(t)
+        return splat    
+
+    def split(self, text):    
+        text = text.lower()
+        remove = ["<[^>]*>",
+                  "&[A-Za-z]+;",
+                  "\W+"]
+        for pat in remove:
+            text = re.sub(pat, " ", text)
+        rx = re.compile("[A-Za-z]")
+        return [word for word in text.split()
+                if len(word) > 1 and rx.search(word)]
+
+if __name__ == "__main__":
+    import sys
+    splitter = HTMLWordSplitter()
+    for path in sys.argv[1:]:
+        f = open(path, "rb")
+        buf = f.read()
+        f.close()
+        print path
+        print splitter.process([buf])
diff --git a/IIndex.py b/IIndex.py
@@ -0,0 +1,58 @@
+##############################################################################
+#
+# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+"""Index Interface."""
+
+import Interface
+
+class IIndex(Interface.Base):
+    """Interface for an Index."""
+
+    def search(term):
+        """Execute a search on a single term given as a string.
+
+        Return an IIBucket.
+        """
+
+    def search_phrase(phrase):
+        """Execute a search on a phrase given as a string.
+
+        Return an IIBucket.
+        """
+
+    def search_glob(pattern):
+        """Execute a pattern search.
+
+        The pattern represents a set of words by using * and ?.  For
+        example, "foo*" represents the set of all words in the lexicon
+        starting with "foo".
+
+        NOTE: Currently only a single trailing * is supported.
+
+        Return an IIBucket.
+        """
+
+    def query_weight(terms):
+        """Return the weight for a set of query terms.
+
+        'terms' is a sequence of all terms included in the query,
+        excluding terms negated by NOT.  If a term appears more than
+        once in a query, it should appear more than once in terms.
+        """
+
+    def index_doc(docid, text):
+        "XXX"
+
+    def unindex_doc(docid):
+        "XXX"
diff --git a/ILexicon.py b/ILexicon.py
@@ -0,0 +1,51 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+from Interface import Base as Interface
+
+class ILexicon(Interface):
+    """Object responsible for converting text to word identifiers."""
+
+    def termToWordIds(text):
+        """Return a sequence of ids of the words parsed from the text.
+
+        The input text may be either a string or a list of strings.
+
+        Parses the text as if it consists of search terms, and skips words that
+        aren't in the lexicon.
+        """
+
+    def sourceToWordIds(text):
+        """Return a sequence of ids of the words parsed from the text.
+
+        The input text may be either a string or a list of strings.
+
+        Parses the text as if it comes from a source document, and creates
+        new word ids for words that aren't (yet) in the lexicon.
+        """
+
+    def globToWordIds(pattern):
+        """Return a sequence of ids of words matching the pattern.
+
+        The argument should be a single word using globbing syntax,
+        e.g. 'foo*' meaning anything starting with 'foo'.
+
+        NOTE: Currently only a single trailing * is supported.
+
+        Returns the wids for all words in the lexicon that match the
+        pattern.
+        """
+
+    def length():
+        """Return the number of unique terms in the lexicon."""
diff --git a/INBest.py b/INBest.py
@@ -0,0 +1,73 @@
+##############################################################################
+#
+# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+"""NBest Interface.
+
+An NBest object remembers the N best-scoring items ever passed to its
+.add(item, score) method.  If .add() is called M times, the worst-case
+number of comparisons performed overall is M * log2(N).
+"""
+
+
+import Interface
+
+class INBest(Interface.Base):
+    """Interface for an N-Best chooser."""
+
+    def add(item, score):
+        """Record that item 'item' has score 'score'.  No return value.
+
+        The N best-scoring items are remembered, where N was passed to
+        the constructor.  'item' can be anything.  'score' should be
+        a number, and larger numbers are considered better.
+        """
+
+    def addmany(sequence):
+        """Like "for item, score in sequence: self.add(item, score)".
+
+        This is simply faster than calling add() len(sequence) times.
+        """
+
+    def getbest():
+        """Return the (at most) N best-scoring items as a sequence.
+
+        The return value is a sequence of 2-tuples, (item, score), with
+        the largest score first.  If .add() has been called fewer than
+        N times, this sequence will contain fewer than N pairs.
+        """
+
+    def pop_smallest():
+        """Return and remove the (item, score) pair with lowest score.
+
+        If len(self) is 0, raise IndexError.
+
+        To be clearer, this is the lowest score among the N best-scoring
+        items seen so far.  This is most useful if the capacity of the NBest
+        object is never exceeded, in which case pop_smallest() allows
+        using the object as an ordinary smallest-in-first-out priority
+        queue.
+        """
+
+    def __len__():
+        """Return the number of (item, score) pairs currently known.
+
+        This is N (the value passed to the constructor), unless .add()
+        has been called fewer than N times.
+        """
+
+    def capacity():
+        """Return the maximum number of (item, score) pairs.
+
+        This is N (the value passed to the constructor).
+        """
diff --git a/IPipelineElement.py b/IPipelineElement.py
@@ -0,0 +1,23 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+from Interface import Base as Interface
+
+class IPipelineElement(Interface):
+
+    def process(source):
+        """Provide a text processing step.
+
+        Process a source sequence of words into a result sequence.
+        """
diff --git a/IQueryParser.py b/IQueryParser.py
@@ -0,0 +1,63 @@
+##############################################################################
+#
+# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+"""Query Parser Interface."""
+
+import Interface
+
+class IQueryParser(Interface.Base):
+    """Interface for Query Parsers."""
+
+    def parseQuery(query):
+        """Parse a query string.
+
+        Return a parse tree (which implements IQueryParseTree).
+
+        May raise ParseTree.ParseError.
+        """
+
+class IQueryParseTree(Interface.Base):
+    """Interface for parse trees returned by parseQuery()."""
+
+    def nodeType():
+        """Return the node type.
+
+        This is one of 'AND', 'OR', 'NOT', 'ATOM', 'PHRASE' or 'GLOB'.
+        """
+
+    def getValue():
+        """Return a node-type specific value.
+
+        For node type:    Return:
+        'AND'             a list of parse trees
+        'OR'              a list of parse trees
+        'NOT'             a parse tree
+        'ATOM'            a string (representing a single search term)
+        'PHRASE'          a string (representing a search phrase)
+        'GLOB'            a string (representing a pattern, e.g. "foo*")
+        """
+
+    def terms():
+        """Return a list of all terms in this node, excluding NOT subtrees."""
+
+    def executeQuery(index):
+        """Execute the query represented by this node against the index.
+
+        The index argument must implement the IIndex interface.
+
+        Return an IIBucket or IIBTree mapping document ids to scores
+        (higher scores mean better results).
+
+        May raise ParseTree.QueryError.
+        """
diff --git a/ISplitter.py b/ISplitter.py
@@ -0,0 +1,21 @@
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+
+from Interface import Base as Interface
+
+class ISplitter(Interface):
+    """A splitter."""
+
+    def process(text):
+        """Run the splitter over the input text, returning a list of terms."""