Skip to content
This repository has been archived by the owner on May 13, 2020. It is now read-only.

Commit

Permalink
Merged TextIndexDS9-branch into trunk.
Browse files Browse the repository at this point in the history
  • Loading branch information
gvanrossum committed May 14, 2002
0 parents commit 232ee1a
Show file tree
Hide file tree
Showing 35 changed files with 4,054 additions and 0 deletions.
41 changes: 41 additions & 0 deletions HTMLSplitter.py
@@ -0,0 +1,41 @@
from Products.ZCTextIndex.ISplitter import ISplitter

import re

class HTMLSplitter:
    """Splitter that drops HTML tags and splits on whitespace."""

    __implements__ = ISplitter

    def process(self, text):
        """Return the whitespace-separated pieces of text, tags removed.

        Every '<...>' run in the input string is replaced by a single
        space before the string is split on whitespace.
        """
        without_tags = re.sub('<[^>]*>', ' ', text)
        pieces = without_tags.split()
        return pieces

class HTMLWordSplitter:
    """Splitter that reduces HTML text to a list of lower-case words.

    process() takes a sequence of strings; each string is lower-cased,
    stripped of tags, named character entities and non-word characters,
    and split on whitespace.  Only words longer than one character that
    contain at least one ASCII letter are kept.
    """

    __implements__ = ISplitter

    # Patterns blanked out (replaced by a space), in this order, before
    # splitting: tags, named character entities, runs of non-word
    # characters.  Raw strings keep the backslash in \W from being read
    # as a string escape.  The patterns are compiled once, at class
    # creation, instead of on every split() call.
    _remove = (re.compile(r"<[^>]*>"),
               re.compile(r"&[A-Za-z]+;"),
               re.compile(r"\W+"))

    # A word must contain at least one ASCII letter to be kept.
    _has_letter = re.compile(r"[A-Za-z]")

    def process(self, text):
        """Split every string in the sequence 'text'.

        Return a single flat list holding the words of all the strings,
        in input order.
        """
        splat = []
        for t in text:
            splat += self.split(t)
        return splat

    def split(self, text):
        """Return the list of words found in the single string 'text'.

        The text is lower-cased first, so all returned words are
        lower-case.
        """
        text = text.lower()
        for pattern in self._remove:
            text = pattern.sub(" ", text)
        has_letter = self._has_letter.search
        return [word for word in text.split()
                if len(word) > 1 and has_letter(word)]

if __name__ == "__main__":
import sys
splitter = HTMLWordSplitter()
for path in sys.argv[1:]:
f = open(path, "rb")
buf = f.read()
f.close()
print path
print splitter.process([buf])
58 changes: 58 additions & 0 deletions IIndex.py
@@ -0,0 +1,58 @@
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################

"""Index Interface."""

import Interface

class IIndex(Interface.Base):
    """Interface for an Index."""

    def search(term):
        """Execute a search on a single term given as a string.

        Return an IIBucket.
        """

    def search_phrase(phrase):
        """Execute a search on a phrase given as a string.

        Return an IIBucket.
        """

    def search_glob(pattern):
        """Execute a pattern search.

        The pattern represents a set of words by using * and ?.  For
        example, "foo*" represents the set of all words in the lexicon
        starting with "foo".

        NOTE: Currently only a single trailing * is supported.

        Return an IIBucket.
        """

    def query_weight(terms):
        """Return the weight for a set of query terms.

        'terms' is a sequence of all terms included in the query,
        except for terms that are negated (appear under a NOT).  If a
        term appears more than once in a query, it should appear more
        than once in terms.
        """

    def index_doc(docid, text):
        """XXX Not documented in the original source.

        NOTE(review): presumably adds 'text' to the index under the
        document id 'docid' -- confirm against an implementation.
        """

    def unindex_doc(docid):
        """XXX Not documented in the original source.

        NOTE(review): presumably removes the document with id 'docid'
        from the index -- confirm against an implementation.
        """
51 changes: 51 additions & 0 deletions ILexicon.py
@@ -0,0 +1,51 @@
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################

from Interface import Base as Interface

class ILexicon(Interface):
    """Object responsible for converting text to word identifiers."""

    def termToWordIds(text):
        """Return a sequence of ids of the words parsed from the text.

        The input text may be either a string or a list of strings.

        Parses the text as if it consists of search terms, and skips
        words that aren't in the lexicon.
        """

    def sourceToWordIds(text):
        """Return a sequence of ids of the words parsed from the text.

        The input text may be either a string or a list of strings.

        Parses the text as if it comes from a source document, and
        creates new word ids for words that aren't (yet) in the lexicon.
        """

    def globToWordIds(pattern):
        """Return a sequence of ids of words matching the pattern.

        The argument should be a single word using globbing syntax,
        e.g. 'foo*' meaning anything starting with 'foo'.

        NOTE: Currently only a single trailing * is supported.

        Returns the wids for all words in the lexicon that match the
        pattern.
        """

    def length():
        """Return the number of unique terms in the lexicon."""
73 changes: 73 additions & 0 deletions INBest.py
@@ -0,0 +1,73 @@
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################

"""NBest Interface.
An NBest object remembers the N best-scoring items ever passed to its
.add(item, score) method. If .add() is called M times, the worst-case
number of comparisons performed overall is M * log2(N).
"""


import Interface

class INBest(Interface.Base):
    """Interface for an N-Best chooser."""

    def add(item, score):
        """Record that item 'item' has score 'score'.  No return value.

        The N best-scoring items are remembered, where N was passed to
        the constructor.  'item' can be anything.  'score' should be
        a number, and larger numbers are considered better.
        """

    def addmany(sequence):
        """Like "for item, score in sequence: self.add(item, score)".

        This is simply faster than calling add() len(seq) times.
        """

    def getbest():
        """Return the (at most) N best-scoring items as a sequence.

        The return value is a sequence of 2-tuples, (item, score), with
        the largest score first.  If .add() has been called fewer than
        N times, this sequence will contain fewer than N pairs.
        """

    def pop_smallest():
        """Return and remove the (item, score) pair with lowest score.

        If len(self) is 0, raise IndexError.

        To be clear, this is the lowest score among the N best-scoring
        seen so far.  This is most useful if the capacity of the NBest
        object is never exceeded, in which case pop_smallest() allows
        using the object as an ordinary smallest-in-first-out priority
        queue.
        """

    def __len__():
        """Return the number of (item, score) pairs currently known.

        This is N (the value passed to the constructor), unless .add()
        has been called fewer than N times.
        """

    def capacity():
        """Return the maximum number of (item, score) pairs.

        This is N (the value passed to the constructor).
        """
23 changes: 23 additions & 0 deletions IPipelineElement.py
@@ -0,0 +1,23 @@
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################

from Interface import Base as Interface

class IPipelineElement(Interface):
    """Interface for a single text processing step."""

    def process(source):
        """Provide a text processing step.

        Process a source sequence of words into a result sequence.
        """
63 changes: 63 additions & 0 deletions IQueryParser.py
@@ -0,0 +1,63 @@
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################

"""Query Parser Interface."""

import Interface

class IQueryParser(Interface.Base):
    """Interface for Query Parsers."""

    def parseQuery(query):
        """Parse a query string.

        Return a parse tree (which implements IQueryParseTree).

        May raise ParseTree.ParseError.
        """

class IQueryParseTree(Interface.Base):
    """Interface for parse trees returned by parseQuery()."""

    def nodeType():
        """Return the node type.

        This is one of 'AND', 'OR', 'NOT', 'ATOM', 'PHRASE' or 'GLOB'.
        """

    def getValue():
        """Return a node-type specific value.

        For node type:  Return:
        'AND'           a list of parse trees
        'OR'            a list of parse trees
        'NOT'           a parse tree
        'ATOM'          a string (representing a single search term)
        'PHRASE'        a string (representing a search phrase)
        'GLOB'          a string (representing a pattern, e.g. "foo*")
        """

    def terms():
        """Return a list of all terms in this node, excluding NOT subtrees."""

    def executeQuery(index):
        """Execute the query represented by this node against the index.

        The index argument must implement the IIndex interface.

        Return an IIBucket or IIBTree mapping document ids to scores
        (higher scores mean better results).

        May raise ParseTree.QueryError.
        """
21 changes: 21 additions & 0 deletions ISplitter.py
@@ -0,0 +1,21 @@
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################

from Interface import Base as Interface

class ISplitter(Interface):
    """Interface for a splitter, which turns input text into terms."""

    def process(text):
        """Run the splitter over the input text, returning a list of terms."""

0 comments on commit 232ee1a

Please sign in to comment.