This repository has been archived by the owner on May 13, 2020. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merged TextIndexDS9-branch into trunk.
- Loading branch information
0 parents
commit 232ee1a
Showing
35 changed files
with
4,054 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
from Products.ZCTextIndex.ISplitter import ISplitter | ||
|
||
import re | ||
|
||
class HTMLSplitter:
    """Splitter that drops HTML tags and splits the rest on whitespace.

    Every ``<...>`` run is replaced by a single space before splitting,
    so a tag always acts as a word boundary.  Case, character entities
    and punctuation are left untouched.
    """

    __implements__ = ISplitter

    def process(self, text):
        """Return the whitespace-separated terms of 'text', tags removed.

        'text' may be a single string or a list/tuple of strings.  A
        single string yields exactly what it always did; a sequence
        yields the terms of each element, concatenated in order, which
        matches how HTMLWordSplitter.process() consumes its input.
        """
        if isinstance(text, (list, tuple)):
            chunks = text
        else:
            # A lone string is treated as a one-element sequence.
            chunks = [text]
        terms = []
        for chunk in chunks:
            terms.extend(re.sub(r'<[^>]*>', ' ', chunk).split())
        return terms
|
||
class HTMLWordSplitter:
    """Splitter that reduces HTML text to a list of lower-cased words."""

    __implements__ = ISplitter

    # Patterns whose matches are replaced by a space.  Order matters:
    # whole tags go first, then named character entities, and only then
    # runs of non-word characters (which would otherwise break up the
    # '<', '&' and ';' delimiters the first two patterns rely on).
    _remove = [re.compile(r"<[^>]*>"),
               re.compile(r"&[A-Za-z]+;"),
               re.compile(r"\W+")]

    # A candidate word is kept only if it contains at least one letter.
    _has_letter = re.compile(r"[A-Za-z]")

    def process(self, text):
        """Split every string in the sequence 'text' into words.

        Returns one flat list holding the words of each element, in
        order.
        """
        splat = []
        for t in text:
            splat.extend(self.split(t))
        return splat

    def split(self, text):
        """Return the list of words found in the single string 'text'.

        The text is lower-cased, then tags, named entities and
        non-word characters are blanked out.  Of the remaining
        whitespace-separated tokens, only those longer than one
        character that contain an ASCII letter are returned.
        """
        text = text.lower()
        for rx in self._remove:
            text = rx.sub(" ", text)
        return [word for word in text.split()
                if len(word) > 1 and self._has_letter.search(word)]
|
||
if __name__ == "__main__":
    # Ad-hoc command-line check: for every file named on the command
    # line, print its path followed by the words HTMLWordSplitter finds.
    import sys
    splitter = HTMLWordSplitter()
    for path in sys.argv[1:]:
        f = open(path, "rb")
        try:
            buf = f.read()
        finally:
            # Release the handle even if the read fails.
            f.close()
        # Single-argument print(...) produces the same output as the
        # print statement.
        print(path)
        print(splitter.process([buf]))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
############################################################################## | ||
# | ||
# Copyright (c) 2001, 2002 Zope Corporation and Contributors. | ||
# All Rights Reserved. | ||
# | ||
# This software is subject to the provisions of the Zope Public License, | ||
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution. | ||
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED | ||
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS | ||
# FOR A PARTICULAR PURPOSE. | ||
# | ||
############################################################################## | ||
|
||
"""Index Interface.""" | ||
|
||
import Interface | ||
|
||
class IIndex(Interface.Base):
    """Interface for an Index."""

    # Signatures omit 'self': they declare the calling contract, not an
    # implementation.

    def search(term):
        """Execute a search on a single term given as a string.

        Return an IIBucket.
        """

    def search_phrase(phrase):
        """Execute a search on a phrase given as a string.

        Return an IIBucket.
        """

    def search_glob(pattern):
        """Execute a pattern search.

        The pattern represents a set of words by using * and ?.  For
        example, "foo*" represents the set of all words in the lexicon
        starting with "foo".

        NOTE: Currently only a single trailing * is supported.

        Return an IIBucket.
        """

    def query_weight(terms):
        """Return the weight for a set of query terms.

        'terms' is a sequence of all terms included in the query,
        except for terms that are negated with NOT.  If a term appears
        more than once in a query, it should appear more than once in
        'terms'.
        """

    def index_doc(docid, text):
        """XXX Contract not documented yet.

        NOTE(review): presumably indexes 'text' under the document id
        'docid' -- confirm against an implementation.
        """

    def unindex_doc(docid):
        """XXX Contract not documented yet.

        NOTE(review): presumably removes the document 'docid' from the
        index -- confirm against an implementation.
        """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
############################################################################## | ||
# | ||
# Copyright (c) 2002 Zope Corporation and Contributors. | ||
# All Rights Reserved. | ||
# | ||
# This software is subject to the provisions of the Zope Public License, | ||
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution. | ||
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED | ||
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS | ||
# FOR A PARTICULAR PURPOSE | ||
# | ||
############################################################################## | ||
|
||
from Interface import Base as Interface | ||
|
||
class ILexicon(Interface):
    """Object responsible for converting text to word identifiers."""

    def termToWordIds(text):
        """Return a sequence of ids of the words parsed from the text.

        The input text may be either a string or a list of strings.

        Parses the text as if it consists of search terms, and skips
        words that aren't in the lexicon.
        """

    def sourceToWordIds(text):
        """Return a sequence of ids of the words parsed from the text.

        The input text may be either a string or a list of strings.

        Parses the text as if it comes from a source document, and
        creates new word ids for words that aren't (yet) in the lexicon.
        """

    def globToWordIds(pattern):
        """Return a sequence of ids of words matching the pattern.

        The argument should be a single word using globbing syntax,
        e.g. 'foo*' meaning anything starting with 'foo'.

        NOTE: Currently only a single trailing * is supported.

        Returns the wids for all words in the lexicon that match the
        pattern.
        """

    def length():
        """Return the number of unique terms in the lexicon."""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
############################################################################## | ||
# | ||
# Copyright (c) 2001, 2002 Zope Corporation and Contributors. | ||
# All Rights Reserved. | ||
# | ||
# This software is subject to the provisions of the Zope Public License, | ||
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution. | ||
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED | ||
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS | ||
# FOR A PARTICULAR PURPOSE. | ||
# | ||
############################################################################## | ||
|
||
"""NBest Interface. | ||
An NBest object remembers the N best-scoring items ever passed to its | ||
.add(item, score) method. If .add() is called M times, the worst-case | ||
number of comparisons performed overall is M * log2(N). | ||
""" | ||
|
||
|
||
import Interface | ||
|
||
class INBest(Interface.Base):
    """Interface for an N-Best chooser."""

    def add(item, score):
        """Record that item 'item' has score 'score'.  No return value.

        The N best-scoring items are remembered, where N was passed to
        the constructor.  'item' can be anything.  'score' should be
        a number, and larger numbers are considered better.
        """

    def addmany(sequence):
        """Like "for item, score in sequence: self.add(item, score)".

        This is simply faster than calling add() len(sequence) times.
        """

    def getbest():
        """Return the (at most) N best-scoring items as a sequence.

        The return value is a sequence of 2-tuples, (item, score), with
        the largest score first.  If .add() has been called fewer than
        N times, this sequence will contain fewer than N pairs.
        """

    def pop_smallest():
        """Return and remove the (item, score) pair with lowest score.

        If len(self) is 0, raise IndexError.

        To be clearer, this is the lowest score among the N best-scoring
        seen so far.  This is most useful if the capacity of the NBest
        object is never exceeded, in which case pop_smallest() allows
        using the object as an ordinary smallest-in-first-out priority
        queue.
        """

    def __len__():
        """Return the number of (item, score) pairs currently known.

        This is N (the value passed to the constructor), unless .add()
        has been called fewer than N times.
        """

    def capacity():
        """Return the maximum number of (item, score) pairs.

        This is N (the value passed to the constructor).
        """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
############################################################################## | ||
# | ||
# Copyright (c) 2002 Zope Corporation and Contributors. | ||
# All Rights Reserved. | ||
# | ||
# This software is subject to the provisions of the Zope Public License, | ||
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution. | ||
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED | ||
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS | ||
# FOR A PARTICULAR PURPOSE | ||
# | ||
############################################################################## | ||
|
||
from Interface import Base as Interface | ||
|
||
class IPipelineElement(Interface):
    """Interface for a single text processing step."""

    def process(source):
        """Provide a text processing step.

        Process a source sequence of words into a result sequence.
        """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
############################################################################## | ||
# | ||
# Copyright (c) 2001, 2002 Zope Corporation and Contributors. | ||
# All Rights Reserved. | ||
# | ||
# This software is subject to the provisions of the Zope Public License, | ||
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution. | ||
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED | ||
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS | ||
# FOR A PARTICULAR PURPOSE. | ||
# | ||
############################################################################## | ||
|
||
"""Query Parser Interface.""" | ||
|
||
import Interface | ||
|
||
class IQueryParser(Interface.Base):
    """Interface for Query Parsers."""

    def parseQuery(query):
        """Parse a query string.

        Return a parse tree (which implements IQueryParseTree).

        May raise ParseTree.ParseError.
        """
|
||
class IQueryParseTree(Interface.Base):
    """Interface for parse trees returned by parseQuery()."""

    def nodeType():
        """Return the node type.

        This is one of 'AND', 'OR', 'NOT', 'ATOM', 'PHRASE' or 'GLOB'.
        """

    def getValue():
        """Return a node-type specific value.

        For node type:    Return:
        'AND'             a list of parse trees
        'OR'              a list of parse trees
        'NOT'             a parse tree
        'ATOM'            a string (representing a single search term)
        'PHRASE'          a string (representing a search phrase)
        'GLOB'            a string (representing a pattern, e.g. "foo*")
        """

    def terms():
        """Return a list of all terms in this node, excluding NOT subtrees."""

    def executeQuery(index):
        """Execute the query represented by this node against the index.

        The index argument must implement the IIndex interface.

        Return an IIBucket or IIBTree mapping document ids to scores
        (higher scores mean better results).

        May raise ParseTree.QueryError.
        """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
############################################################################## | ||
# | ||
# Copyright (c) 2002 Zope Corporation and Contributors. | ||
# All Rights Reserved. | ||
# | ||
# This software is subject to the provisions of the Zope Public License, | ||
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution. | ||
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED | ||
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS | ||
# FOR A PARTICULAR PURPOSE | ||
# | ||
############################################################################## | ||
|
||
from Interface import Base as Interface | ||
|
||
class ISplitter(Interface):
    """A splitter."""

    def process(text):
        """Run the splitter over the input text, returning a list of terms.

        NOTE(review): this declaration does not say whether 'text' is a
        single string or a sequence of strings -- confirm against the
        implementations.
        """
Oops, something went wrong.