Skip to content
This repository has been archived by the owner on May 13, 2020. It is now read-only.

Commit

Permalink
- Collector #1815: ZCTextIndex accepts (again) sequences of strings to
Browse files Browse the repository at this point in the history
        be indexed.
  • Loading branch information
zopyx committed Jul 4, 2005
1 parent eb8733f commit c1c6667
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 7 deletions.
3 changes: 3 additions & 0 deletions IIndex.py
Expand Up @@ -68,6 +68,9 @@ def index_doc(docid, text):
"""Add a document with the specified id and text to the index. If a
document by that id already exists, replace its text with the new
text provided.
text may be either a string (Unicode or otherwise) or a list
of strings from which to extract the terms under which to
index the source document.
"""

def unindex_doc(docid):
Expand Down
31 changes: 24 additions & 7 deletions ZCTextIndex.py
Expand Up @@ -152,7 +152,14 @@ def query(self, query, nbest=10):
## Pluggable Index APIs ##

def index_object(self, documentId, obj, threshold=None):
""" wrapper to handle indexing of multiple attributes """
"""Wrapper for index_doc() handling indexing of multiple attributes.
Enter the document with the specified documentId in the index
under the terms extracted from the indexed text attributes,
each of which should yield either a string or a list of
strings (Unicode or otherwise) to be passed to index_doc().
"""
# XXX We currently ignore subtransaction threshold

# needed for backward compatibility
try: fields = self._indexed_attrs
Expand All @@ -168,12 +175,22 @@ def index_object(self, documentId, obj, threshold=None):
text = text()
if text is None:
continue
all_texts.append(text)

if all_texts:
return self.index.index_doc(documentId, ' '.join(all_texts))
else:
return 0
# To index each attribute separately, we could use the
# following line, but we have preferred to make a single
# call to index_doc() for all attributes together.
# res += self.index.index_doc(documentId, text)
if text:
if isinstance(text, (list, tuple, )):
all_texts.extend(text)
else:
all_texts.append(text)

# Check that we're sending only strings
all_texts = filter(lambda text: isinstance(text, basestring), \
all_texts)
if all_texts:
return self.index.index_doc(documentId, all_texts)
return res

def unindex_object(self, docid):
if self.index.has_doc(docid):
Expand Down
23 changes: 23 additions & 0 deletions tests/testZCTextIndex.py
Expand Up @@ -151,6 +151,29 @@ def testMultipleAttributes(self):
nbest, total = zc_index.query('foo alpha gamma')
self.assertEqual(len(nbest), 0)

def testListAttributes(self):
    """Check that a list-of-strings attribute is indexed alongside a string.

    Builds a ZCTextIndex configured with the field spec 'text1,text2',
    indexes one Indexable2 constructed from a plain string and a list of
    two strings under document id 1, then checks the number of hits
    returned for three queries: a word from the list, words from both
    arguments, and a query including a word ('Tuesday') that appears in
    neither.
    """
    # NOTE(review): this local lexicon is constructed but never used; the
    # holder below wraps self.lexicon instead -- confirm which was intended.
    lexicon = PLexicon(
        'lexicon', '', Splitter(), CaseNormalizer(), StopWordRemover())

    holder = LexiconHolder(self.lexicon)
    index = ZCTextIndex(
        'name', None, holder, self.IndexFactory, 'text1,text2', 'lexicon')

    verses = [
        'Now is the winter of our discontent',
        'Made glorious summer by this sun of York',
    ]
    index.index_object(1, Indexable2('Hello Tim', verses))

    # (query string, expected number of matching documents)
    expectations = (
        ('glorious', 1),
        ('York Tim', 1),
        ('Tuesday Tim York', 0),
    )
    for terms, hit_count in expectations:
        nbest, total = index.query(terms)
        self.assertEqual(len(nbest), hit_count)

def testStopWords(self):
# the only non-stopword is question
text = ("to be or not to be "
Expand Down

0 comments on commit c1c6667

Please sign in to comment.