Skip to content
This repository has been archived by the owner on May 13, 2020. It is now read-only.

Commit

Permalink
Add a little splitter that behaves pretty much like HTMLWordSplitter,
Browse files Browse the repository at this point in the history
but works with a TextIndex Lexicon.
  • Loading branch information
Jeremy Hylton committed May 17, 2002
1 parent 77d01db commit 8bfe9b7
Showing 1 changed file with 18 additions and 2 deletions.
20 changes: 18 additions & 2 deletions tests/indexhtml.py
@@ -1,5 +1,4 @@
#! /usr/bin/env python

"""Index a collection of HTML files on the filesystem.
usage: indexhtml.py [options] dir
Expand All @@ -9,6 +8,7 @@
options:
-f data.fs -- the path to the filestorage datafile
"""
from __future__ import nested_scopes

import os
from time import clock
Expand All @@ -32,12 +32,28 @@ class Struct:
caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())
return ZCTextIndex("read", extra, caller)

# XXX Make a splitter that behaves more like HTMLWordSplitter but can be
# plugged into the old TextIndex lexicon.  The expected splitter signature is:
#     Splitter(string, stop_words, encoding,
#              singlechar, indexnumbers, casefolding)

class MySplitter:
    """Callable that splits text with an HTMLWordSplitter and applies a stop dict.

    Instances are called as ``splitter(text, stopdict, ...)``; any further
    positional or keyword arguments are accepted and ignored.
    """

    def __init__(self):
        # A single underlying splitter is created once and reused per call.
        self._v_splitter = HTMLWordSplitter()

    def __call__(self, text, stopdict, *args, **kwargs):
        """Return the words of *text* after translation through *stopdict*.

        Each word produced by the underlying splitter is looked up in
        *stopdict*: a word with no entry is kept unchanged, a word with an
        entry is replaced by the mapped value, and any result that is false
        (for example ``None`` or an empty string) is dropped.
        """
        tokens = self._v_splitter._split(text)
        translated = [stopdict.get(token, token) for token in tokens]
        return filter(None, translated)

def make_old_index():
    """Build an old-style TextIndex named "read" that uses MySplitter.

    A Lexicon is created from ``stop_word_dict``, its ``SplitterFunc``
    attribute is replaced with a ``MySplitter`` instance, and the lexicon is
    handed to the TextIndex constructor.

    Returns:
        The newly constructed TextIndex.
    """
    # Imported locally, as in the original, so the old TextIndex package is
    # only loaded when this index flavour is actually requested.
    from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
    from Products.PluginIndexes.TextIndex.Lexicon \
        import Lexicon, stop_word_dict

    # The previous version returned a TextIndex built on a default Lexicon
    # before reaching this point, which left the splitter override below
    # unreachable.  Build the lexicon first, install the splitter, then
    # create the index from it.
    lexicon = Lexicon(stop_word_dict)
    lexicon.SplitterFunc = MySplitter()
    return TextIndex("read", lexicon=lexicon)

def main(db, root, dir):
rt["index"] = index = INDEX()
Expand Down

0 comments on commit 8bfe9b7

Please sign in to comment.