Skip to content
This repository has been archived by the owner on May 13, 2020. It is now read-only.

Commit

Permalink
Get rid of the unused HTMLSplitter class (it's too simple).
Browse files Browse the repository at this point in the history
Add glob support to the HTMLWordSplitter class.
  • Loading branch information
gvanrossum committed May 22, 2002
1 parent e356217 commit 4b55e62
Showing 1 changed file with 9 additions and 16 deletions.
25 changes: 9 additions & 16 deletions HTMLSplitter.py
Expand Up @@ -17,33 +17,26 @@

import re

class HTMLSplitter:
    """Minimal splitter: strip HTML tags, then split on whitespace.

    Only ``<...>`` tags are removed.  Character entities, punctuation and
    letter case are left untouched, so the resulting tokens may still
    contain them.
    """

    __implements__ = ISplitter

    def process(self, text):
        """Return the whitespace-separated tokens of *text* without tags.

        Every ``<...>`` run is replaced by a single space before splitting,
        so words on either side of a tag are not glued together.
        """
        return re.sub(r'<[^>]*>', ' ', text).split()

class HTMLWordSplitter:
    """Splitter that strips HTML markup and extracts lower-cased words."""

    __implements__ = ISplitter

    def process(self, text, wordpat=r"\w+"):
        """Split every string in *text* into words.

        *text* is an iterable of strings; each one is handed to
        ``_split()`` and the per-string results are concatenated in order.
        *wordpat* is the regular expression that defines what counts as a
        word (default: a run of word characters).
        """
        splat = []
        for t in text:
            splat += self._split(t, wordpat)
        return splat

    def processGlob(self, text):
        """Split *text* like ``process()``, keeping glob wildcards.

        A word must start with at least one word character and may then
        contain ``*`` and ``?`` in addition to word characters.
        """
        return self.process(text, r"\w+[\w*?]*") # see Lexicon.globToWordIds()

    def _split(self, text, wordpat):
        """Return all matches of *wordpat* in the lower-cased, de-tagged *text*.

        Tags (``<...>``) and named character entities (``&name;``) are each
        replaced by a space first, so they separate words instead of
        joining them.  Numeric entities such as ``&#160;`` are not matched
        by the entity pattern and are therefore left for *wordpat* to
        handle.
        """
        text = text.lower()
        remove = [r"<[^<>]*>",
                  r"&[A-Za-z]+;"]
        for pat in remove:
            text = re.sub(pat, " ", text)
        return re.findall(wordpat, text)

element_factory.registerFactory('Word Splitter',
'HTML aware splitter',
Expand Down

0 comments on commit 4b55e62

Please sign in to comment.