Get rid of the unused HTMLSplitter class (it's too simple).

Add glob support to the HTMLWordSplitter class.
zopefoundation · May 22, 2002 · 4b55e62 · 4b55e62
1 parent e356217
commit 4b55e62
Showing 1 changed file with 9 additions and 16 deletions.
diff --git a/HTMLSplitter.py b/HTMLSplitter.py
@@ -17,33 +17,26 @@
 
 import re
 
-class HTMLSplitter:
-
-    __implements__ = ISplitter
-
-    def process(self, text):
-        return re.sub('<[^>]*>', ' ', text).split()
-
 class HTMLWordSplitter:
 
     __implements__ = ISplitter
 
-    def process(self, text):
+    def process(self, text, wordpat=r"\w+"):
         splat = []
         for t in text:
-            splat += self._split(t)
+            splat += self._split(t, wordpat)
         return splat
 
-    def _split(self, text):
+    def processGlob(self, text):
+        return self.process(text, r"\w+[\w*?]*") # see Lexicon.globToWordIds()
+
+    def _split(self, text, wordpat):
         text = text.lower()
-        remove = ["<[^>]*>",
-                  "&[A-Za-z]+;",
-                  "\W+"]
+        remove = [r"<[^<>]*>",
+                  r"&[A-Za-z]+;"]
         for pat in remove:
             text = re.sub(pat, " ", text)
-        rx = re.compile("[A-Za-z]")
-        return [word for word in text.split()
-                if len(word) > 1 and rx.search(word)]
+        return re.findall(wordpat, text)
 
 element_factory.registerFactory('Word Splitter', 
                                 'HTML aware splitter',