Keep some statistics about indexing: total number of bytes and words indexed (where the bytes are counted before entry into the pipeline, and the words are counted after the pipeline is done). To get the numbers, use the _nbytes and _nwords instance variables directly.
zopefoundation · May 15, 2002 · 22eb625 · 22eb625
1 parent 9526a1b
commit 22eb625
Showing 1 changed file with 7 additions and 0 deletions.
diff --git a/Lexicon.py b/Lexicon.py
@@ -30,6 +30,10 @@ def __init__(self, *pipeline):
         self._nextwid = 1
         self._pipeline = pipeline
 
+        # Keep some statistics about indexing
+        self._nbytes = 0 # Number of bytes indexed (at start of pipeline)
+        self._nwords = 0 # Number of words indexed (after pipeline)
+
     def length(self):
         """Return the number of unique terms in the lexicon."""
         return self._nextwid - 1
@@ -45,8 +49,11 @@ def items(self):
 
     def sourceToWordIds(self, text):
         last = _text2list(text)
+        for t in last:
+            self._nbytes += len(t)
         for element in self._pipeline:
             last = element.process(last)
+        self._nwords += len(last)
         return map(self._getWordIdCreate, last)
 
     def termToWordIds(self, text):