Skip to content
This repository has been archived by the owner on May 13, 2020. It is now read-only.

Commit

Permalink
Keep some statistics about indexing: total number of bytes and words
Browse files Browse the repository at this point in the history
indexed (where the bytes are counted before entry into the pipeline,
and the words are counted after the pipeline is done).  To get the
numbers, use the _nbytes and _nwords instance variables directly.
  • Loading branch information
gvanrossum committed May 15, 2002
1 parent 9526a1b commit 22eb625
Showing 1 changed file with 7 additions and 0 deletions.
7 changes: 7 additions & 0 deletions Lexicon.py
Expand Up @@ -30,6 +30,10 @@ def __init__(self, *pipeline):
self._nextwid = 1
self._pipeline = pipeline

# Keep some statistics about indexing
self._nbytes = 0 # Number of bytes indexed (at start of pipeline)
self._nwords = 0 # Number of words indexed (after pipeline)

def length(self):
    """Return the number of unique terms in the lexicon."""
    # _nextwid is initialised to 1 in __init__, so the count of terms
    # is always one less than its current value.
    next_wid = self._nextwid
    return next_wid - 1
Expand All @@ -45,8 +49,11 @@ def items(self):

def sourceToWordIds(self, text):
    """Run ``text`` through the pipeline and return the resulting word ids.

    The input is first passed through ``_text2list``.  The length of
    every item it yields is added to ``self._nbytes`` (i.e. bytes are
    counted before the pipeline runs).  Each element of
    ``self._pipeline`` then has its ``process`` method applied in order,
    feeding the output of one into the next, and the number of items
    remaining afterwards is added to ``self._nwords``.

    Returns a list containing ``self._getWordIdCreate(word)`` for each
    item that came out of the pipeline, in order.
    NOTE(review): _getWordIdCreate presumably allocates an id for
    previously unseen words -- confirm against its definition.
    """
    last = _text2list(text)
    for t in last:
        self._nbytes += len(t)
    for element in self._pipeline:
        last = element.process(last)
    self._nwords += len(last)
    # Build the result eagerly.  ``map`` returns a list on Python 2 but a
    # lazy iterator on Python 3, which would defer the _getWordIdCreate
    # calls until the caller iterated and break callers expecting a list.
    get_wid = self._getWordIdCreate
    return [get_wid(word) for word in last]

def termToWordIds(self, text):
Expand Down

0 comments on commit 22eb625

Please sign in to comment.