Skip to content
This repository has been archived by the owner on May 13, 2020. It is now read-only.

Commit

Permalink
testDocUpdate(): Thanks to stop-word removal, there weren't actually
Browse files Browse the repository at this point in the history
*any* words in common across the versions.  Helped Will along by adding
a pragmatic comment to his "knocking indeed" rant.  Reworked to use
the inscrutable magic of dict.setdefault.
  • Loading branch information
Tim Peters committed May 17, 2002
1 parent 0307d26 commit 73c279b
Showing 1 changed file with 18 additions and 16 deletions.
34 changes: 18 additions & 16 deletions tests/testZCTextIndex.py
Expand Up @@ -32,11 +32,12 @@ def eq(scaled1, scaled2, epsilon=scaled_int(0.01)):
if abs(scaled1 - scaled2) > epsilon:
raise AssertionError, "%s != %s" % (scaled1, scaled2)

# a series of text chunks to use for the re-index tests
# A series of text chunks to use for the re-index tests (testDocUpdate).
text = [
"""Here's a knocking indeed! If a
man were porter of hell-gate, he should have
old turning the key.""",
old turning the key. knock (that made sure
sure there's at least one word in common)."""

"""Knock,
knock, knock! Who's there, i' the name of
Expand Down Expand Up @@ -96,26 +97,27 @@ def testStopWords(self):

def testDocUpdate(self):
docid = 1
stop = get_stopdict()
unique = {} # compute a set of unique words for each version
d = {} # find some common words
common = []
N = len(text)
stop = get_stopdict()

d = {} # word -> list of version numbers containing that word
for version, i in zip(text, range(N)):
# use a simple splitter rather than an official one
words = [w for w in re.split("\W+", version.lower())
if len(w) > 1 and not stop.has_key(w)]
# count occurrences of each word
word_seen = {}
for w in words:
l = d[w] = d.get(w, [])
l.append(i)
for k, v in d.items():
if len(v) == 1:
v = v[0]
l = unique[v] = unique.get(v, [])
l.append(k)
elif len(v) == N:
common.append(k)
if not word_seen.has_key(w):
d.setdefault(w, []).append(i)
word_seen[w] = 1

unique = {} # version number -> list of words unique to that version
common = [] # list of words common to all versions
for w, versionlist in d.items():
if len(versionlist) == 1:
unique.setdefault(versionlist[0], []).append(w)
elif len(versionlist) == N:
common.append(w)

for version, i in zip(text, range(N)):
doc = Indexable(version)
Expand Down

0 comments on commit 73c279b

Please sign in to comment.