This repository has been archived by the owner on May 13, 2020. It is now read-only.
/
testZCTextIndex.py
544 lines (468 loc) · 20.9 KB
/
testZCTextIndex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from Interface.Verify import verifyClass
import Acquisition
from zExceptions import NotFound
from Products.PluginIndexes.common.PluggableIndex import \
PluggableIndexInterface
from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex, PLexicon
from Products.ZCTextIndex.tests import \
testIndex, testQueryEngine, testQueryParser
from Products.ZCTextIndex.BaseIndex import \
scaled_int, SCALE_FACTOR, inverse_doc_frequency
from Products.ZCTextIndex.CosineIndex import CosineIndex
from Products.ZCTextIndex.OkapiIndex import OkapiIndex
from Products.ZCTextIndex.Lexicon import Splitter
from Products.ZCTextIndex.Lexicon import CaseNormalizer, StopWordRemover
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.StopDict import get_stopdict
from Products.ZCTextIndex.ParseTree import ParseError
import re
import unittest
class Indexable:
def __init__(self, text):
self.text = text
class Indexable2:
def __init__(self, text1, text2):
self.text1 = text1
self.text2 = text2
class LexiconHolder(Acquisition.Implicit):
def __init__(self, lexicon):
self.lexicon = lexicon
def getPhysicalPath(self):
return ('',) # Pretend to be the root
def dummyUnrestrictedTraverse(self, path):
if path == ('', 'lexicon',):
return self.lexicon
raise NotFound, path
# The tests classes below create a ZCTextIndex(). Then they create
# instance variables that point to the internal components used by
# ZCTextIndex. These tests run the individual module unit tests with
# the fully integrated ZCTextIndex.
def eq(scaled1, scaled2, epsilon=scaled_int(0.01)):
if abs(scaled1 - scaled2) > epsilon:
raise AssertionError, "%s != %s" % (scaled1, scaled2)
# A series of text chunks to use for the re-index tests (testDocUpdate).
text = [
"""Here's a knocking indeed! If a
man were porter of hell-gate, he should have
old turning the key. knock (that made sure
sure there's at least one word in common)."""
"""Knock,
knock, knock! Who's there, i' the name of
Beelzebub? Here's a farmer, that hanged
himself on the expectation of plenty: come in
time; have napkins enow about you; here
you'll sweat for't.""",
"""Knock,
knock! Who's there, in the other devil's
name? Faith, here's an equivocator, that could
swear in both the scales against either scale;
who committed treason enough for God's sake,
yet could not equivocate to heaven: O, come
in, equivocator.""",
"""Knock,
knock, knock! Who's there? Faith, here's an
English tailor come hither, for stealing out of
a French hose: come in, tailor; here you may
roast your goose.""",
"""Knock,
knock; never at quiet! What are you? But
this place is too cold for hell. I'll devil-porter
it no further: I had thought to have let in
some of all professions that go the primrose
way to the everlasting bonfire."""
]
# Subclasses should derive from one of testIndex.{CosineIndexTest,
# OkapiIndexTest} too.
class ZCIndexTestsBase:
def setUp(self):
self.lexicon = PLexicon('lexicon', '',
Splitter(),
CaseNormalizer(),
StopWordRemover())
caller = LexiconHolder(self.lexicon)
self.zc_index = ZCTextIndex('name',
None,
caller,
self.IndexFactory,
'text',
'lexicon')
self.index = self.zc_index.index
def parserFailure(self, query):
self.assertRaises(ParseError, self.zc_index.query, query)
def parserSuccess(self, query, n):
r, num = self.zc_index.query(query)
self.assertEqual(num, n)
if n:
self.assertEqual(r[0][0], 1)
def testMultipleAttributes(self):
lexicon = PLexicon('lexicon', '',
Splitter(),
CaseNormalizer(),
StopWordRemover())
caller = LexiconHolder(self.lexicon)
zc_index = ZCTextIndex('name',
None,
caller,
self.IndexFactory,
'text1,text2',
'lexicon')
doc = Indexable2('foo bar', 'alpha omega')
zc_index.index_object(1, doc)
nbest, total = zc_index.query('foo')
self.assertEqual(len(nbest), 1)
nbest, total = zc_index.query('foo alpha')
self.assertEqual(len(nbest), 1)
nbest, total = zc_index.query('foo alpha gamma')
self.assertEqual(len(nbest), 0)
def testStopWords(self):
# the only non-stopword is question
text = ("to be or not to be "
"that is the question")
doc = Indexable(text)
self.zc_index.index_object(1, doc)
for word in text.split():
if word != "question":
wids = self.lexicon.termToWordIds(word)
self.assertEqual(wids, [])
self.assertEqual(len(self.index.get_words(1)), 1)
self.parserSuccess('question', 1)
self.parserSuccess('question AND to AND be', 1)
self.parserSuccess('to AND question AND be', 1)
self.parserSuccess('question AND NOT gardenia', 1)
self.parserSuccess('question AND gardenia', 0)
self.parserSuccess('gardenia', 0)
self.parserSuccess('question OR gardenia', 1)
self.parserSuccess('question AND NOT to AND NOT be', 1)
self.parserSuccess('question OR to OR be', 1)
self.parserSuccess('question to be', 1)
self.parserFailure('to be')
self.parserFailure('to AND be')
self.parserFailure('to OR be')
self.parserFailure('to AND NOT be')
self.parserFailure('to AND NOT question')
self.parserFailure('to AND NOT gardenia')
def testDocUpdate(self):
docid = 1 # doesn't change -- we index the same doc repeatedly
N = len(text)
stop = get_stopdict()
d = {} # word -> list of version numbers containing that word
for version, i in zip(text, range(N)):
# use a simple splitter rather than an official one
words = [w for w in re.split("\W+", version.lower())
if len(w) > 1 and not stop.has_key(w)]
word_seen = {}
for w in words:
if not word_seen.has_key(w):
d.setdefault(w, []).append(i)
word_seen[w] = 1
unique = {} # version number -> list of words unique to that version
common = [] # list of words common to all versions
for w, versionlist in d.items():
if len(versionlist) == 1:
unique.setdefault(versionlist[0], []).append(w)
elif len(versionlist) == N:
common.append(w)
self.assert_(len(common) > 0)
self.assert_(len(unique) > 0)
for version, i in zip(text, range(N)):
doc = Indexable(version)
self.zc_index.index_object(docid, doc)
for w in common:
nbest, total = self.zc_index.query(w)
self.assertEqual(total, 1, "did not find %s" % w)
for k, v in unique.items():
if k == i:
continue
for w in v:
nbest, total = self.zc_index.query(w)
self.assertEqual(total, 0, "did not expect to find %s" % w)
class CosineIndexTests(ZCIndexTestsBase, testIndex.CosineIndexTest):
# A fairly involved test of the ranking calculations based on
# an example set of documents in queries in Managing
# Gigabytes, pp. 180-188. This test peeks into many internals of the
# cosine indexer.
def testInterface(self):
verifyClass(PluggableIndexInterface, ZCTextIndex)
def testRanking(self):
self.words = ["cold", "days", "eat", "hot", "lot", "nine", "old",
"pease", "porridge", "pot"]
self.docs = ["Pease porridge hot, pease porridge cold,",
"Pease porridge in the pot,",
"Nine days old.",
"In the pot cold, in the pot hot,",
"Pease porridge, pease porridge,",
"Eat the lot."]
self._ranking_index()
self._ranking_tf()
self._ranking_idf()
self._ranking_queries()
# A digression to exercise re-indexing.
docs = self.docs
for variant in "hot cold porridge python", "pease hot pithy":
self.zc_index.index_object(len(docs), Indexable(variant))
try:
self._ranking_tf()
except (AssertionError, KeyError):
pass
else:
self.fail("expected _ranking_tf() to fail -- reindex")
try:
self._ranking_idf()
except (AssertionError, KeyError):
pass
else:
self.fail("expected _ranking_idf() to fail -- reindex")
try:
self._ranking_queries()
except AssertionError:
pass
else:
self.fail("expected _ranking_queries() to fail -- reindex")
# This should leave things exactly as they were.
self.zc_index.index_object(len(docs), Indexable(docs[-1]))
self._ranking_tf()
self._ranking_idf()
self._ranking_queries()
def _ranking_index(self):
docs = self.docs
for i in range(len(docs)):
self.zc_index.index_object(i + 1, Indexable(docs[i]))
def _ranking_tf(self):
# matrix of term weights for the rows are docids
# and the columns are indexes into this list:
l_wdt = [(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0),
(0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0),
(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
(0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)]
l_Wd = [2.78, 1.73, 1.73, 2.21, 2.39, 1.41]
for i in range(len(l_Wd)):
docid = i + 1
scaled_Wd = scaled_int(l_Wd[i])
eq(scaled_Wd, self.index._get_Wd(docid))
wdts = [scaled_int(t) for t in l_wdt[i]]
for j in range(len(wdts)):
wdt = self.index._get_wdt(docid, self.words[j])
eq(wdts[j], wdt)
def _ranking_idf(self):
word_freqs = [2, 1, 1, 2, 1, 1, 1, 3, 3, 2]
idfs = [1.39, 1.95, 1.95, 1.39, 1.95, 1.95, 1.95, 1.10, 1.10, 1.39]
for i in range(len(self.words)):
word = self.words[i]
eq(word_freqs[i], self.index._get_ft(word))
eq(scaled_int(idfs[i]), self.index._get_wt(word))
def _ranking_queries(self):
queries = ["eat", "porridge", "hot OR porridge",
"eat OR nine OR day OR old OR porridge"]
wqs = [1.95, 1.10, 1.77, 3.55]
results = [[(6, 0.71)],
[(1, 0.61), (2, 0.58), (5, 0.71)],
[(1, 0.66), (2, 0.36), (4, 0.36), (5, 0.44)],
[(1, 0.19), (2, 0.18), (3, 0.63), (5, 0.22), (6, 0.39)]]
for i in range(len(queries)):
raw = queries[i]
q = QueryParser(self.lexicon).parseQuery(raw)
wq = self.index.query_weight(q.terms())
eq(wq, scaled_int(wqs[i]))
r, n = self.zc_index.query(raw)
self.assertEqual(len(r), len(results[i]))
# convert the results to a dict for each checking
d = {}
for doc, score in results[i]:
d[doc] = scaled_int(score)
for doc, score in r:
score = scaled_int(float(score / SCALE_FACTOR) / wq)
self.assert_(0 <= score <= SCALE_FACTOR)
eq(d[doc], score)
class OkapiIndexTests(ZCIndexTestsBase, testIndex.OkapiIndexTest):
# A white-box test.
def testAbsoluteScores(self):
docs = ["one",
"one two",
"one two three"]
for i in range(len(docs)):
self.zc_index.index_object(i + 1, Indexable(docs[i]))
self._checkAbsoluteScores()
# Exercise re-indexing.
for variant in "one xyz", "xyz two three", "abc def":
self.zc_index.index_object(len(docs), Indexable(variant))
try:
self._checkAbsoluteScores()
except AssertionError:
pass
else:
self.fail("expected _checkAbsoluteScores() to fail -- reindex")
# This should leave things exactly as they were.
self.zc_index.index_object(len(docs), Indexable(docs[-1]))
self._checkAbsoluteScores()
def _checkAbsoluteScores(self):
self.assertEqual(self.index._totaldoclen(), 6)
# So the mean doc length is 2. We use that later.
r, num = self.zc_index.query("one")
self.assertEqual(num, 3)
self.assertEqual(len(r), 3)
# Because our Okapi's B parameter is > 0, and "one" only appears
# once in each doc, the verbosity hypothesis favors shorter docs.
self.assertEqual([doc for doc, score in r], [1, 2, 3])
# The way the Okapi math works, a word that appears exactly once in
# an average (length) doc gets tf score 1. Our second doc has
# an average length, so its score should by 1 (tf) times the
# inverse doc frequency of "one". But "one" appears in every
# doc, so its IDF is log(1 + 3/3) = log(2).
self.assertEqual(r[1][1], scaled_int(inverse_doc_frequency(3, 3)))
# Similarly for "two".
r, num = self.zc_index.query("two")
self.assertEqual(num, 2)
self.assertEqual(len(r), 2)
self.assertEqual([doc for doc, score in r], [2, 3])
self.assertEqual(r[0][1], scaled_int(inverse_doc_frequency(2, 3)))
# And "three", except that doesn't appear in an average-size doc, so
# the math is much more involved.
r, num = self.zc_index.query("three")
self.assertEqual(num, 1)
self.assertEqual(len(r), 1)
self.assertEqual([doc for doc, score in r], [3])
idf = inverse_doc_frequency(1, 3)
meandoclen = 2.0
lengthweight = 1.0 - OkapiIndex.B + OkapiIndex.B * 3 / meandoclen
tf = (1.0 + OkapiIndex.K1) / (1.0 + OkapiIndex.K1 * lengthweight)
self.assertEqual(r[0][1], scaled_int(tf * idf))
# More of a black-box test, but based on insight into how Okapi is trying
# to think.
def testRelativeScores(self):
# Create 9 10-word docs.
# All contain one instance of "one".
# Doc #i contains i instances of "two" and 9-i of "xyz".
for i in range(1, 10):
doc = "one " + "two " * i + "xyz " * (9 - i)
self.zc_index.index_object(i, Indexable(doc))
self._checkRelativeScores()
# Exercise re-indexing.
self.zc_index.index_object(9, Indexable("two xyz"))
try:
self._checkRelativeScores()
except AssertionError:
pass
else:
self.fail("expected _checkRelativeScores() to fail after reindex")
# This should leave things exactly as they were.
self.zc_index.index_object(9, Indexable(doc))
self._checkRelativeScores()
def _checkRelativeScores(self):
r, num = self.zc_index.query("one two")
self.assertEqual(num, 9)
self.assertEqual(len(r), 9)
# The more twos in a doc, the better the score should be.
self.assertEqual([doc for doc, score in r], range(9, 0, -1))
# Search for "two" alone shouldn't make any difference to relative
# results.
r, num = self.zc_index.query("two")
self.assertEqual(num, 9)
self.assertEqual(len(r), 9)
self.assertEqual([doc for doc, score in r], range(9, 0, -1))
# Searching for xyz should skip doc 9, and favor the lower-numbered
# docs (they have more instances of xyz).
r, num = self.zc_index.query("xyz")
self.assertEqual(num, 8)
self.assertEqual(len(r), 8)
self.assertEqual([doc for doc, score in r], range(1, 9))
# And relative results shouldn't change if we add "one".
r, num = self.zc_index.query("xyz one")
self.assertEqual(num, 8)
self.assertEqual(len(r), 8)
self.assertEqual([doc for doc, score in r], range(1, 9))
# But if we search for all the words, it's much muddier. The boost
# in going from i instances to i+1 of a given word is smaller than
# the boost in going from i-1 to i, so the winner will be the one
# that balances the # of twos and xyzs best. But the test is nasty
# that way: doc 4 has 4 two and 5 xyz, while doc 5 has the reverse.
# However, xyz is missing from doc 9, so xyz has a larger idf than
# two has. Since all the doc lengths are the same, doc lengths don't
# matter. So doc 4 should win, and doc 5 should come in second.
# The loser will be the most unbalanced, but is that doc 1 (1 two 8
# xyz) or doc 8 (8 two 1 xyz)? Again xyz has a higher idf, so doc 1
# is more valuable, and doc 8 is the loser.
r, num = self.zc_index.query("xyz one two")
self.assertEqual(num, 8)
self.assertEqual(len(r), 8)
self.assertEqual(r[0][0], 4) # winner
self.assertEqual(r[1][0], 5) # runner up
self.assertEqual(r[-1][0], 8) # loser
self.assertEqual(r[-2][0], 1) # penultimate loser
# And nothing about the relative results in the last test should
# change if we leave "one" out of the search (it appears in all
# docs, so it's a wash).
r, num = self.zc_index.query("two xyz")
self.assertEqual(num, 8)
self.assertEqual(len(r), 8)
self.assertEqual(r[0][0], 4) # winner
self.assertEqual(r[1][0], 5) # runner up
self.assertEqual(r[-1][0], 8) # loser
self.assertEqual(r[-2][0], 1) # penultimate loser
############################################################################
# Subclasses of QueryTestsBase must set a class variable IndexFactory to
# the kind of index to be constructed.
class QueryTestsBase(testQueryEngine.TestQueryEngine,
testQueryParser.TestQueryParser):
# The FauxIndex in testQueryEngine contains four documents.
# docid 1: foo, bar, ham
# docid 2: bar, ham
# docid 3: foo, ham
# docid 4: ham
docs = ["foo bar ham", "bar ham", "foo ham", "ham"]
def setUp(self):
self.lexicon = PLexicon('lexicon', '',
Splitter(),
CaseNormalizer(),
StopWordRemover())
caller = LexiconHolder(self.lexicon)
self.zc_index = ZCTextIndex('name',
None,
caller,
self.IndexFactory,
'text',
'lexicon')
self.parser = QueryParser(self.lexicon)
self.index = self.zc_index.index
self.add_docs()
def add_docs(self):
for i in range(len(self.docs)):
text = self.docs[i]
obj = Indexable(text)
self.zc_index.index_object(i + 1, obj)
def compareSet(self, set, dict):
# XXX The FauxIndex and the real Index score documents very
# differently. The set comparison can't actually compare the
# items, but it can compare the keys. That will have to do for now.
setkeys = list(set.keys())
dictkeys = dict.keys()
setkeys.sort()
dictkeys.sort()
self.assertEqual(setkeys, dictkeys)
class CosineQueryTests(QueryTestsBase):
IndexFactory = CosineIndex
class OkapiQueryTests(QueryTestsBase):
IndexFactory = OkapiIndex
############################################################################
def test_suite():
s = unittest.TestSuite()
for klass in (CosineIndexTests, OkapiIndexTests,
CosineQueryTests, OkapiQueryTests):
s.addTest(unittest.makeSuite(klass))
return s
if __name__=='__main__':
unittest.main(defaultTest='test_suite')