This repository has been archived by the owner on May 13, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 5
/
ZCTextIndex.py
346 lines (273 loc) · 12 KB
/
ZCTextIndex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Plug in text index for ZCatalog with relevance ranking."""
from cgi import escape
from types import TupleType
import ZODB
from Persistence import Persistent
import Acquisition
from Acquisition import aq_base, aq_inner, aq_parent
from OFS.SimpleItem import SimpleItem
from Globals import DTMLFile, InitializeClass
from AccessControl.SecurityInfo import ClassSecurityInfo
from AccessControl.Permissions import manage_zcatalog_indexes, search_zcatalog
from Products.PluginIndexes.common.PluggableIndex import \
PluggableIndexInterface
from Products.PluginIndexes.common.util import parseIndexRequest
from Products.ZCTextIndex.ILexicon import ILexicon
from Products.ZCTextIndex.Lexicon import \
Lexicon, Splitter, CaseNormalizer, StopWordRemover
from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.QueryParser import QueryParser
from PipelineFactory import element_factory
from Products.ZCTextIndex.CosineIndex import CosineIndex
from Products.ZCTextIndex.OkapiIndex import OkapiIndex
# Registry of the available index implementations, keyed by the type name
# that ZCTextIndex.__init__ reads from ``extra.index_type`` when no explicit
# index_factory is supplied.
index_types = {'Okapi BM25 Rank':OkapiIndex,
'Cosine Measure':CosineIndex}
class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
    """Text index plug-in for ZCatalog.

    The actual indexing work is delegated to ``self.index``, an object
    built by ``self._index_factory`` from the lexicon.  The lexicon itself
    is not stored on the index; only its id is (``self.lexicon_id``) and
    it is looked up on the acquisition parent on demand (see getLexicon).
    """

    __implements__ = PluggableIndexInterface

    ## Magic class attributes ##

    meta_type = 'ZCTextIndex'

    manage_options = (
        {'label': 'Overview', 'action': 'manage_main'},
    )

    query_options = ['query']

    security = ClassSecurityInfo()
    security.declareObjectProtected(manage_zcatalog_indexes)

    ## Constructor ##

    def __init__(self, id, extra=None, caller=None, index_factory=None,
                 field_name=None, lexicon_id=None):
        """Set up the index and bind it to an existing lexicon.

        Arguments can be passed directly or via the "extra" record:

        id -- id of the index; also the indexed attribute name when
              neither field_name nor extra.doc_attr is given.
        extra -- optional record that may carry doc_attr, lexicon_id and
              index_type attributes.
        caller -- object on which the lexicon is looked up by id.
        index_factory -- callable taking the lexicon and returning the
              underlying index; when None, extra.index_type selects one
              of the entries of index_types.
        field_name -- name of the attribute to index.
        lexicon_id -- id of the lexicon to use.

        Raises LookupError when the lexicon cannot be found on caller,
        and ValueError when the object found is not a lexicon or when
        the requested index type is unknown.
        """
        self.id = id

        self._fieldname = field_name or getattr(extra, 'doc_attr', '') or id

        # "extra" defaults to None, so never dereference it blindly: a
        # missing lexicon id must end in the LookupError below rather
        # than in an AttributeError on None.
        lexicon_id = lexicon_id or getattr(extra, 'lexicon_id', None) or ''
        lexicon = getattr(caller, lexicon_id, None)

        if lexicon is None:
            raise LookupError('Lexicon "%s" not found' % escape(lexicon_id))

        if not ILexicon.isImplementedBy(lexicon):
            raise ValueError('Object "%s" does not implement '
                             'ZCTextIndex Lexicon interface'
                             % lexicon.getId())

        self.lexicon_id = lexicon.getId()
        # Volatile cache; getLexicon() re-resolves it when it is missing.
        self._v_lexicon = lexicon

        if index_factory is None:
            # Same reasoning as above: without "extra" there is no index
            # type, which is reported as an invalid (empty) type.
            index_type = getattr(extra, 'index_type', '')
            try:
                factory = index_types[index_type]
            except KeyError:
                raise ValueError('Invalid index type "%s"'
                                 % escape(index_type))
            self._index_factory = factory
            self._index_type = index_type
        else:
            self._index_factory = index_factory

        self.index = self._index_factory(self.getLexicon())

    ## Private Methods ##

    security.declarePrivate('getLexicon')

    def getLexicon(self):
        """Get the lexicon for this index

        Older instances are migrated on the fly so that only
        ``lexicon_id`` remains.  The lexicon is then looked up by that id
        on the acquisition parent and cached in ``_v_lexicon``.

        Raises TypeError when the object found under ``lexicon_id`` does
        not implement ILexicon.
        """
        if hasattr(aq_base(self), 'lexicon'):
            # Fix up old ZCTextIndexes by removing direct lexicon ref
            # and changing it to an ID
            lexicon = getattr(aq_parent(aq_inner(self)), self.lexicon.getId())
            self.lexicon_id = lexicon.getId()
            del self.lexicon

        if getattr(aq_base(self), 'lexicon_path', None):
            # Fix up slightly less old ZCTextIndexes by removing
            # the physical path and changing it to an ID.
            # There's no need to use a physical path, which otherwise
            # makes it difficult to move or rename ZCatalogs.
            self.lexicon_id = self.lexicon_path[-1]
            del self.lexicon_path

        try:
            return self._v_lexicon
        except AttributeError:
            lexicon = getattr(aq_parent(aq_inner(self)), self.lexicon_id)
            if not ILexicon.isImplementedBy(lexicon):
                raise TypeError('Object "%s" is not a ZCTextIndex Lexicon'
                                % repr(lexicon))
            self._v_lexicon = lexicon
            return lexicon

    ## External methods not in the Pluggable Index API ##

    security.declareProtected(search_zcatalog, 'query')

    def query(self, query, nbest=10):
        """Return pair (best results, num results).

        The best results are whatever NBest(nbest).getbest() yields after
        being fed the (docid, score) items of the query result; an empty
        list is returned when the query produces no result object.

        The num results is the total number of results before trimming
        to the nbest results.
        """
        tree = QueryParser(self.getLexicon()).parseQuery(query)
        results = tree.executeQuery(self.index)
        if results is None:
            return [], 0
        chooser = NBest(nbest)
        chooser.addmany(results.items())
        return chooser.getbest(), len(results)

    ## Pluggable Index APIs ##

    # index_object: index the text found on obj under the configured
    # field name for document id docid.  The attribute may be a plain
    # value or a callable returning the text.  Returns the value of
    # index.index_doc(), or 0 when obj has no such attribute (or it is
    # None).
    # NOTE(review): documented with comments rather than a docstring on
    # purpose -- Zope's publisher traditionally treats a docstring as an
    # opt-in for publishing; confirm before adding one.
    def index_object(self, docid, obj, threshold=None):
        # XXX We currently ignore subtransaction threshold
        text = getattr(obj, self._fieldname, None)
        if text is None:
            return 0
        if callable(text):
            text = text()
        count = self.index.index_doc(docid, text)
        self._p_changed = 1 # XXX
        return count

    # unindex_object: remove docid from the underlying index if it is
    # present; unknown docids are silently ignored.
    # NOTE(review): kept docstring-free for the same reason as
    # index_object.
    def unindex_object(self, docid):
        if self.index.has_doc(docid):
            self.index.unindex_doc(docid)
            self._p_changed = 1 # XXX

    def _apply_index(self, request, cid=''):
        """Apply query specified by request, a mapping containing the query.

        Returns two objects on success: the resultSet containing the
        matching record numbers and a tuple containing the names of
        the fields used.

        Returns None if request is not valid for this index, i.e. when
        it carries no query for this index id or the query is empty.
        """
        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None:
            return None
        # Several query fragments are combined into one query string.
        query_str = ' '.join(record.keys)
        if not query_str:
            return None
        tree = QueryParser(self.getLexicon()).parseQuery(query_str)
        results = tree.executeQuery(self.index)
        return results, (self._fieldname,)

    def getEntryForObject(self, documentId, default=None):
        """Return the list of words indexed for documentId

        Returns default when the index raises KeyError for documentId.
        """
        try:
            word_ids = self.index.get_words(documentId)
        except KeyError:
            return default
        get_word = self.getLexicon().get_word
        return [get_word(wid) for wid in word_ids]

    # uniqueValues: part of the pluggable index API but not supported by
    # this index; always raises NotImplementedError.
    def uniqueValues(self, name=None, withLengths=0):
        raise NotImplementedError

    ## The ZCatalog Index management screen uses these methods ##

    def numObjects(self):
        """Return number of unique words in the index"""
        return self.index.length()

    def clear(self):
        """reinitialize the index (but not the lexicon)"""
        try:
            # Remove the cached reference to the lexicon
            # So that it is refreshed
            del self._v_lexicon
        except (AttributeError, KeyError):
            pass
        self.index = self._index_factory(self.getLexicon())

    ## User Interface Methods ##

    manage_main = DTMLFile('dtml/manageZCTextIndex', globals())

    def getIndexType(self):
        """Return index type string

        Falls back to the factory's __name__ for instances created with
        an explicit index_factory, which have no _index_type attribute.
        """
        return getattr(self, '_index_type', self._index_factory.__name__)

    def getFieldName(self):
        """Return indexed attribute name"""
        return self._fieldname

    def getLexiconURL(self):
        """Return the url of the lexicon used by the index

        Returns None when looking up the lexicon raises KeyError or
        AttributeError.
        """
        try:
            lex = self.getLexicon()
        except (KeyError, AttributeError):
            return None
        else:
            return lex.absolute_url()
# Hand the finished class to Globals.InitializeClass.
# NOTE(review): presumably this is what applies the ``security``
# declarations made in the class body -- confirm against Globals.
InitializeClass(ZCTextIndex)
def manage_addZCTextIndex(self, id, extra=None, REQUEST=None,
                          RESPONSE=None):
    """Add a text index

    Delegates to self.manage_addIndex with the 'ZCTextIndex' type.  The
    last argument passed on is REQUEST.URL3, or None when the function is
    called without a request (e.g. from code rather than from a form).
    """
    # REQUEST is optional, so only read URL3 when a request was given.
    if REQUEST is None:
        url = None
    else:
        url = REQUEST.URL3
    return self.manage_addIndex(id, 'ZCTextIndex', extra,
                                REQUEST, RESPONSE, url)
# DTML-backed forms, one per object type this module can add.
# NOTE(review): presumably rendered as the "add" screens for
# manage_addZCTextIndex and manage_addLexicon -- confirm against the
# product's registration code.
manage_addZCTextIndexForm = DTMLFile('dtml/addZCTextIndex', globals())
manage_addLexiconForm = DTMLFile('dtml/addLexicon', globals())
def manage_addLexicon(self, id, title='', elements=[], REQUEST=None):
    """Add ZCTextIndex Lexicon

    Builds the lexicon pipeline from the element records, stores the new
    lexicon on self under id and, when called with a REQUEST, returns
    the rendered manage_main screen.
    """
    pipeline = []
    for record in elements:
        # Records that only specify an element group carry no name.
        if not hasattr(record, 'name'):
            continue

        group = record.group
        element = element_factory.instantiate(group, record.name)
        if element is None:
            continue

        if group == 'Word Splitter':
            # Hardcoded, but it is a simple way to make sure the
            # splitter element comes first in the pipeline.
            pipeline.insert(0, element)
        else:
            pipeline.append(element)

    self._setObject(id, PLexicon(id, title, *pipeline))

    if REQUEST is None:
        return None
    return self.manage_main(self, REQUEST, update_menu=1)
# I am borrowing the existing vocabulary permissions for now to avoid
# adding new permissions. This may change when old-style Vocabs go away.
# LexiconQueryPerm protects PLexicon objects and their queryLexicon
# screen; LexiconMgmtPerm protects PLexicon.manage_main.
LexiconQueryPerm = 'Query Vocabulary'
LexiconMgmtPerm = 'Manage Vocabulary'
class PLexicon(Lexicon, Acquisition.Implicit, SimpleItem):
    """Lexicon for ZCTextIndex

    Adds an id, a title and management/query screens on top of Lexicon.
    """

    meta_type = 'ZCTextIndex Lexicon'

    manage_options = (
        {'label': 'Overview', 'action': 'manage_main'},
        {'label': 'Query', 'action': 'queryLexicon'},
    ) + SimpleItem.manage_options

    security = ClassSecurityInfo()
    security.declareObjectProtected(LexiconQueryPerm)

    # id and title are coerced to str; the pipeline elements are handed
    # on unchanged to the inherited constructor.
    def __init__(self, id, title='', *pipeline):
        self.id = str(id)
        self.title = str(title)
        inherited_init = PLexicon.inheritedAttribute('__init__')
        inherited_init(self, *pipeline)

    ## User Interface Methods ##

    def getPipelineNames(self):
        """Return list of names of pipeline element classes"""
        names = []
        for element in self._pipeline:
            names.append(element.__class__.__name__)
        return names

    _queryLexicon = DTMLFile('dtml/queryLexicon', globals())

    security.declareProtected(LexiconQueryPerm, 'queryLexicon')

    def queryLexicon(self, REQUEST, words=None, page=0, rows=20, cols=4):
        """Lexicon browser/query user interface

        With words given, each one is expanded through globToWordIds and
        the matching words are shown; otherwise all words are shown.
        The words are paged (rows * cols per page) and the current page
        is cut into columns of at most rows entries each.
        """
        if not words:
            words = self.words()
        else:
            matched_wids = []
            for pattern in words:
                matched_wids.extend(self.globToWordIds(pattern))
            words = [self.get_word(wid) for wid in matched_wids]

        word_count = len(words)

        # Clamp the layout to 1..500 rows and 1..12 columns.
        rows = max(min(rows, 500), 1)
        cols = max(min(cols, 12), 1)
        per_page = rows * cols

        # Number of pages, rounding a partial last page up.
        page_count, leftover = divmod(word_count, per_page)
        if leftover > 0:
            page_count = page_count + 1

        # Keep the requested page within the existing pages.
        page = max(min(page, page_count - 1), 0)
        start = per_page * page
        end = min(per_page * (page + 1), word_count)

        if not word_count:
            words = []
        else:
            words = list(words[start:end])

        # Cut the page into consecutive runs of ``rows`` words.
        columns = []
        offset = 0
        shown = len(words)
        while offset < shown:
            columns.append(words[offset:offset + rows])
            offset = offset + rows

        options = {
            'page': page,
            'rows': rows,
            'cols': cols,
            'start_word': start + 1,
            'end_word': end,
            'word_count': word_count,
            'page_count': page_count,
            'page_range': xrange(page_count),
            'page_columns': columns,
        }
        return self._queryLexicon(self, REQUEST, **options)

    security.declareProtected(LexiconMgmtPerm, 'manage_main')
    manage_main = DTMLFile('dtml/manageLexicon', globals())
# Hand the finished class to Globals.InitializeClass.
# NOTE(review): presumably this is what applies the ``security``
# declarations made in the class body -- confirm against Globals.
InitializeClass(PLexicon)