This repository has been archived by the owner on May 13, 2020. It is now read-only.
/
ZCTextIndex.py
372 lines (295 loc) · 12.8 KB
/
ZCTextIndex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Plug in text index for ZCatalog with relevance ranking."""
from cgi import escape
from types import TupleType
import ZODB
from Persistence import Persistent
import Acquisition
from Acquisition import aq_base, aq_inner, aq_parent
from OFS.SimpleItem import SimpleItem
from Globals import DTMLFile, InitializeClass
from AccessControl.SecurityInfo import ClassSecurityInfo
from AccessControl.Permissions import manage_zcatalog_indexes, search_zcatalog
from Products.PluginIndexes.common.PluggableIndex import \
PluggableIndexInterface
from Products.PluginIndexes.common.util import parseIndexRequest
from Products.PluginIndexes.common import safe_callable
from Products.ZCTextIndex.ILexicon import ILexicon
from Products.ZCTextIndex.Lexicon import \
Lexicon, Splitter, CaseNormalizer, StopWordRemover
from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.QueryParser import QueryParser
from PipelineFactory import element_factory
from Products.ZCTextIndex.CosineIndex import CosineIndex
from Products.ZCTextIndex.OkapiIndex import OkapiIndex
# Registry of available ranking implementations, keyed by the index type
# name that ZCTextIndex.__init__ receives as ``extra.index_type``.
index_types = {
    'Okapi BM25 Rank': OkapiIndex,
    'Cosine Measure': CosineIndex,
}
class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
    """Persistent TextIndex

    Pluggable ZCatalog index.  The actual word/document bookkeeping is
    delegated to ``self.index`` (an instance produced by the configured
    index factory); the lexicon is looked up by id on the acquisition
    parent and cached in the volatile attribute ``_v_lexicon``.
    """

    __implements__ = PluggableIndexInterface

    ## Magic class attributes ##

    meta_type = 'ZCTextIndex'

    manage_options = (
        {'label': 'Overview', 'action': 'manage_main'},
    )

    query_options = ['query']

    security = ClassSecurityInfo()
    security.declareObjectProtected(manage_zcatalog_indexes)

    ## Constructor ##

    def __init__(self, id, extra=None, caller=None, index_factory=None,
                 field_name=None, lexicon_id=None):
        """Create the index and bind it to an existing lexicon.

        id -- id of the index; also the fallback attribute name to index.
        extra -- optional record that may carry ``doc_attr``,
            ``lexicon_id`` and ``index_type``.  ``index_type`` is
            required when no ``index_factory`` is given.
        caller -- object on which the lexicon is looked up by id.
        index_factory -- callable taking the lexicon and returning the
            underlying index; overrides ``extra.index_type``.
        field_name -- comma separated names of the attributes to index.
        lexicon_id -- id of the lexicon to use.

        Raises LookupError if the lexicon cannot be found on ``caller``
        and ValueError if it does not implement ILexicon or if
        ``extra.index_type`` is not a key of ``index_types``.
        """
        self.id = id

        # Arguments can be passed directly to the constructor or
        # via the silly "extra" record.
        self._fieldname = field_name or getattr(extra, 'doc_attr', '') or id
        # Strip first, then drop empties, so that whitespace-only entries
        # (e.g. "title, ,body") do not end up as '' attribute names.
        stripped = [attr.strip() for attr in self._fieldname.split(',')]
        self._indexed_attrs = [attr for attr in stripped if attr]

        lexicon_id = lexicon_id or getattr(extra, 'lexicon_id', '')
        lexicon = getattr(caller, lexicon_id, None)

        if lexicon is None:
            raise LookupError('Lexicon "%s" not found' % escape(lexicon_id))

        if not ILexicon.isImplementedBy(lexicon):
            raise ValueError('Object "%s" does not implement '
                             'ZCTextIndex Lexicon interface'
                             % lexicon.getId())

        self.lexicon_id = lexicon.getId()
        self._v_lexicon = lexicon

        if index_factory is None:
            if extra.index_type not in index_types:
                raise ValueError('Invalid index type "%s"' % escape(
                    extra.index_type))
            self._index_factory = index_types[extra.index_type]
            self._index_type = extra.index_type
        else:
            self._index_factory = index_factory

        # The index gets the lexicon without its acquisition wrapper.
        self.index = self._index_factory(aq_base(self.getLexicon()))

    ## Private Methods ##

    security.declarePrivate('getLexicon')

    def getLexicon(self):
        """Get the lexicon for this index

        Migrates instances that still carry a ``lexicon`` or
        ``lexicon_path`` attribute to ``lexicon_id``, then returns the
        cached lexicon or looks it up by id on the acquisition parent.

        Raises TypeError if the object found does not implement ILexicon.
        """
        if hasattr(aq_base(self), 'lexicon'):
            # Fix up old ZCTextIndexes by removing direct lexicon ref
            # and changing it to an ID
            lexicon = getattr(aq_parent(aq_inner(self)), self.lexicon.getId())
            self.lexicon_id = lexicon.getId()
            del self.lexicon

        if getattr(aq_base(self), 'lexicon_path', None):
            # Fix up slightly less old ZCTextIndexes by removing
            # the physical path and changing it to an ID.
            # There's no need to use a physical path, which otherwise
            # makes it difficult to move or rename ZCatalogs.
            self.lexicon_id = self.lexicon_path[-1]
            del self.lexicon_path

        try:
            return self._v_lexicon
        except AttributeError:
            lexicon = getattr(aq_parent(aq_inner(self)), self.lexicon_id)
            if not ILexicon.isImplementedBy(lexicon):
                raise TypeError('Object "%s" is not a ZCTextIndex Lexicon'
                                % repr(lexicon))
            self._v_lexicon = lexicon
            return lexicon

    ## External methods not in the Pluggable Index API ##

    security.declareProtected(search_zcatalog, 'query')

    def query(self, query, nbest=10):
        """Return pair (best results, num results).

        The best results are whatever NBest(nbest) selects from the
        (docid, score) items of the query result.  The num results is
        the total number of results before trimming to the nbest
        results.  If the query yields None, ([], 0) is returned.
        """
        tree = QueryParser(self.getLexicon()).parseQuery(query)
        results = tree.executeQuery(self.index)
        if results is None:
            return [], 0
        chooser = NBest(nbest)
        chooser.addmany(results.items())
        return chooser.getbest(), len(results)

    ## Pluggable Index APIs ##

    def index_object(self, documentId, obj, threshold=None):
        """Index the configured attributes of obj under documentId.

        Wrapper to handle indexing of multiple attributes: the values of
        all indexed attributes (called first if callable, skipped if
        missing or None) are joined with a space and indexed as one
        text.  Returns the result of the underlying index_doc() call,
        or 0 if there was nothing to index.  ``threshold`` is unused.
        """
        # needed for backward compatibility: old instances have no
        # _indexed_attrs attribute
        try:
            fields = self._indexed_attrs
        except AttributeError:
            fields = [self._fieldname]

        all_texts = []
        for attr in fields:
            text = getattr(obj, attr, None)
            if text is None:
                continue
            if safe_callable(text):
                text = text()
            if text is None:
                continue
            all_texts.append(text)

        if all_texts:
            return self.index.index_doc(documentId, ' '.join(all_texts))
        else:
            return 0

    def unindex_object(self, docid):
        """Remove docid from the index if it is present."""
        if self.index.has_doc(docid):
            self.index.unindex_doc(docid)

    def _apply_index(self, request, cid=''):
        """Apply query specified by request, a mapping containing the query.

        Returns two objects on success: the resultSet containing the
        matching record numbers, and a tuple containing the names of
        the fields used.

        Returns None if request is not valid for this index.
        """
        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None:
            return None
        query_str = ' '.join(record.keys)
        if not query_str:
            return None
        tree = QueryParser(self.getLexicon()).parseQuery(query_str)
        results = tree.executeQuery(self.index)
        return results, (self.id,)

    def getEntryForObject(self, documentId, default=None):
        """Return the list of words indexed for documentId

        Returns ``default`` if looking up the document raises KeyError.
        """
        try:
            word_ids = self.index.get_words(documentId)
        except KeyError:
            return default
        get_word = self.getLexicon().get_word
        return [get_word(wid) for wid in word_ids]

    def uniqueValues(self, name=None, withLengths=0):
        """Not supported by this index; always raises NotImplementedError."""
        raise NotImplementedError

    ## The ZCatalog Index management screen uses these methods ##

    def numObjects(self):
        """Return number of unique words in the index"""
        return self.index.length()

    def indexSize(self):
        """Return the number of indexed objects"""
        return self.index.document_count()

    def clear(self):
        """reinitialize the index (but not the lexicon)"""
        try:
            # Remove the cached reference to the lexicon
            # So that it is refreshed
            del self._v_lexicon
        except (AttributeError, KeyError):
            pass
        # As in __init__, hand the factory the lexicon without its
        # acquisition wrapper so no wrapper ends up inside the new index.
        self.index = self._index_factory(aq_base(self.getLexicon()))

    ## User Interface Methods ##

    manage_main = DTMLFile('dtml/manageZCTextIndex', globals())

    def getIndexSourceNames(self):
        """Return sequence of names of indexed attributes"""
        try:
            return self._indexed_attrs
        except AttributeError:
            # Old instances only know a single field name.
            return [self._fieldname]

    def getIndexType(self):
        """Return index type string

        Falls back to the index factory's __name__ when no explicit
        type name was recorded at construction time.
        """
        return getattr(self, '_index_type', self._index_factory.__name__)

    def getLexiconURL(self):
        """Return the url of the lexicon used by the index

        Returns None if looking up the lexicon raises KeyError or
        AttributeError.
        """
        try:
            lex = self.getLexicon()
        except (KeyError, AttributeError):
            return None
        else:
            return lex.absolute_url()


InitializeClass(ZCTextIndex)
def manage_addZCTextIndex(self, id, extra=None, REQUEST=None,
                          RESPONSE=None):
    """Add a text index"""
    # URL3 is only available when called with a request.
    URL3 = None
    if REQUEST is not None:
        URL3 = REQUEST.URL3
    # Delegate the actual creation to the container's generic
    # manage_addIndex, fixing the index type to 'ZCTextIndex'.
    return self.manage_addIndex(
        id, 'ZCTextIndex', extra, REQUEST, RESPONSE, URL3)
# DTML-backed form for adding a ZCTextIndex (template 'dtml/addZCTextIndex').
manage_addZCTextIndexForm = DTMLFile('dtml/addZCTextIndex', globals())
# DTML-backed form for adding a lexicon (template 'dtml/addLexicon').
manage_addLexiconForm = DTMLFile('dtml/addLexicon', globals())
def manage_addLexicon(self, id, title='', elements=(), REQUEST=None):
    """Add ZCTextIndex Lexicon

    Builds a pipeline from ``elements``, creates a PLexicon with it and
    stores it on ``self`` under ``id``.

    elements -- sequence of records.  Records without a ``name``
        attribute are skipped; for the others ``group`` and ``name``
        are passed to element_factory.instantiate().
    REQUEST -- when given, the rendered ``self.manage_main`` is
        returned; otherwise the function returns None.
    """
    pipeline = []
    for el_record in elements:
        if not hasattr(el_record, 'name'):
            continue  # Skip over records that only specify element group
        element = element_factory.instantiate(el_record.group, el_record.name)
        if element is None:
            continue
        if el_record.group == 'Word Splitter':
            # I don't like hardcoding this, but it's a simple solution
            # to get the splitter element first in the pipeline
            pipeline.insert(0, element)
        else:
            pipeline.append(element)

    lexicon = PLexicon(id, title, *pipeline)
    self._setObject(id, lexicon)
    if REQUEST is not None:
        return self.manage_main(self, REQUEST, update_menu=1)
# I am borrowing the existing vocabulary permissions for now to avoid
# adding new permissions. This may change when old-style Vocabs go away.
# LexiconQueryPerm protects the PLexicon object and its queryLexicon
# screen; LexiconMgmtPerm protects PLexicon.manage_main.
LexiconQueryPerm = 'Query Vocabulary'
LexiconMgmtPerm = 'Manage Vocabulary'
class PLexicon(Lexicon, Acquisition.Implicit, SimpleItem):
    """Lexicon for ZCTextIndex"""

    meta_type = 'ZCTextIndex Lexicon'

    manage_options = (
        {'label': 'Overview', 'action': 'manage_main'},
        {'label': 'Query', 'action': 'queryLexicon'},
    ) + SimpleItem.manage_options

    security = ClassSecurityInfo()
    security.declareObjectProtected(LexiconQueryPerm)

    def __init__(self, id, title='', *pipeline):
        """Store id and title as strings and set up the pipeline."""
        self.id = str(id)
        self.title = str(title)
        # Hand the pipeline elements to the inherited constructor.
        PLexicon.inheritedAttribute('__init__')(self, *pipeline)

    ## User Interface Methods ##

    def getPipelineNames(self):
        """Return list of names of pipeline element classes"""
        names = []
        for element in self._pipeline:
            names.append(element.__class__.__name__)
        return names

    _queryLexicon = DTMLFile('dtml/queryLexicon', globals())

    security.declareProtected(LexiconQueryPerm, 'queryLexicon')

    def queryLexicon(self, REQUEST, words=None, page=0, rows=20, cols=4):
        """Lexicon browser/query user interface

        Shows one page of words, laid out in columns of ``rows`` words.
        When ``words`` is given, each entry is treated as a glob pattern
        and only the matching words are listed; otherwise all words of
        the lexicon are listed.  ``rows`` is clamped to 1..500 and
        ``cols`` to 1..12.
        """
        if not words:
            words = self.words()
        else:
            wids = []
            for pattern in words:
                wids.extend(self.globToWordIds(pattern))
            words = [self.get_word(wid) for wid in wids]

        word_count = len(words)
        rows = max(min(rows, 500), 1)
        cols = max(min(cols, 12), 1)
        per_page = rows * cols

        # Number of full pages, plus one more for a partial last page.
        page_count = word_count / per_page + (word_count % per_page > 0)

        # Keep the requested page inside the valid range.
        page = max(min(page, page_count - 1), 0)
        start = per_page * page
        end = min(per_page * (page + 1), word_count)

        if not word_count:
            words = []
        else:
            words = list(words[start:end])

        # Cut the page into consecutive chunks of ``rows`` words each.
        columns = []
        offset = 0
        while offset < len(words):
            columns.append(words[offset:offset + rows])
            offset += rows

        template_args = {
            'page': page,
            'rows': rows,
            'cols': cols,
            'start_word': start + 1,
            'end_word': end,
            'word_count': word_count,
            'page_count': page_count,
            'page_range': xrange(page_count),
            'page_columns': columns,
        }
        return self._queryLexicon(self, REQUEST, **template_args)

    security.declareProtected(LexiconMgmtPerm, 'manage_main')
    manage_main = DTMLFile('dtml/manageLexicon', globals())


InitializeClass(PLexicon)