-
Notifications
You must be signed in to change notification settings - Fork 22
/
BooleanIndex.py
275 lines (230 loc) · 10.1 KB
/
BooleanIndex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from logging import getLogger
from App.special_dtml import DTMLFile
from BTrees.IIBTree import IIBTree, IITreeSet, IISet
from BTrees.IIBTree import union, intersection, difference
import BTrees.Length
from ZODB.POSException import ConflictError
from Products.PluginIndexes.unindex import _marker
from Products.PluginIndexes.unindex import UnIndex
LOG = getLogger('BooleanIndex.UnIndex')
class BooleanIndex(UnIndex):
"""Index for booleans
self._index = set([documentId1, documentId2])
self._unindex = {documentId:[True/False]}
self._length is the length of the unindex
self._index_length is the length of the index
self._index_value is the indexed value
The document ids in self._index have self._index_value as their value.
Since there are only two possible values (True/False), the index
only stores a forward index for the less common value.
It starts off with the opposite of value of the first document
and later checks and inverts itself, if more than 60% of all
documents now have the indexed value. It does the inversion
at 60% to avoid inverting itself constantly for an index that
has a roughly equal 50/50 split.
"""
meta_type = 'BooleanIndex'
manage_options = (
{'label': 'Settings',
'action': 'manage_main'},
{'label': 'Browse',
'action': 'manage_browse'},
)
query_options = ['query']
manage = manage_main = DTMLFile('dtml/manageBooleanIndex', globals())
manage_main._setName('manage_main')
manage_browse = DTMLFile('../dtml/browseIndex', globals())
_index_value = 1
_index_length = None
def clear(self):
self._index = IITreeSet()
self._index_length = BTrees.Length.Length()
self._index_value = 1
self._unindex = IIBTree()
self._length = BTrees.Length.Length()
if self._counter is None:
self._counter = BTrees.Length.Length()
else:
self._increment_counter()
def histogram(self):
"""Return a mapping which provides a histogram of the number of
elements found at each point in the index.
"""
histogram = {}
indexed = bool(self._index_value)
histogram[indexed] = self._index_length.value
histogram[not indexed] = self._length.value - self._index_length.value
return histogram
def _invert_index(self, documentId=None):
self._index_value = indexed = int(not self._index_value)
self._index.clear()
length = 0
for rid, value in self._unindex.iteritems():
if value == indexed:
self._index.add(rid)
length += 1
# documentId is the rid of the currently processed object that
# triggered the invert. in the case of unindexing, the rid hasn't
# been removed from the unindex yet. While indexing, the rid will
# be added to the index and unindex after this method is done
if documentId is not None:
self._index.remove(documentId)
length -= 1
self._index_length = BTrees.Length.Length(length)
def insertForwardIndexEntry(self, entry, documentId):
"""If the value matches the indexed one, insert into treeset
"""
# When we get the first entry, decide to index the opposite of what
# we got, as indexing zero items is fewer than one.
if self._length.value == 0:
self._index_value = int(not bool(entry))
# if the added entry value is index value, insert it into index
if bool(entry) is bool(self._index_value):
self._index_length.change(1)
self._index.insert(documentId)
# insert value into global unindex (before computing index invert)
self._unindex[documentId] = entry
self._length.change(1)
# is the index (after adding the current entry) larger than 60%
# of the total length? than switch the indexed value
if bool(entry) is bool(self._index_value):
if (self._index_length.value) >= ((self._length.value) * 0.6):
self._invert_index()
def removeForwardIndexEntry(self, entry, documentId, check=True):
"""Take the entry provided and remove any reference to documentId
in its entry in the index.
"""
if bool(entry) is bool(self._index_value):
try:
self._index.remove(documentId)
self._index_length.change(-1)
except ConflictError:
raise
except Exception:
LOG.exception('%(context)s: unindex_object could not '
'remove documentId %(doc_id)s from '
'index %(index)r. This should not '
'happen.', dict(
context=self.__class__.__name__,
doc_id=documentId,
index=self.id))
elif check:
# is the index (after removing the current entry) larger than
# 60% of the total length? than switch the indexed value
if (self._index_length.value) <= ((self._length.value - 1) * 0.6):
self._invert_index(documentId)
return
def _index_object(self, documentId, obj, threshold=None, attr=''):
""" index and object 'obj' with integer id 'documentId'"""
returnStatus = 0
# First we need to see if there's anything interesting to look at
datum = self.get_object_datum(obj, attr)
# Make it boolean, int as an optimization
if datum is not _marker:
datum = int(bool(datum))
# We don't want to do anything that we don't have to here, so we'll
# check to see if the new and existing information is the same.
oldDatum = self._unindex.get(documentId, _marker)
if datum != oldDatum:
if oldDatum is not _marker:
self.removeForwardIndexEntry(oldDatum, documentId, check=False)
if datum is _marker:
try:
del self._unindex[documentId]
self._length.change(-1)
except ConflictError:
raise
except Exception:
LOG.error('Should not happen: oldDatum was there, now '
'its not, for document with id %s',
documentId)
if datum is not _marker:
self.insertForwardIndexEntry(datum, documentId)
returnStatus = 1
return returnStatus
def unindex_object(self, documentId):
""" Unindex the object with integer id 'documentId' and don't
raise an exception if we fail
"""
unindexRecord = self._unindex.get(documentId, _marker)
if unindexRecord is _marker:
return None
self._increment_counter()
self.removeForwardIndexEntry(unindexRecord, documentId)
try:
del self._unindex[documentId]
self._length.change(-1)
except ConflictError:
raise
except Exception:
LOG.debug('Attempt to unindex nonexistent document'
' with id %s', documentId, exc_info=True)
def query_index(self, record, resultset=None):
index = self._index
indexed = self._index_value
for key in record.keys:
if bool(key) is bool(indexed):
# If we match the indexed value, check index
return intersection(index, resultset)
else:
# Otherwise, remove from resultset or _unindex
if resultset is None:
return union(difference(self._unindex, index), IISet([]))
else:
return intersection(difference(resultset, index),
self._unindex)
return IISet()
def indexSize(self):
"""Return distinct values, as an optimization we always claim 2."""
return 2
def items(self):
# return a list of value to int set of rid tuples
indexed = self._index_value
items = [(bool(indexed), self._index)]
false = IISet()
for rid, value in self._unindex.iteritems():
if value != indexed:
false.add(rid)
items.append((not bool(indexed), false))
return items
def uniqueValues(self, name=None, withLengths=0):
"""returns the unique values for name
if withLengths is true, returns a sequence of
tuples of (value, length)
"""
if name is None:
name = self.id
elif name != self.id:
return
indexed = bool(self._index_value)
unique_values = (indexed, not indexed)
if not withLengths:
for key in unique_values:
yield key
else:
for key in unique_values:
ilen = len(self._index)
if key is indexed:
yield (key, ilen)
else:
ulen = len(self._unindex)
yield (key, ulen - ilen)
manage_addBooleanIndexForm = DTMLFile('dtml/addBooleanIndex', globals())
def manage_addBooleanIndex(self, id, extra=None,
REQUEST=None, RESPONSE=None, URL3=None):
"""Add a boolean index"""
return self.manage_addIndex(
id, 'BooleanIndex', extra=extra,
REQUEST=REQUEST, RESPONSE=RESPONSE, URL1=URL3)