Skip to content

Commit

Permalink
Consider only indexes with inhomogen distribution of items per unique…
Browse files Browse the repository at this point in the history
… value
  • Loading branch information
andbag committed Apr 11, 2019
1 parent 852c51f commit 800f45d
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 26 deletions.
60 changes: 45 additions & 15 deletions src/Products/ZCatalog/plan.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,19 @@
REFRESH_RATE = 100
VALUE_INDEX_KEY = 'VALUE_INDEXES'

# Critical values of the chi-squared distribution at a significance level
# of 0.05 (i.e. the 0.95 quantile). The dict is keyed by the degrees of
# freedom; only 1 through 10 degrees of freedom are tabulated here.
CHISQ_QUANTILE = {
    1: 3.84,
    2: 5.99,
    3: 7.82,
    4: 9.49,
    5: 11.07,
    6: 12.59,
    7: 14.07,
    8: 15.51,
    9: 16.92,
    10: 18.31,
}

# Simple immutable records: a (start, end) pair, and a per-index
# measurement made of a name, a duration and a limit.
Duration = namedtuple('Duration', 'start end')
IndexMeasurement = namedtuple('IndexMeasurement', 'name duration limit')
Expand Down Expand Up @@ -191,9 +204,9 @@ def valueindexes(self):
# in the report key. The number of unique values for the index needs to
# be lower than the MAX_DISTINCT_VALUES watermark.

# TODO: Ideally we would only consider those indexes with a small
# Ideally we would only consider those indexes with a small
# number of unique values, where the number of items for each value
# differs a lot. If the number of items per value is similar, the
# differs a lot. If the distribution of items per value is homogeneous, the
# duration of a query is likely similar as well.
value_indexes = PriorityMap.get_entry(self.cid, VALUE_INDEX_KEY)
if isinstance(value_indexes, (frozenset, set)):
Expand All @@ -206,19 +219,36 @@ def valueindexes(self):
value_indexes = set()
for name, index in indexes.items():
if IUniqueValueIndex.providedBy(index):
values = index.uniqueValues()
i = 0
for value in values:
# the total number of unique values might be large and
# expensive to load, so we only check if we can get
# more than MAX_DISTINCT_VALUES
if i >= MAX_DISTINCT_VALUES:
break
i += 1
if i > 0 and i < MAX_DISTINCT_VALUES:
# Only consider indexes which actually return a number
# greater than zero
value_indexes.add(name)
# Skip indexes with high number of unique values
# or no degree of freedom (isize < 2)
# index size
isize = index.indexSize()
if isize < 2 or isize >= MAX_DISTINCT_VALUES:
continue

values = index.uniqueValues(withLengths=True)

# The chi-square test serves as a measure of how homogeneous
# the distribution of items per value is; see
# https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test

# null hypothesis (expected value) for a homogeneous distribution
ev = float(len(self.catalog)) / float(isize)
chi_square = 0
for value, l in values:
chi_square += (float(l) - float(ev))**2 / float(ev)

# Check the null hypothesis. If chi_square is lower than the
# CHISQ_QUANTILE value, the distribution of items per value is
# homogeneous. Therefore, the null hypothesis is not rejected.

# Degree of freedom
df = isize - 1
if chi_square < CHISQ_QUANTILE[df]:
continue

# null hypothesis is rejected
value_indexes.add(name)

value_indexes = frozenset(value_indexes)
PriorityMap.set_entry(self.cid, VALUE_INDEX_KEY, value_indexes)
Expand Down
24 changes: 13 additions & 11 deletions src/Products/ZCatalog/tests/test_plan.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def __init__(self, num):
self.num = num

def big(self):
return self.num > 5
return self.num > 2

def numbers(self):
return (self.num, self.num + 1)
Expand Down Expand Up @@ -314,7 +314,7 @@ def _make_catalog(self):
zcat._catalog.addIndex('numbers', KeywordIndex('numbers'))
zcat._catalog.addIndex('path', PathIndex('getPhysicalPath'))
zcat._catalog.addIndex('uuid', UUIDIndex('num'))
for i in range(9):
for i in range(19):
obj = Dummy(i)
zcat.catalog_object(obj, str(i))
return zcat
Expand All @@ -328,18 +328,20 @@ def test_uniquevalues(self):
indexes = zcat._catalog.indexes
self.assertEqual(len(list(indexes['big'].uniqueValues())), 2)
self.assertEqual(len(list(indexes['date'].uniqueValues())), 0)
self.assertEqual(len(list(indexes['date'].uniqueValues('start'))), 9)
self.assertEqual(len(list(indexes['date'].uniqueValues('end'))), 9)
self.assertEqual(len(list(indexes['num'].uniqueValues())), 9)
self.assertEqual(len(list(indexes['numbers'].uniqueValues())), 10)
self.assertEqual(len(list(indexes['path'].uniqueValues())), 9)
self.assertEqual(len(list(indexes['uuid'].uniqueValues())), 9)
self.assertEqual(len(list(indexes['date'].uniqueValues('start'))), 19)
self.assertEqual(len(list(indexes['date'].uniqueValues('end'))), 19)
self.assertEqual(len(list(indexes['num'].uniqueValues())), 19)
self.assertEqual(len(list(indexes['numbers'].uniqueValues())), 20)
self.assertEqual(len(list(indexes['path'].uniqueValues())), 19)
self.assertEqual(len(list(indexes['uuid'].uniqueValues())), 19)

def test_valueindexes(self):
zcat = self._make_catalog()
plan = self._make_plan(zcat._catalog)
# only 'big' is a valueindex where the number
# of items per value is inhomogeneous
self.assertEqual(plan.valueindexes(),
frozenset(['big', 'num', 'path', 'uuid']))
frozenset(['big']))


class TestCatalogReport(cleanup.CleanUp, unittest.TestCase):
Expand All @@ -349,7 +351,7 @@ def setUp(self):
self.zcat = ZCatalog('catalog')
self.zcat.long_query_time = 0.0
self._add_indexes()
for i in range(9):
for i in range(19):
obj = Dummy(i)
self.zcat.catalog_object(obj, str(i))

Expand Down Expand Up @@ -412,7 +414,7 @@ def test_ReportKey(self):
self.assertEqual(r['counter'], 1)

# query key 3
key = ('sort_on', ('num', '[3, 4, 5]'))
key = ('num', 'sort_on')
self.zcat.manage_resetCatalogReport()

self.zcat.searchResults(num=[5, 4, 3], sort_on='num')
Expand Down

0 comments on commit 800f45d

Please sign in to comment.