Consider only indexes with inhomogeneous distribution of items per unique value
zopefoundation · Apr 11, 2019 · 800f45d · 800f45d
1 parent 852c51f
commit 800f45d
Show file tree

Hide file tree

Showing 2 changed files with 58 additions and 26 deletions.
diff --git a/src/Products/ZCatalog/plan.py b/src/Products/ZCatalog/plan.py
@@ -31,6 +31,19 @@
 REFRESH_RATE = 100
 VALUE_INDEX_KEY = 'VALUE_INDEXES'
 
+# Chi-squared distribution table with P-value 0.05 (0.95 quantile)
+# key of dict is the degree of freedom
+CHISQ_QUANTILE = {1: 3.84,
+                  2: 5.99,
+                  3: 7.82,
+                  4: 9.49,
+                  5: 11.07,
+                  6: 12.59,
+                  7: 14.07,
+                  8: 15.51,
+                  9: 16.92,
+                  10: 18.31}
+
 Duration = namedtuple('Duration', ['start', 'end'])
 IndexMeasurement = namedtuple('IndexMeasurement',
                               ['name', 'duration', 'limit'])
@@ -191,9 +204,9 @@ def valueindexes(self):
         # in the report key. The number of unique values for the index needs to
         # be lower than the MAX_DISTINCT_VALUES watermark.
 
-        # TODO: Ideally who would only consider those indexes with a small
+        # Ideally we would only consider those indexes with a small
         # number of unique values, where the number of items for each value
-        # differs a lot. If the number of items per value is similar, the
+        # differs a lot. If the items per value are homogeneous, the
         # duration of a query is likely similar as well.
         value_indexes = PriorityMap.get_entry(self.cid, VALUE_INDEX_KEY)
         if isinstance(value_indexes, (frozenset, set)):
@@ -206,19 +219,36 @@ def valueindexes(self):
         value_indexes = set()
         for name, index in indexes.items():
             if IUniqueValueIndex.providedBy(index):
-                values = index.uniqueValues()
-                i = 0
-                for value in values:
-                    # the total number of unique values might be large and
-                    # expensive to load, so we only check if we can get
-                    # more than MAX_DISTINCT_VALUES
-                    if i >= MAX_DISTINCT_VALUES:
-                        break
-                    i += 1
-                if i > 0 and i < MAX_DISTINCT_VALUES:
-                    # Only consider indexes which actually return a number
-                    # greater than zero
-                    value_indexes.add(name)
+                # Skip indexes with high number of unique values
+                # or no degree of freedom (isize < 2)
+                # index size
+                isize = index.indexSize()
+                if isize < 2 or isize >= MAX_DISTINCT_VALUES:
+                    continue
+
+                values = index.uniqueValues(withLengths=True)
+
+                # chi-square test serves as a measure of the homogeneity
+                # of the distribution of items for each value
+                # https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
+
+                # null hypothesis: expected items per value if homogeneous
+                ev = float(len(self.catalog)) / float(isize)
+                chi_square = 0
+                for value, l in values:
+                    chi_square += (float(l) - float(ev))**2 / float(ev)
+
+                # Check null hypothesis. If chi_square is lower than
+                # CHISQ_QUANTILE, the distribution of items per value is
+                # homogeneous, so the null hypothesis is not rejected.
+
+                # Degree of freedom
+                df = isize - 1
+                if chi_square < CHISQ_QUANTILE[df]:
+                    continue
+
+                # null hypothesis is rejected
+                value_indexes.add(name)
 
         value_indexes = frozenset(value_indexes)
         PriorityMap.set_entry(self.cid, VALUE_INDEX_KEY, value_indexes)

diff --git a/src/Products/ZCatalog/tests/test_plan.py b/src/Products/ZCatalog/tests/test_plan.py
@@ -38,7 +38,7 @@ def __init__(self, num):
         self.num = num
 
     def big(self):
-        return self.num > 5
+        return self.num > 2
 
     def numbers(self):
         return (self.num, self.num + 1)
@@ -314,7 +314,7 @@ def _make_catalog(self):
         zcat._catalog.addIndex('numbers', KeywordIndex('numbers'))
         zcat._catalog.addIndex('path', PathIndex('getPhysicalPath'))
         zcat._catalog.addIndex('uuid', UUIDIndex('num'))
-        for i in range(9):
+        for i in range(19):
             obj = Dummy(i)
             zcat.catalog_object(obj, str(i))
         return zcat
@@ -328,18 +328,20 @@ def test_uniquevalues(self):
         indexes = zcat._catalog.indexes
         self.assertEqual(len(list(indexes['big'].uniqueValues())), 2)
         self.assertEqual(len(list(indexes['date'].uniqueValues())), 0)
-        self.assertEqual(len(list(indexes['date'].uniqueValues('start'))), 9)
-        self.assertEqual(len(list(indexes['date'].uniqueValues('end'))), 9)
-        self.assertEqual(len(list(indexes['num'].uniqueValues())), 9)
-        self.assertEqual(len(list(indexes['numbers'].uniqueValues())), 10)
-        self.assertEqual(len(list(indexes['path'].uniqueValues())), 9)
-        self.assertEqual(len(list(indexes['uuid'].uniqueValues())), 9)
+        self.assertEqual(len(list(indexes['date'].uniqueValues('start'))), 19)
+        self.assertEqual(len(list(indexes['date'].uniqueValues('end'))), 19)
+        self.assertEqual(len(list(indexes['num'].uniqueValues())), 19)
+        self.assertEqual(len(list(indexes['numbers'].uniqueValues())), 20)
+        self.assertEqual(len(list(indexes['path'].uniqueValues())), 19)
+        self.assertEqual(len(list(indexes['uuid'].uniqueValues())), 19)
 
     def test_valueindexes(self):
         zcat = self._make_catalog()
         plan = self._make_plan(zcat._catalog)
+        # only 'big' is a valueindex where the number
+        # of items per value is inhomogeneous
         self.assertEqual(plan.valueindexes(),
-                         frozenset(['big', 'num', 'path', 'uuid']))
+                         frozenset(['big']))
 
 
 class TestCatalogReport(cleanup.CleanUp, unittest.TestCase):
@@ -349,7 +351,7 @@ def setUp(self):
         self.zcat = ZCatalog('catalog')
         self.zcat.long_query_time = 0.0
         self._add_indexes()
-        for i in range(9):
+        for i in range(19):
             obj = Dummy(i)
             self.zcat.catalog_object(obj, str(i))
 
@@ -412,7 +414,7 @@ def test_ReportKey(self):
         self.assertEqual(r['counter'], 1)
 
         # query key 3
-        key = ('sort_on', ('num', '[3, 4, 5]'))
+        key = ('num', 'sort_on')
         self.zcat.manage_resetCatalogReport()
 
         self.zcat.searchResults(num=[5, 4, 3], sort_on='num')