Skip to content

Commit 1d88401

Browse files
Tobias Christiani, Jan Wedvik
Tobias Christiani
authored and
Jan Wedvik
committed
Bug#33935417: Histograms cause zero row estimates for values outside histogram buckets
Histograms in MySQL return a selectivity estimate of zero for values that are outside of buckets. Values can be missing from the histogram because they were missed during sampling, or because the histogram has grown stale. This patch introduces a constant lower bound of 0.001 on the selectivity estimates produced by histograms. This choice of lower bound corresponds to the selectivity of a value/range that we are likely to miss during sampling. Using a constant lower bound rather than a statistical estimate for the selectivity of a missing value has the advantage of simplicity and predictability. It also provides some protection against underestimating the selectivity due to stale histograms and within-bucket heuristics. Change-Id: I94dceaf65995fce618abd01bc9ee80c1ffac677a Signed-off-by: Jan Wedvik <jan.wedvik@oracle.com>
1 parent f40c374 commit 1d88401

File tree

5 files changed

+216
-1
lines changed

5 files changed

+216
-1
lines changed

mysql-test/r/histograms.result

+83
Original file line numberDiff line numberDiff line change
@@ -3888,4 +3888,87 @@ Table Op Msg_type Msg_text
38883888
test.t1 histogram Error Lock wait timeout exceeded; try restarting transaction
38893889
UNLOCK INSTANCE;
38903890
DROP TABLE t1;
3891+
#
3892+
# Bug#33935417 Histograms cause zero row estimates for values outside
3893+
# histogram buckets
3894+
#
3895+
CREATE TABLE ten (x INT);
3896+
INSERT INTO ten VALUES (0), (1), (2), (3), (4), (5), (6), (7), (8), (9);
3897+
CREATE TABLE hundred (x INT);
3898+
INSERT INTO hundred SELECT 10*ten1.x + ten0.x AS v
3899+
FROM ten AS ten1, ten AS ten0 ORDER BY v;
3900+
CREATE TABLE ten_thousand (x INT);
3901+
INSERT INTO ten_thousand SELECT 100*h1.x + h0.x AS v
3902+
FROM hundred AS h1, hundred AS h0 ORDER BY v;
3903+
EXPLAIN SELECT * FROM ten WHERE x = -1;
3904+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3905+
1 SIMPLE ten NULL ALL NULL NULL NULL NULL 10 10.00 Using where
3906+
Warnings:
3907+
Note 1003 /* select#1 */ select `test`.`ten`.`x` AS `x` from `test`.`ten` where (`test`.`ten`.`x` = <cache>(-(1)))
3908+
EXPLAIN SELECT * FROM hundred WHERE x = -1;
3909+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3910+
1 SIMPLE hundred NULL ALL NULL NULL NULL NULL 100 10.00 Using where
3911+
Warnings:
3912+
Note 1003 /* select#1 */ select `test`.`hundred`.`x` AS `x` from `test`.`hundred` where (`test`.`hundred`.`x` = <cache>(-(1)))
3913+
EXPLAIN SELECT * FROM ten_thousand WHERE x = -1;
3914+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3915+
1 SIMPLE ten_thousand NULL ALL NULL NULL NULL NULL 10000 10.00 Using where
3916+
Warnings:
3917+
Note 1003 /* select#1 */ select `test`.`ten_thousand`.`x` AS `x` from `test`.`ten_thousand` where (`test`.`ten_thousand`.`x` = <cache>(-(1)))
3918+
ANALYZE TABLE ten UPDATE HISTOGRAM ON x;
3919+
Table Op Msg_type Msg_text
3920+
test.ten histogram status Histogram statistics created for column 'x'.
3921+
ANALYZE TABLE ten;
3922+
Table Op Msg_type Msg_text
3923+
test.ten analyze status OK
3924+
ANALYZE TABLE hundred UPDATE HISTOGRAM ON x;
3925+
Table Op Msg_type Msg_text
3926+
test.hundred histogram status Histogram statistics created for column 'x'.
3927+
ANALYZE TABLE hundred;
3928+
Table Op Msg_type Msg_text
3929+
test.hundred analyze status OK
3930+
ANALYZE TABLE ten_thousand UPDATE HISTOGRAM ON x;
3931+
Table Op Msg_type Msg_text
3932+
test.ten_thousand histogram status Histogram statistics created for column 'x'.
3933+
ANALYZE TABLE ten_thousand;
3934+
Table Op Msg_type Msg_text
3935+
test.ten_thousand analyze status OK
3936+
EXPLAIN SELECT * FROM ten WHERE x = -1;
3937+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3938+
1 SIMPLE ten NULL ALL NULL NULL NULL NULL 10 10.00 Using where
3939+
Warnings:
3940+
Note 1003 /* select#1 */ select `test`.`ten`.`x` AS `x` from `test`.`ten` where (`test`.`ten`.`x` = <cache>(-(1)))
3941+
EXPLAIN SELECT * FROM hundred WHERE x = -1;
3942+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3943+
1 SIMPLE hundred NULL ALL NULL NULL NULL NULL 100 1.00 Using where
3944+
Warnings:
3945+
Note 1003 /* select#1 */ select `test`.`hundred`.`x` AS `x` from `test`.`hundred` where (`test`.`hundred`.`x` = <cache>(-(1)))
3946+
EXPLAIN SELECT * FROM ten_thousand WHERE x = -1;
3947+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3948+
1 SIMPLE ten_thousand NULL ALL NULL NULL NULL NULL 9980 0.10 Using where
3949+
Warnings:
3950+
Note 1003 /* select#1 */ select `test`.`ten_thousand`.`x` AS `x` from `test`.`ten_thousand` where (`test`.`ten_thousand`.`x` = <cache>(-(1)))
3951+
EXPLAIN SELECT * FROM ten_thousand WHERE x < -1;
3952+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3953+
1 SIMPLE ten_thousand NULL ALL NULL NULL NULL NULL 9980 0.10 Using where
3954+
Warnings:
3955+
Note 1003 /* select#1 */ select `test`.`ten_thousand`.`x` AS `x` from `test`.`ten_thousand` where (`test`.`ten_thousand`.`x` < <cache>(-(1)))
3956+
EXPLAIN SELECT * FROM ten_thousand WHERE x BETWEEN -100 AND -1;
3957+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3958+
1 SIMPLE ten_thousand NULL ALL NULL NULL NULL NULL 9980 0.10 Using where
3959+
Warnings:
3960+
Note 1003 /* select#1 */ select `test`.`ten_thousand`.`x` AS `x` from `test`.`ten_thousand` where (`test`.`ten_thousand`.`x` between <cache>(-(100)) and <cache>(-(1)))
3961+
EXPLAIN SELECT * FROM ten_thousand WHERE x IN (-2, -1);
3962+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3963+
1 SIMPLE ten_thousand NULL ALL NULL NULL NULL NULL 9980 0.10 Using where
3964+
Warnings:
3965+
Note 1003 /* select#1 */ select `test`.`ten_thousand`.`x` AS `x` from `test`.`ten_thousand` where (`test`.`ten_thousand`.`x` in (<cache>(-(2)),<cache>(-(1))))
3966+
EXPLAIN SELECT * FROM ten_thousand WHERE x IN (1, 2, 3);
3967+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3968+
1 SIMPLE ten_thousand NULL ALL NULL NULL NULL NULL 9980 0.10 Using where
3969+
Warnings:
3970+
Note 1003 /* select#1 */ select `test`.`ten_thousand`.`x` AS `x` from `test`.`ten_thousand` where (`test`.`ten_thousand`.`x` in (1,2,3))
3971+
DROP TABLE ten;
3972+
DROP TABLE hundred;
3973+
DROP TABLE ten_thousand;
38913974
# restart:

mysql-test/t/histograms.test

+47
Original file line numberDiff line numberDiff line change
@@ -2249,6 +2249,53 @@ UNLOCK INSTANCE;
22492249
DROP TABLE t1;
22502250
--disconnect con1
22512251

2252+
-- echo #
2253+
-- echo # Bug#33935417 Histograms cause zero row estimates for values outside
2254+
-- echo # histogram buckets
2255+
-- echo #
2256+
2257+
# Verify that selectivity estimates are lower bounded by 0.001.
2258+
CREATE TABLE ten (x INT);
2259+
INSERT INTO ten VALUES (0), (1), (2), (3), (4), (5), (6), (7), (8), (9);
2260+
2261+
CREATE TABLE hundred (x INT);
2262+
INSERT INTO hundred SELECT 10*ten1.x + ten0.x AS v
2263+
FROM ten AS ten1, ten AS ten0 ORDER BY v;
2264+
2265+
CREATE TABLE ten_thousand (x INT);
2266+
INSERT INTO ten_thousand SELECT 100*h1.x + h0.x AS v
2267+
FROM hundred AS h1, hundred AS h0 ORDER BY v;
2268+
2269+
# The default selectivity used for equality predicates is 0.1.
2270+
EXPLAIN SELECT * FROM ten WHERE x = -1;
2271+
EXPLAIN SELECT * FROM hundred WHERE x = -1;
2272+
EXPLAIN SELECT * FROM ten_thousand WHERE x = -1;
2273+
2274+
# Build histograms on all tables and ensure statistics are up to date.
2275+
ANALYZE TABLE ten UPDATE HISTOGRAM ON x;
2276+
ANALYZE TABLE ten;
2277+
ANALYZE TABLE hundred UPDATE HISTOGRAM ON x;
2278+
ANALYZE TABLE hundred;
2279+
ANALYZE TABLE ten_thousand UPDATE HISTOGRAM ON x;
2280+
ANALYZE TABLE ten_thousand;
2281+
2282+
# The old optimizer uses a selectivity estimate of max(1/#rows, histogram_estimate).
2283+
# At 10k rows we should see the difference with the new lower bound of 1/1000
2284+
# being used instead of 1/10000.
2285+
EXPLAIN SELECT * FROM ten WHERE x = -1;
2286+
EXPLAIN SELECT * FROM hundred WHERE x = -1;
2287+
EXPLAIN SELECT * FROM ten_thousand WHERE x = -1;
2288+
2289+
# The lower bound is used for all predicates supported by the histogram.
2290+
EXPLAIN SELECT * FROM ten_thousand WHERE x < -1;
2291+
EXPLAIN SELECT * FROM ten_thousand WHERE x BETWEEN -100 AND -1;
2292+
EXPLAIN SELECT * FROM ten_thousand WHERE x IN (-2, -1);
2293+
EXPLAIN SELECT * FROM ten_thousand WHERE x IN (1, 2, 3);
2294+
2295+
DROP TABLE ten;
2296+
DROP TABLE hundred;
2297+
DROP TABLE ten_thousand;
2298+
22522299
# Run a restart without any special parameters, which causes "check testcase" to
22532300
# be run. Always keep this at the very end of the test!
22542301
let $restart_parameters = restart:;

sql/histograms/equi_height.cc

+32-1
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,33 @@ static ha_rows FindBucketMaxValues(const Value_map<T> &value_map,
235235
low frequency values can be between u and (1/s)*u. In order to minimize the
236236
worst-case relative error, we use the geometric mean of these two values.
237237
238-
Further considerations:
238+
Important note:
239+
240+
This estimator was designed for uniform random sampling. We currently use
241+
page-level sampling for histograms. This can cause us to underestimate the
242+
number of distinct values by nearly a factor 1/s in the worst case. The
243+
reason is that we only scale up the number of singleton values.
244+
With page-level sampling we can have pairs of distinct values occurring
245+
together so that we will have u=0 in the formula above.
246+
247+
For now, we opt to keep the formula as it is, since we would rather
248+
underestimate than overestimate the number of distinct values. Potential
249+
solutions:
250+
251+
1) Use a custom estimator for page-level sampling [3]. This requires changes
252+
to the sampling interface to InnoDB to support counting the number of pages
253+
a value appears in.
254+
255+
2) Use the simpler estimate of sqrt(1/s)*d, the geometric mean between the
256+
lower bound of d and the upper bound of d/s. This has the downside of
257+
overestimating the number of distinct values by sqrt(1/s) in cases where
258+
the table only contains heavy hitters.
259+
260+
3) Simulate uniform random sampling on top of the page-level sampling.
261+
Postgres does this, but it requires sampling as many pages as the target
262+
number of rows.
263+
264+
Further considerations:
239265
240266
It turns out that estimating the number of distinct values is a difficult
241267
problem. In [1] it is shown that for any estimator based on random sampling
@@ -271,6 +297,11 @@ static ha_rows FindBucketMaxValues(const Value_map<T> &value_map,
271297
272298
[2] Haas, Peter J., et al. "Sampling-based estimation of the number of
273299
distinct values of an attribute." VLDB. Vol. 95. 1995.
300+
301+
[3] Chaudhuri, Surajit, Gautam Das, and Utkarsh Srivastava. "Effective use of
302+
block-level sampling in statistics estimation." Proceedings of the 2004 ACM
303+
SIGMOD international conference on Management of data. 2004.
304+
274305
*/
275306
static ha_rows EstimateDistinctValues(double sampling_rate,
276307
ha_rows bucket_distinct_values,

sql/histograms/histogram.cc

+47
Original file line numberDiff line numberDiff line change
@@ -1574,6 +1574,53 @@ bool Histogram::get_selectivity_dispatcher(Item *item, const enum_operator op,
15741574

15751575
bool Histogram::get_selectivity(Item **items, size_t item_count,
15761576
enum_operator op, double *selectivity) const {
1577+
if (get_raw_selectivity(items, item_count, op, selectivity)) return true;
1578+
1579+
/*
1580+
We return a selectivity of at least 0.001 in order to avoid returning very
1581+
low estimates in the following cases:
1582+
1583+
1) We miss a value or underestimate its frequency during sampling. With our
1584+
current histogram format this causes "holes" between buckets where we
1585+
estimate a selectivity of zero.
1586+
1587+
2) We miss a range of values. With our format we are particularly vulnerable
1588+
around the min and max of the distribution as the sampled min is likely
1589+
greater than the true min and the sampled max likely smaller than the
1590+
true max.
1591+
1592+
3) Within-bucket heuristics produce very low estimates. This can for example
1593+
happen for range-queries within a bucket. Another example is if we have
1594+
many infrequent values and one highly frequent value in a bucket.
1595+
1596+
4) The histogram has gone stale. While the usual assumption is that the
1597+
value distribution remains nearly constant this assumption fails in some
1598+
common use cases. Consider for example a date column where the current
1599+
date is inserted.
1600+
1601+
The reason for the choice of 0.001 for the lower bound is that we typically
1602+
sample fewer than 1000 pages with the default settings. With a sample of
1603+
1000 pages the probability of missing a value or range of values with a
1604+
selectivity of 0.001 is around 1/e (~0.368) as the size of the table goes to
1605+
infinity in the worst case when the values of interest are concentrated on
1606+
few pages.
1607+
1608+
The cost of using a minimum selectivity of 0.001 is that we may sometimes
1609+
over-estimate the selectivity. For very large tables 0.1% of the rows is
1610+
still a lot in absolute terms -- 1000 rows for a table with 1 million rows,
1611+
and 1 million rows for a table with 1 billion rows.
1612+
1613+
We could improve this estimate by considering the actual number of pages
1614+
sampled when the histogram was constructed.
1615+
*/
1616+
const double minimum_selectivity = 0.001;
1617+
*selectivity = std::max(*selectivity, minimum_selectivity);
1618+
return false;
1619+
}
1620+
1621+
bool Histogram::get_raw_selectivity(Item **items, size_t item_count,
1622+
enum_operator op,
1623+
double *selectivity) const {
15771624
// Do some sanity checking first
15781625
switch (op) {
15791626
case enum_operator::EQUALS_TO:

sql/histograms/histogram.h

+7
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,13 @@ class Histogram {
276276
/// Name of the column this histogram represents.
277277
LEX_CSTRING m_column_name;
278278

279+
/**
280+
An internal function for getting a selectivity estimate prior to adjustment.
281+
@see get_selectivity() for details.
282+
*/
283+
bool get_raw_selectivity(Item **items, size_t item_count, enum_operator op,
284+
double *selectivity) const;
285+
279286
/**
280287
An internal function for getting the selectivity estimation.
281288

0 commit comments

Comments
 (0)