Skip to content

Commit 1d88401

Browse files
Tobias Christiani, Jan Wedvik
Tobias Christiani
authored and
Jan Wedvik
committed
Bug#33935417: Histograms cause zero row estimates for values outside histogram buckets
Histograms in MySQL return a selectivity estimate of zero for values that are outside of buckets. Values can be missing from the histogram because they were missed during sampling, or because the histogram has grown stale. This patch introduces a constant lower bound of 0.001 on the selectivity estimates produced by histograms. This choice of lower bound corresponds to the selectivity of a value/range that we are likely to miss during sampling. Using a constant lower bound rather than a statistical estimate for the selectivity of a missing value has the advantage of simplicity and predictability. It also provides some protection against underestimating the selectivity due to stale histograms and within-bucket heuristics. Change-Id: I94dceaf65995fce618abd01bc9ee80c1ffac677a Signed-off-by: Jan Wedvik <jan.wedvik@oracle.com>
1 parent f40c374 commit 1d88401

File tree

5 files changed

+216
-1
lines changed

5 files changed

+216
-1
lines changed

mysql-test/r/histograms.result

+83
Original file line numberDiff line numberDiff line change
@@ -3888,4 +3888,87 @@ Table Op Msg_type Msg_text
38883888
test.t1 histogram Error Lock wait timeout exceeded; try restarting transaction
38893889
UNLOCK INSTANCE;
38903890
DROP TABLE t1;
3891+
#
3892+
# Bug#33935417 Histograms cause zero row estimates for values outside
3893+
# histogram buckets
3894+
#
3895+
CREATE TABLE ten (x INT);
3896+
INSERT INTO ten VALUES (0), (1), (2), (3), (4), (5), (6), (7), (8), (9);
3897+
CREATE TABLE hundred (x INT);
3898+
INSERT INTO hundred SELECT 10*ten1.x + ten0.x AS v
3899+
FROM ten AS ten1, ten AS ten0 ORDER BY v;
3900+
CREATE TABLE ten_thousand (x INT);
3901+
INSERT INTO ten_thousand SELECT 100*h1.x + h0.x AS v
3902+
FROM hundred AS h1, hundred AS h0 ORDER BY v;
3903+
EXPLAIN SELECT * FROM ten WHERE x = -1;
3904+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3905+
1 SIMPLE ten NULL ALL NULL NULL NULL NULL 10 10.00 Using where
3906+
Warnings:
3907+
Note 1003 /* select#1 */ select `test`.`ten`.`x` AS `x` from `test`.`ten` where (`test`.`ten`.`x` = <cache>(-(1)))
3908+
EXPLAIN SELECT * FROM hundred WHERE x = -1;
3909+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3910+
1 SIMPLE hundred NULL ALL NULL NULL NULL NULL 100 10.00 Using where
3911+
Warnings:
3912+
Note 1003 /* select#1 */ select `test`.`hundred`.`x` AS `x` from `test`.`hundred` where (`test`.`hundred`.`x` = <cache>(-(1)))
3913+
EXPLAIN SELECT * FROM ten_thousand WHERE x = -1;
3914+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3915+
1 SIMPLE ten_thousand NULL ALL NULL NULL NULL NULL 10000 10.00 Using where
3916+
Warnings:
3917+
Note 1003 /* select#1 */ select `test`.`ten_thousand`.`x` AS `x` from `test`.`ten_thousand` where (`test`.`ten_thousand`.`x` = <cache>(-(1)))
3918+
ANALYZE TABLE ten UPDATE HISTOGRAM ON x;
3919+
Table Op Msg_type Msg_text
3920+
test.ten histogram status Histogram statistics created for column 'x'.
3921+
ANALYZE TABLE ten;
3922+
Table Op Msg_type Msg_text
3923+
test.ten analyze status OK
3924+
ANALYZE TABLE hundred UPDATE HISTOGRAM ON x;
3925+
Table Op Msg_type Msg_text
3926+
test.hundred histogram status Histogram statistics created for column 'x'.
3927+
ANALYZE TABLE hundred;
3928+
Table Op Msg_type Msg_text
3929+
test.hundred analyze status OK
3930+
ANALYZE TABLE ten_thousand UPDATE HISTOGRAM ON x;
3931+
Table Op Msg_type Msg_text
3932+
test.ten_thousand histogram status Histogram statistics created for column 'x'.
3933+
ANALYZE TABLE ten_thousand;
3934+
Table Op Msg_type Msg_text
3935+
test.ten_thousand analyze status OK
3936+
EXPLAIN SELECT * FROM ten WHERE x = -1;
3937+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3938+
1 SIMPLE ten NULL ALL NULL NULL NULL NULL 10 10.00 Using where
3939+
Warnings:
3940+
Note 1003 /* select#1 */ select `test`.`ten`.`x` AS `x` from `test`.`ten` where (`test`.`ten`.`x` = <cache>(-(1)))
3941+
EXPLAIN SELECT * FROM hundred WHERE x = -1;
3942+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3943+
1 SIMPLE hundred NULL ALL NULL NULL NULL NULL 100 1.00 Using where
3944+
Warnings:
3945+
Note 1003 /* select#1 */ select `test`.`hundred`.`x` AS `x` from `test`.`hundred` where (`test`.`hundred`.`x` = <cache>(-(1)))
3946+
EXPLAIN SELECT * FROM ten_thousand WHERE x = -1;
3947+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3948+
1 SIMPLE ten_thousand NULL ALL NULL NULL NULL NULL 9980 0.10 Using where
3949+
Warnings:
3950+
Note 1003 /* select#1 */ select `test`.`ten_thousand`.`x` AS `x` from `test`.`ten_thousand` where (`test`.`ten_thousand`.`x` = <cache>(-(1)))
3951+
EXPLAIN SELECT * FROM ten_thousand WHERE x < -1;
3952+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3953+
1 SIMPLE ten_thousand NULL ALL NULL NULL NULL NULL 9980 0.10 Using where
3954+
Warnings:
3955+
Note 1003 /* select#1 */ select `test`.`ten_thousand`.`x` AS `x` from `test`.`ten_thousand` where (`test`.`ten_thousand`.`x` < <cache>(-(1)))
3956+
EXPLAIN SELECT * FROM ten_thousand WHERE x BETWEEN -100 AND -1;
3957+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3958+
1 SIMPLE ten_thousand NULL ALL NULL NULL NULL NULL 9980 0.10 Using where
3959+
Warnings:
3960+
Note 1003 /* select#1 */ select `test`.`ten_thousand`.`x` AS `x` from `test`.`ten_thousand` where (`test`.`ten_thousand`.`x` between <cache>(-(100)) and <cache>(-(1)))
3961+
EXPLAIN SELECT * FROM ten_thousand WHERE x IN (-2, -1);
3962+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3963+
1 SIMPLE ten_thousand NULL ALL NULL NULL NULL NULL 9980 0.10 Using where
3964+
Warnings:
3965+
Note 1003 /* select#1 */ select `test`.`ten_thousand`.`x` AS `x` from `test`.`ten_thousand` where (`test`.`ten_thousand`.`x` in (<cache>(-(2)),<cache>(-(1))))
3966+
EXPLAIN SELECT * FROM ten_thousand WHERE x IN (1, 2, 3);
3967+
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
3968+
1 SIMPLE ten_thousand NULL ALL NULL NULL NULL NULL 9980 0.10 Using where
3969+
Warnings:
3970+
Note 1003 /* select#1 */ select `test`.`ten_thousand`.`x` AS `x` from `test`.`ten_thousand` where (`test`.`ten_thousand`.`x` in (1,2,3))
3971+
DROP TABLE ten;
3972+
DROP TABLE hundred;
3973+
DROP TABLE ten_thousand;
38913974
# restart:

mysql-test/t/histograms.test

+47
Original file line numberDiff line numberDiff line change
@@ -2249,6 +2249,53 @@ UNLOCK INSTANCE;
22492249
DROP TABLE t1;
22502250
--disconnect con1
22512251

2252+
-- echo #
2253+
-- echo # Bug#33935417 Histograms cause zero row estimates for values outside
2254+
-- echo # histogram buckets
2255+
-- echo #
2256+
2257+
# Verify that selectivity estimates are lower bounded by 0.001.
2258+
CREATE TABLE ten (x INT);
2259+
INSERT INTO ten VALUES (0), (1), (2), (3), (4), (5), (6), (7), (8), (9);
2260+
2261+
CREATE TABLE hundred (x INT);
2262+
INSERT INTO hundred SELECT 10*ten1.x + ten0.x AS v
2263+
FROM ten AS ten1, ten AS ten0 ORDER BY v;
2264+
2265+
CREATE TABLE ten_thousand (x INT);
2266+
INSERT INTO ten_thousand SELECT 100*h1.x + h0.x AS v
2267+
FROM hundred AS h1, hundred AS h0 ORDER BY v;
2268+
2269+
# The default selectivity used for equality predicates is 0.1.
2270+
EXPLAIN SELECT * FROM ten WHERE x = -1;
2271+
EXPLAIN SELECT * FROM hundred WHERE x = -1;
2272+
EXPLAIN SELECT * FROM ten_thousand WHERE x = -1;
2273+
2274+
# Build histograms on all tables and ensure statistics are up to date.
2275+
ANALYZE TABLE ten UPDATE HISTOGRAM ON x;
2276+
ANALYZE TABLE ten;
2277+
ANALYZE TABLE hundred UPDATE HISTOGRAM ON x;
2278+
ANALYZE TABLE hundred;
2279+
ANALYZE TABLE ten_thousand UPDATE HISTOGRAM ON x;
2280+
ANALYZE TABLE ten_thousand;
2281+
2282+
# The old optimizer uses a selectivity estimate of max(1/#rows, histogram_estimate).
2283+
# At 10k rows we should see the difference with the new lower bound of 1/1000
2284+
# being used instead of 1/10000.
2285+
EXPLAIN SELECT * FROM ten WHERE x = -1;
2286+
EXPLAIN SELECT * FROM hundred WHERE x = -1;
2287+
EXPLAIN SELECT * FROM ten_thousand WHERE x = -1;
2288+
2289+
# The lower bound is used for all predicates supported by the histogram.
2290+
EXPLAIN SELECT * FROM ten_thousand WHERE x < -1;
2291+
EXPLAIN SELECT * FROM ten_thousand WHERE x BETWEEN -100 AND -1;
2292+
EXPLAIN SELECT * FROM ten_thousand WHERE x IN (-2, -1);
2293+
EXPLAIN SELECT * FROM ten_thousand WHERE x IN (1, 2, 3);
2294+
2295+
DROP TABLE ten;
2296+
DROP TABLE hundred;
2297+
DROP TABLE ten_thousand;
2298+
22522299
# Run a restart without any special parameters, which causes "check testcase" to
22532300
# be run. Always keep this at the very end of the test!
22542301
let $restart_parameters = restart:;

sql/histograms/equi_height.cc

+32-1
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,33 @@ static ha_rows FindBucketMaxValues(const Value_map<T> &value_map,
235235
low frequency values can be between u and (1/s)*u. In order to minimize the
236236
worst-case relative error, we use the geometric mean of these two values.
237237
238-
Further considerations:
238+
Important note:
239+
240+
This estimator was designed for uniform random sampling. We currently use
241+
page-level sampling for histograms. This can cause us to underestimate the
242+
number of distinct values by nearly a factor 1/s in the worst case. The
243+
reason is that we only scale up the number of singleton values.
244+
With page-level sampling we can have pairs of distinct values occurring
245+
together so that we will have u=0 in the formula above.
246+
247+
For now, we opt to keep the formula as it is, since we would rather
248+
underestimate than overestimate the number of distinct values. Potential
249+
solutions:
250+
251+
1) Use a custom estimator for page-level sampling [3]. This requires changes
252+
to the sampling interface to InnoDB to support counting the number of pages
253+
a value appears in.
254+
255+
2) Use the simpler estimate of sqrt(1/s)*d, the geometric mean between the
256+
lower bound of d and the upper bound of d/s. This has the downside of
257+
overestimating the number of distinct values by sqrt(1/s) in cases where
258+
the table only contains heavy hitters.
259+
260+
3) Simulate uniform random sampling on top of the page-level sampling.
261+
Postgres does this, but it requires sampling as many pages as the target
262+
number of rows.
263+
264+
Further considerations:
239265
240266
It turns out that estimating the number of distinct values is a difficult
241267
problem. In [1] it is shown that for any estimator based on random sampling
@@ -271,6 +297,11 @@ static ha_rows FindBucketMaxValues(const Value_map<T> &value_map,
271297
272298
[2] Haas, Peter J., et al. "Sampling-based estimation of the number of
273299
distinct values of an attribute." VLDB. Vol. 95. 1995.
300+
301+
[3] Chaudhuri, Surajit, Gautam Das, and Utkarsh Srivastava. "Effective use of
302+
block-level sampling in statistics estimation." Proceedings of the 2004 ACM
303+
SIGMOD international conference on Management of data. 2004.
304+
274305
*/
275306
static ha_rows EstimateDistinctValues(double sampling_rate,
276307
ha_rows bucket_distinct_values,

sql/histograms/histogram.cc

+47
Original file line numberDiff line numberDiff line change
@@ -1574,6 +1574,53 @@ bool Histogram::get_selectivity_dispatcher(Item *item, const enum_operator op,
15741574

15751575
bool Histogram::get_selectivity(Item **items, size_t item_count,
15761576
enum_operator op, double *selectivity) const {
1577+
if (get_raw_selectivity(items, item_count, op, selectivity)) return true;
1578+
1579+
/*
1580+
We return a selectivity of at least 0.001 in order to avoid returning very
1581+
low estimates in the following cases:
1582+
1583+
1) We miss a value or underestimate its frequency during sampling. With our
1584+
current histogram format this causes "holes" between buckets where we
1585+
estimate a selectivity of zero.
1586+
1587+
2) We miss a range of values. With our format we are particularly vulnerable
1588+
around the min and max of the distribution as the sampled min is likely
1589+
greater than the true min and the sampled max likely smaller than the
1590+
true max.
1591+
1592+
3) Within-bucket heuristics produce very low estimates. This can for example
1593+
happen for range-queries within a bucket. Another example is if we have
1594+
many infrequent values and one highly frequent value in a bucket.
1595+
1596+
4) The histogram has gone stale. While the usual assumption is that the
1597+
value distribution remains nearly constant this assumption fails in some
1598+
common use cases. Consider for example a date column where the current
1599+
date is inserted.
1600+
1601+
The reason for the choice of 0.001 for the lower bound is that we typically
1602+
sample fewer than 1000 pages with the default settings. With a sample of
1603+
1000 pages the probability of missing a value or range of values with a
1604+
selectivity of 0.001 is around 1/e (~0.368) as the size of the table goes to
1605+
infinity in the worst case when the values of interest are concentrated on
1606+
few pages.
1607+
1608+
The cost of using a minimum selectivity of 0.001 is that we may sometimes
1609+
over-estimate the selectivity. For very large tables 0.1% of the rows is
1610+
still a lot in absolute terms -- 1000 rows for a table with 1 million rows,
1611+
and 1 million rows for a table with 1 billion rows.
1612+
1613+
We could improve this estimate by considering the actual number of pages
1614+
sampled when the histogram was constructed.
1615+
*/
1616+
const double minimum_selectivity = 0.001;
1617+
*selectivity = std::max(*selectivity, minimum_selectivity);
1618+
return false;
1619+
}
1620+
1621+
bool Histogram::get_raw_selectivity(Item **items, size_t item_count,
1622+
enum_operator op,
1623+
double *selectivity) const {
15771624
// Do some sanity checking first
15781625
switch (op) {
15791626
case enum_operator::EQUALS_TO:

sql/histograms/histogram.h

+7
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,13 @@ class Histogram {
276276
/// Name of the column this histogram represents.
277277
LEX_CSTRING m_column_name;
278278

279+
/**
280+
An internal function for getting a selectivity estimate prior to adjustment.
281+
@see get_selectivity() for details.
282+
*/
283+
bool get_raw_selectivity(Item **items, size_t item_count, enum_operator op,
284+
double *selectivity) const;
285+
279286
/**
280287
An internal function for getting the selectivity estimation.
281288

0 commit comments

Comments
 (0)