Use numrows_pre_compression in approx row count
The approximate_row_count function was using the reltuples value from
compressed chunks and multiplying it by 1000, the default batch size.
This led to a huge skew between the actual row count and the
approximate one. We now use the numrows_pre_compression value from the
timescaledb catalog, which accurately represents the number of rows
before compression.
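
For reference, a minimal sketch of the catalog lookup this fix relies on, run
directly against the TimescaleDB catalog; the hypertable name 'metrics' and
schema 'public' are placeholders, not part of this commit:

-- Sum the precompressed row counts recorded for every compressed chunk of a
-- hypothetical hypertable public.metrics, mirroring the query added below.
SELECT COALESCE(SUM(map.numrows_pre_compression), 0) AS compressed_row_count
FROM _timescaledb_catalog.chunk srcch
JOIN _timescaledb_catalog.compression_chunk_size map ON map.chunk_id = srcch.id
JOIN _timescaledb_catalog.hypertable srcht ON srcht.id = srcch.hypertable_id
WHERE srcht.table_name = 'metrics'
  AND srcht.schema_name = 'public';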
nikkhils committed Dec 4, 2023
1 parent ef030d2 commit 293104a
Showing 4 changed files with 131 additions and 20 deletions.
1 change: 1 addition & 0 deletions .unreleased/fix_6365
@@ -0,0 +1 @@
Fixes: #6365 Use numrows_pre_compression in approximate row count
35 changes: 19 additions & 16 deletions sql/size_utils.sql
@@ -453,14 +453,15 @@ BEGIN
IF local_compressed_hypertable_id IS NOT NULL THEN
uncompressed_row_count = _timescaledb_functions.get_approx_row_count(relation);

WITH compressed_hypertable AS (SELECT table_name, schema_name FROM _timescaledb_catalog.hypertable ht
WHERE ht.id = local_compressed_hypertable_id)
SELECT c.oid INTO compressed_hypertable_oid FROM pg_class c
INNER JOIN compressed_hypertable h ON (c.relname = h.table_name)
INNER JOIN pg_namespace n ON (n.nspname = h.schema_name);

compressed_row_count = _timescaledb_functions.get_approx_row_count(compressed_hypertable_oid);
RETURN (uncompressed_row_count + (compressed_row_count * max_compressed_row_count));
-- use the compression_chunk_size stats to fetch precompressed num rows
SELECT COALESCE(SUM(numrows_pre_compression), 0) FROM _timescaledb_catalog.chunk srcch,
_timescaledb_catalog.compression_chunk_size map, _timescaledb_catalog.hypertable srcht
INTO compressed_row_count
WHERE map.chunk_id = srcch.id
AND srcht.id = srcch.hypertable_id AND srcht.table_name = local_table_name
AND srcht.schema_name = local_schema_name;

RETURN (uncompressed_row_count + compressed_row_count);
ELSE
uncompressed_row_count = _timescaledb_functions.get_approx_row_count(relation);
RETURN uncompressed_row_count;
@@ -477,22 +478,24 @@ BEGIN
-- 'input is chunk #1';
IF is_compressed_chunk IS NULL AND local_compressed_chunk_id IS NOT NULL THEN
-- 'Include both uncompressed and compressed chunk #2';
WITH compressed_ns_oid AS ( SELECT table_name, oid FROM _timescaledb_catalog.chunk ch INNER JOIN pg_namespace ns ON
(ch.id = local_compressed_chunk_id and ch.schema_name = ns.nspname))
SELECT c.oid FROM pg_class c INNER JOIN compressed_ns_oid
ON ( c.relnamespace = compressed_ns_oid.oid AND c.relname = compressed_ns_oid.table_name)
INTO local_compressed_chunk_oid;
-- use the compression_chunk_size stats to fetch precompressed num rows
SELECT COALESCE(numrows_pre_compression, 0) FROM _timescaledb_catalog.compression_chunk_size
INTO compressed_row_count
WHERE compressed_chunk_id = local_compressed_chunk_id;

uncompressed_row_count = _timescaledb_functions.get_approx_row_count(relation);
compressed_row_count = _timescaledb_functions.get_approx_row_count(local_compressed_chunk_oid);
RETURN uncompressed_row_count + (compressed_row_count * max_compressed_row_count);
RETURN (uncompressed_row_count + compressed_row_count);
ELSIF is_compressed_chunk IS NULL AND local_compressed_chunk_id IS NULL THEN
-- 'input relation is uncompressed chunk #3';
uncompressed_row_count = _timescaledb_functions.get_approx_row_count(relation);
RETURN uncompressed_row_count;
ELSE
-- 'compressed chunk only #4';
compressed_row_count = _timescaledb_functions.get_approx_row_count(relation) * max_compressed_row_count;
-- use the compression_chunk_size stats to fetch precompressed num rows
SELECT COALESCE(SUM(numrows_pre_compression), 0) FROM _timescaledb_catalog.chunk srcch,
_timescaledb_catalog.compression_chunk_size map INTO compressed_row_count
WHERE map.compressed_chunk_id = srcch.id
AND srcch.table_name = local_table_name AND srcch.schema_name = local_schema_name;
RETURN compressed_row_count;
END IF;
END IF;
82 changes: 80 additions & 2 deletions tsl/test/expected/compression.out
@@ -39,6 +39,20 @@ insert into foo values( 3 , 16 , 20, NULL);
insert into foo values( 10 , 10 , 20, NULL);
insert into foo values( 20 , 11 , 20, NULL);
insert into foo values( 30 , 12 , 20, NULL);
analyze foo;
-- check that approximate_row_count works with a regular table
SELECT approximate_row_count('foo');
approximate_row_count
-----------------------
4
(1 row)

SELECT count(*) from foo;
count
-------
4
(1 row)

alter table foo set (timescaledb.compress, timescaledb.compress_segmentby = 'a,b', timescaledb.compress_orderby = 'c desc, d asc nulls last');
--test self-referencing updates
SET timescaledb.enable_transparent_decompression to ON;
@@ -1259,6 +1273,7 @@ SELECT * FROM pg_stats WHERE tablename = :statchunk;
(0 rows)

ALTER TABLE stattest SET (timescaledb.compress);
-- check that approximate_row_count works with all normal chunks
SELECT approximate_row_count('stattest');
approximate_row_count
-----------------------
@@ -1271,10 +1286,18 @@ SELECT compress_chunk(c) FROM show_chunks('stattest') c;
_timescaledb_internal._hyper_27_55_chunk
(1 row)

-- check that approximate_row_count works with all compressed chunks
SELECT approximate_row_count('stattest');
approximate_row_count
-----------------------
0
26
(1 row)

-- actual count should match with the above
SELECT count(*) from stattest;
count
-------
26
(1 row)

-- Uncompressed chunk table is empty since we just compressed the chunk and moved everything to compressed chunk table.
@@ -1330,6 +1353,13 @@ SELECT relpages, reltuples FROM pg_class WHERE relname = :'STAT_COMP_CHUNK_NAME';
1 | 1
(1 row)

-- verify that approximate_row_count works fine on a chunk with compressed data
SELECT approximate_row_count('_timescaledb_internal.' || :'STAT_COMP_CHUNK_NAME');
approximate_row_count
-----------------------
26
(1 row)

-- Verify partial chunk stats are handled correctly when analyzing
-- for both uncompressed and compressed chunk tables
INSERT INTO stattest SELECT '2020/02/20 01:00'::TIMESTAMPTZ + ('1 hour'::interval * v), 250 * v FROM generate_series(25,50) v;
@@ -1362,6 +1392,28 @@ SELECT relpages, reltuples FROM pg_class WHERE relname = :'STAT_COMP_CHUNK_NAME';
1 | 1
(1 row)

-- verify that approximate_row_count works fine on a chunk with a mix of uncompressed
-- and compressed data
SELECT table_name as "STAT_CHUNK_NAME" from temptable \gset
SELECT approximate_row_count('_timescaledb_internal.' || :'STAT_CHUNK_NAME');
approximate_row_count
-----------------------
52
(1 row)

-- should match with the result via the hypertable post in-memory decompression
SELECT count(*) from stattest;
count
-------
52
(1 row)

SELECT count(*) from show_chunks('stattest');
count
-------
1
(1 row)

-- Verify that decompressing the chunk restores autoanalyze to the hypertable's setting
SELECT reloptions FROM pg_class WHERE relname = :statchunk;
reloptions
@@ -1423,7 +1475,6 @@ SELECT relpages, reltuples FROM pg_class WHERE relname = :statchunk;
1 | 52
(1 row)

DROP TABLE stattest;
--- Test that analyze on compression internal table updates stats on original chunks
CREATE TABLE stattest2(time TIMESTAMPTZ NOT NULL, c1 int, c2 int);
SELECT create_hypertable('stattest2', 'time', chunk_time_interval=>'1 day'::interval);
@@ -2626,3 +2677,30 @@ SELECT * FROM sensor_data_compressed ORDER BY time DESC LIMIT 5;
(70 rows)

RESET timescaledb.enable_decompression_sorted_merge;
-- create another chunk
INSERT INTO stattest SELECT '2021/02/20 01:00'::TIMESTAMPTZ + ('1 hour'::interval * v), 250 * v FROM generate_series(125,140) v;
ANALYZE stattest;
SELECT count(*) from show_chunks('stattest');
count
-------
2
(1 row)

SELECT table_name INTO TEMPORARY temptable FROM _timescaledb_catalog.chunk WHERE hypertable_id = (SELECT id FROM _timescaledb_catalog.hypertable WHERE table_name = 'stattest') ORDER BY creation_time desc limit 1;
SELECT table_name as "STAT_CHUNK2_NAME" FROM temptable \gset
-- verify that approximate_row_count works ok on normal chunks
SELECT approximate_row_count('_timescaledb_internal.' || :'STAT_CHUNK2_NAME');
approximate_row_count
-----------------------
16
(1 row)

-- verify that approximate_row_count works fine on a hypertable with a mix of uncompressed
-- and compressed data
SELECT approximate_row_count('stattest');
approximate_row_count
-----------------------
68
(1 row)

DROP TABLE stattest;
33 changes: 31 additions & 2 deletions tsl/test/sql/compression.sql
@@ -19,6 +19,10 @@ insert into foo values( 3 , 16 , 20, NULL);
insert into foo values( 10 , 10 , 20, NULL);
insert into foo values( 20 , 11 , 20, NULL);
insert into foo values( 30 , 12 , 20, NULL);
analyze foo;
-- check that approximate_row_count works with a regular table
SELECT approximate_row_count('foo');
SELECT count(*) from foo;

alter table foo set (timescaledb.compress, timescaledb.compress_segmentby = 'a,b', timescaledb.compress_orderby = 'c desc, d asc nulls last');

@@ -538,9 +542,13 @@ SELECT table_name INTO TEMPORARY temptable FROM _timescaledb_catalog.chunk WHERE
SELECT * FROM pg_stats WHERE tablename = :statchunk;

ALTER TABLE stattest SET (timescaledb.compress);
-- check that approximate_row_count works with all normal chunks
SELECT approximate_row_count('stattest');
SELECT compress_chunk(c) FROM show_chunks('stattest') c;
-- check that approximate_row_count works with all compressed chunks
SELECT approximate_row_count('stattest');
-- actual count should match with the above
SELECT count(*) from stattest;
-- Uncompressed chunk table is empty since we just compressed the chunk and moved everything to compressed chunk table.
-- reltuples is initially -1 on PG14 before VACUUM/ANALYZE was run
SELECT relpages, CASE WHEN reltuples > 0 THEN reltuples ELSE 0 END as reltuples FROM pg_class WHERE relname = :statchunk;
@@ -564,6 +572,8 @@ SELECT relpages, reltuples FROM pg_class WHERE relname = :statchunk;

-- verify that corresponding compressed chunk table stats is updated as well.
SELECT relpages, reltuples FROM pg_class WHERE relname = :'STAT_COMP_CHUNK_NAME';
-- verify that approximate_row_count works fine on a chunk with compressed data
SELECT approximate_row_count('_timescaledb_internal.' || :'STAT_COMP_CHUNK_NAME');

-- Verify partial chunk stats are handled correctly when analyzing
-- for both uncompressed and compressed chunk tables
@@ -577,7 +587,13 @@ SELECT relpages, reltuples FROM pg_class WHERE relname = :statchunk;
-- verify that corresponding compressed chunk table stats have not changed since
-- we didn't compress anything new.
SELECT relpages, reltuples FROM pg_class WHERE relname = :'STAT_COMP_CHUNK_NAME';

-- verify that approximate_row_count works fine on a chunk with a mix of uncompressed
-- and compressed data
SELECT table_name as "STAT_CHUNK_NAME" from temptable \gset
SELECT approximate_row_count('_timescaledb_internal.' || :'STAT_CHUNK_NAME');
-- should match with the result via the hypertable post in-memory decompression
SELECT count(*) from stattest;
SELECT count(*) from show_chunks('stattest');

-- Verify that decompressing the chunk restores autoanalyze to the hypertable's setting
SELECT reloptions FROM pg_class WHERE relname = :statchunk;
@@ -597,7 +613,6 @@ SET client_min_messages TO NOTICE;
SELECT histogram_bounds FROM pg_stats WHERE tablename = :statchunk and attname = 'c1';
SELECT relpages, reltuples FROM pg_class WHERE relname = :statchunk;

DROP TABLE stattest;

--- Test that analyze on compression internal table updates stats on original chunks
CREATE TABLE stattest2(time TIMESTAMPTZ NOT NULL, c1 int, c2 int);
@@ -1052,3 +1067,17 @@ SET timescaledb.enable_decompression_sorted_merge = FALSE;
:PREFIX
SELECT * FROM sensor_data_compressed ORDER BY time DESC LIMIT 5;
RESET timescaledb.enable_decompression_sorted_merge;

-- create another chunk
INSERT INTO stattest SELECT '2021/02/20 01:00'::TIMESTAMPTZ + ('1 hour'::interval * v), 250 * v FROM generate_series(125,140) v;
ANALYZE stattest;
SELECT count(*) from show_chunks('stattest');
SELECT table_name INTO TEMPORARY temptable FROM _timescaledb_catalog.chunk WHERE hypertable_id = (SELECT id FROM _timescaledb_catalog.hypertable WHERE table_name = 'stattest') ORDER BY creation_time desc limit 1;
SELECT table_name as "STAT_CHUNK2_NAME" FROM temptable \gset
-- verify that approximate_row_count works ok on normal chunks
SELECT approximate_row_count('_timescaledb_internal.' || :'STAT_CHUNK2_NAME');
-- verify that approximate_row_count works fine on a hypertable with a mix of uncompressed
-- and compressed data
SELECT approximate_row_count('stattest');

DROP TABLE stattest;
