From f2b8f121c60e855eaadade8e85ce71c93e1d8e8a Mon Sep 17 00:00:00 2001 From: Nils Dijk Date: Wed, 10 Jun 2020 18:24:34 +0200 Subject: [PATCH 1/3] initial fix for in/out --- tdigest.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tdigest.c b/tdigest.c index e190555..c27b4d5 100644 --- a/tdigest.c +++ b/tdigest.c @@ -1689,6 +1689,7 @@ tdigest_in(PG_FUNCTION_ARGS) tdigest_t *digest = NULL; /* t-digest header fields */ + int32 flags; int64 count; int compression; int ncentroids; @@ -1697,10 +1698,10 @@ tdigest_in(PG_FUNCTION_ARGS) centroids = palloc(strlen(str)); - r = sscanf(str, "count %ld compression %d centroids %d%s", - &count, &compression, &ncentroids, centroids); + r = sscanf(str, "flags %d count %ld compression %d centroids %d%s", + &flags, &count, &compression, &ncentroids, centroids); - if (r != 4) + if (r != 5) elog(ERROR, "failed to parse t-digest value"); if ((compression < 10) || (compression > 10000)) @@ -1725,11 +1726,12 @@ tdigest_in(PG_FUNCTION_ARGS) digest = tdigest_allocate(ncentroids); + digest->flags = flags; digest->count = count; digest->ncentroids = ncentroids; digest->compression = compression; - ptr = centroids; + ptr = strchr(str, '(') - 1; for (i = 0; i < digest->ncentroids; i++) { @@ -1750,7 +1752,7 @@ tdigest_in(PG_FUNCTION_ARGS) ptr = strchr(ptr, ')') + 1; } - Assert(ptr == centroids + strlen(centroids)); + Assert(ptr == str + strlen(str)); pfree(centroids); @@ -1803,7 +1805,7 @@ tdigest_recv(PG_FUNCTION_ARGS) if (flags != 0) elog(ERROR, "unsupported t-digest on-disk format"); - count = pq_getmsgint(buf, sizeof(int64)); + count = pq_getmsgint64(buf); compression = pq_getmsgint(buf, sizeof(int32)); ncentroids = pq_getmsgint(buf, sizeof(int32)); @@ -1817,7 +1819,7 @@ tdigest_recv(PG_FUNCTION_ARGS) for (i = 0; i < digest->ncentroids; i++) { digest->centroids[i].sum = pq_getmsgfloat8(buf); - digest->centroids[i].count = pq_getmsgint(buf, sizeof(int64)); + digest->centroids[i].count = pq_getmsgint64(buf); } PG_RETURN_POINTER(digest); From e00c9d7538db1e343c9d0361df7937cea8c831ac Mon Sep 17 00:00:00 2001 From: Nils Dijk Date: Wed, 10 Jun 2020 18:47:09 +0200 Subject: [PATCH 2/3] use %n --- tdigest.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tdigest.c b/tdigest.c index c27b4d5..9809f45 100644 --- a/tdigest.c +++ b/tdigest.c @@ -1693,15 +1693,13 @@ tdigest_in(PG_FUNCTION_ARGS) int64 count; int compression; int ncentroids; - char *centroids; + int header_length; char *ptr; - centroids = palloc(strlen(str)); + r = sscanf(str, "flags %d count %ld compression %d centroids %d%n", + &flags, &count, &compression, &ncentroids, &header_length); - r = sscanf(str, "flags %d count %ld compression %d centroids %d%s", - &flags, &count, &compression, &ncentroids, centroids); - - if (r != 5) + if (r != 4) elog(ERROR, "failed to parse t-digest value"); if ((compression < 10) || (compression > 10000)) @@ -1731,7 +1729,7 @@ tdigest_in(PG_FUNCTION_ARGS) digest->ncentroids = ncentroids; digest->compression = compression; - ptr = strchr(str, '(') - 1; + ptr = str + header_length; for (i = 0; i < digest->ncentroids; i++) { @@ -1754,8 +1752,6 @@ tdigest_in(PG_FUNCTION_ARGS) Assert(ptr == str + strlen(str)); - pfree(centroids); - AssertCheckTDigest(digest); PG_RETURN_POINTER(digest); From fe2dabeee390f3344dd2d1b7101e5e3c9ce4fb3d Mon Sep 17 00:00:00 2001 From: Nils Dijk Date: Wed, 10 Jun 2020 19:24:50 +0200 Subject: [PATCH 3/3] add tests for text and table conversions --- test/expected/tdigest.out | 62 +++++++++++++++++++++++++++++++++++++++ test/sql/tdigest.sql | 46 +++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) diff --git a/test/expected/tdigest.out b/test/expected/tdigest.out index 7b05e74..87e36fe 100644 --- a/test/expected/tdigest.out +++ b/test/expected/tdigest.out @@ -1196,3 +1196,65 @@ SELECT * FROM ( ---+---+--- (0 rows) +-- some basic tests to verify transforming from and to text work +-- 10 centroids (tiny) +WITH data AS (SELECT i / 1000000.0 AS x FROM generate_series(1,1000000) s(i)), + intermediate AS (SELECT tdigest(x, 10)::text AS intermediate_x FROM data), + tdigest_parsed AS (SELECT tdigest_percentile(intermediate_x::tdigest, ARRAY[0.01, 0.05, 0.1, 0.9, 0.95, 0.99]) AS a FROM intermediate), + pg_percentile AS (SELECT percentile_cont(ARRAY[0.01, 0.05, 0.1, 0.9, 0.95, 0.99]) WITHIN GROUP (ORDER BY x) AS b FROM data) +SELECT + p, + abs(a - b) < 0.01, -- arbitrary threshold of 1% + (CASE WHEN abs(a - b) < 0.01 THEN NULL ELSE (a - b) END) AS err +FROM ( + SELECT + unnest(ARRAY[0.01, 0.05, 0.1, 0.9, 0.95, 0.99]) AS p, + unnest(a) AS a, + unnest(b) AS b + FROM tdigest_parsed, + pg_percentile +) foo; + p | ?column? | err +------+----------+----- + 0.01 | t | + 0.05 | t | + 0.1 | t | + 0.9 | t | + 0.95 | t | + 0.99 | t | +(6 rows) + +-- verify we can store tdigest in a summary table +CREATE TABLE intermediate_tdigest (grouping int, summary tdigest); +WITH data AS (SELECT row_number() OVER () AS i, pow(z, 4) AS x FROM random_normal(1000000) s(z)) +INSERT INTO intermediate_tdigest +SELECT + i % 10 AS grouping, + tdigest(x, 100) AS summary +FROM data +GROUP BY i % 10; +WITH data AS (SELECT pow(z, 4) AS x FROM random_normal(1000000) s(z)), + intermediate AS (SELECT tdigest_percentile(summary, ARRAY[0.01, 0.05, 0.1, 0.9, 0.95, 0.99]) AS a FROM intermediate_tdigest), + pg_percentile AS (SELECT percentile_cont(ARRAY[0.01, 0.05, 0.1, 0.9, 0.95, 0.99]) WITHIN GROUP (ORDER BY x) AS b FROM data) +SELECT + p, + abs(a - b) < 0.01, -- arbitrary threshold of 1% + (CASE WHEN abs(a - b) < 0.01 THEN NULL ELSE (a - b) END) AS err +FROM ( + SELECT + unnest(ARRAY[0.01, 0.05, 0.1, 0.9, 0.95, 0.99]) AS p, + unnest(a) AS a, + unnest(b) AS b + FROM intermediate, + pg_percentile +) foo; + p | ?column? | err +------+----------+----- + 0.01 | t | + 0.05 | t | + 0.1 | t | + 0.9 | t | + 0.95 | t | + 0.99 | t | +(6 rows) + diff --git a/test/sql/tdigest.sql b/test/sql/tdigest.sql index e2f0765..4d79c65 100644 --- a/test/sql/tdigest.sql +++ b/test/sql/tdigest.sql @@ -892,3 +892,49 @@ SELECT * FROM ( unnest(tdigest_percentile(x, 1000, (SELECT p FROM perc))) AS a FROM data ) foo ) bar WHERE a <= b; + +-- some basic tests to verify transforming from and to text work +-- 10 centroids (tiny) +WITH data AS (SELECT i / 1000000.0 AS x FROM generate_series(1,1000000) s(i)), + intermediate AS (SELECT tdigest(x, 10)::text AS intermediate_x FROM data), + tdigest_parsed AS (SELECT tdigest_percentile(intermediate_x::tdigest, ARRAY[0.01, 0.05, 0.1, 0.9, 0.95, 0.99]) AS a FROM intermediate), + pg_percentile AS (SELECT percentile_cont(ARRAY[0.01, 0.05, 0.1, 0.9, 0.95, 0.99]) WITHIN GROUP (ORDER BY x) AS b FROM data) +SELECT + p, + abs(a - b) < 0.01, -- arbitrary threshold of 1% + (CASE WHEN abs(a - b) < 0.01 THEN NULL ELSE (a - b) END) AS err +FROM ( + SELECT + unnest(ARRAY[0.01, 0.05, 0.1, 0.9, 0.95, 0.99]) AS p, + unnest(a) AS a, + unnest(b) AS b + FROM tdigest_parsed, + pg_percentile +) foo; + +-- verify we can store tdigest in a summary table +CREATE TABLE intermediate_tdigest (grouping int, summary tdigest); + +WITH data AS (SELECT row_number() OVER () AS i, pow(z, 4) AS x FROM random_normal(1000000) s(z)) +INSERT INTO intermediate_tdigest +SELECT + i % 10 AS grouping, + tdigest(x, 100) AS summary +FROM data +GROUP BY i % 10; + +WITH data AS (SELECT pow(z, 4) AS x FROM random_normal(1000000) s(z)), + intermediate AS (SELECT tdigest_percentile(summary, ARRAY[0.01, 0.05, 0.1, 0.9, 0.95, 0.99]) AS a FROM intermediate_tdigest), + pg_percentile AS (SELECT percentile_cont(ARRAY[0.01, 0.05, 0.1, 0.9, 0.95, 0.99]) WITHIN GROUP (ORDER BY x) AS b FROM data) +SELECT + p, + abs(a - b) < 0.01, -- arbitrary threshold of 1% + (CASE WHEN abs(a - b) < 0.01 THEN NULL ELSE (a - b) END) AS err +FROM ( + SELECT + unnest(ARRAY[0.01, 0.05, 0.1, 0.9, 0.95, 0.99]) AS p, + unnest(a) AS a, + unnest(b) AS b + FROM intermediate, + pg_percentile +) foo;