Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix crash when partializing agg with HAVING #1544

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
185 changes: 133 additions & 52 deletions src/plan_partialize.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,13 @@
#include <postgres.h>
#include <catalog/pg_type.h>
#include <nodes/nodeFuncs.h>
#include <nodes/nodes.h>
#include <nodes/pg_list.h>
#include <nodes/relation.h>
#include <optimizer/planner.h>
#include <optimizer/cost.h>
#include <optimizer/var.h>
#include <optimizer/tlist.h>
#include <parser/parse_func.h>
#include <utils/lsyscache.h>

Expand All @@ -15,15 +21,30 @@
#include "utils.h"

#define TS_PARTIALFN "partialize_agg"

typedef struct PartializeWalkerState
{
bool found_partialize;
bool found_non_partial_agg;
bool looking_for_agg;
Oid fnoid;
} PartializeWalkerState;

/*
* Look for the partialize function in a target list and mark the wrapped
* aggregate as a partial aggregate.
*
* The partialize function is an expression of the form:
*
* _timescaledb_internal.partialize_agg(avg(temp))
*
* where avg(temp) can be replaced by any aggregate that can be partialized.
*
* When such an expression is found, this function will mark the Aggref node
* for the aggregate as partial.
*/
static bool
partialize_function_call_walker(Node *node, PartializeWalkerState *state)
check_for_partialize_function_call(Node *node, PartializeWalkerState *state)
{
if (node == NULL)
return false;
Expand All @@ -32,86 +53,146 @@ partialize_function_call_walker(Node *node, PartializeWalkerState *state)
* If the last node we saw was partialize, the next one must be aggregate
* we're partializing
*/
if (state->looking_for_agg)
{
Aggref *agg_ref;
if (state->looking_for_agg && !IsA(node, Aggref))
elog(ERROR, "the input to partialize must be an aggregate");

if (!IsA(node, Aggref))
elog(ERROR, "The input to partialize must be an aggregate");
if (IsA(node, Aggref))
{
Aggref *aggref = castNode(Aggref, node);

agg_ref = castNode(Aggref, node);
agg_ref->aggsplit = AGGSPLIT_INITIAL_SERIAL;
if (agg_ref->aggtranstype == INTERNALOID && DO_AGGSPLIT_SERIALIZE(AGGSPLIT_INITIAL_SERIAL))
agg_ref->aggtype = BYTEAOID;
else
agg_ref->aggtype = agg_ref->aggtranstype;
if (state->looking_for_agg)
{
state->looking_for_agg = false;
aggref->aggsplit = AGGSPLIT_INITIAL_SERIAL;

if (aggref->aggtranstype == INTERNALOID &&
DO_AGGSPLIT_SERIALIZE(AGGSPLIT_INITIAL_SERIAL))
aggref->aggtype = BYTEAOID;
else
aggref->aggtype = aggref->aggtranstype;
}

state->looking_for_agg = false;
/* We currently cannot handle cases like
* SELECT sum(i), partialize(sum(i)) ...
*
* We check for non-partial aggs to ensure that if any of the aggregates
* in a statement are partialized, all of them have to be.
*/
else if (aggref->aggsplit != AGGSPLIT_INITIAL_SERIAL)
state->found_non_partial_agg = true;
}
else if (IsA(node, FuncExpr) && ((FuncExpr *) node)->funcid == state->fnoid)
{
state->found_partialize = true;
state->looking_for_agg = true;
}

return expression_tree_walker((Node *) node, partialize_function_call_walker, state);
return expression_tree_walker(node, check_for_partialize_function_call, state);
}

/* We currently cannot handle cases like
* SELECT sum(i), partialize(sum(i)) ...
* instead we use this function to ensure that if any of the aggregates in a statement are
* partialized, all of them are
*/
static bool
ensure_only_partials(Node *node, void *state)
{
if (node == NULL)
return false;

if (IsA(node, Aggref) && castNode(Aggref, node)->aggsplit != AGGSPLIT_INITIAL_SERIAL)
elog(ERROR, "Cannot mix partialized and non-partialized aggregates in the same statement");

return expression_tree_walker((Node *) node, ensure_only_partials, state);
}

void
ts_plan_process_partialize_agg(PlannerInfo *root, RelOptInfo *input_rel, RelOptInfo *output_rel)
has_partialize_function(Query *parse)
{
Oid partialfnoid = InvalidOid;
Oid argtyp[] = { ANYELEMENTOID };
Query *parse = root->parse;

PartializeWalkerState state = { .found_partialize = false,
.found_non_partial_agg = false,
.looking_for_agg = false,
.fnoid = InvalidOid };
List *name = list_make2(makeString(INTERNAL_SCHEMA_NAME), makeString(TS_PARTIALFN));
ListCell *lc;

if (CMD_SELECT != parse->commandType)
return;
partialfnoid = LookupFuncName(name, lengthof(argtyp), argtyp, false);
Assert(partialfnoid != InvalidOid);

state.fnoid = partialfnoid;
partialize_function_call_walker((Node *) parse->targetList, &state);
check_for_partialize_function_call((Node *) parse->targetList, &state);

if (state.found_partialize && state.found_non_partial_agg)
elog(ERROR, "cannot mix partialized and non-partialized aggregates in the same statement");

if (state.found_partialize)
return state.found_partialize;
}

/*
* Modify all AggPaths in relation to use partial aggregation.
*/
static void
partialize_agg_paths(RelOptInfo *rel)
{
ListCell *lc;

foreach (lc, rel->pathlist)
{
ensure_only_partials((Node *) parse->targetList, NULL);
Path *path = lfirst(lc);

foreach (lc, input_rel->pathlist)
{
Path *path = lfirst(lc);
if (IsA(path, AggPath))
castNode(AggPath, path)->aggsplit = AGGSPLIT_INITIAL_SERIAL;
}
}

if (IsA(path, AggPath))
((AggPath *) path)->aggsplit = AGGSPLIT_INITIAL_SERIAL;
}
/*
* Turn an aggregate relation into a partial aggregate relation if aggregates
* are enclosed by the partialize_agg function.
*
* The partialize_agg function can "manually" partialize an aggregate. For
* instance:
*
* SELECT time_bucket('1 day', time), device,
* __timescaledb_internal.partialize_agg(avg(temp))
* GROUP BY 1, 2;
*
* Would compute the partial aggregate of avg(temp).
*
* The plan to compute the relation must be either entirely non-partial or
* entirely partial, so it is not possible to mix partials and
* non-partials. Note that aggregates can appear in both the target list and the
* HAVING clause, for instance:
*
* SELECT time_bucket('1 day', time), device, avg(temp)
* GROUP BY 1, 2
* HAVING avg(temp) > 3;
*
* Regular partial aggregations executed by the planner (i.e., those not induced
* by the partialize_agg function) have their HAVING aggregates transparently
* moved to the target list during planning so that the finalize node can use it
* when applying the final filter on the resulting groups, obviously omitting
* the extra columns in the final output/projection. However, it doesn't make
* much sense to transparently do that when partializing with partialize_agg
* since it would be odd to return more columns than requested by the
* user. Therefore, the caller would have to do that manually. This, in fact, is
* also done when materializing continuous aggregates.
*
* For this reason, HAVING clauses with partialize_agg are blocked, except in
* cases where the planner transparently reduces the having expression to a
* simple filter (e.g., HAVING device > 3). In such cases, the HAVING clause is
* removed and replaced by a filter on the input.
*/
void
ts_plan_process_partialize_agg(PlannerInfo *root, RelOptInfo *input_rel, RelOptInfo *output_rel)
{
Query *parse = root->parse;

foreach (lc, output_rel->pathlist)
{
Path *path = lfirst(lc);
#if PG10_GE
Assert(IS_UPPER_REL(output_rel));
#endif

if (IsA(path, AggPath))
((AggPath *) path)->aggsplit = AGGSPLIT_INITIAL_SERIAL;
}
if (CMD_SELECT != parse->commandType || !parse->hasAggs)
return;

if (has_partialize_function(parse))
{
/* We cannot check root->hasHavingqual here because sometimes the
* planner can replace the HAVING clause with a simple filter. But
* root->hashavingqual stays true to remember that the query had a
* HAVING clause initially. */
if (NULL != parse->havingQual)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot partialize aggregate with HAVING clause"),
errhint(
"Any aggregates in a HAVING clause need to be partialized in the output "
"target list.")));

partialize_agg_paths(output_rel);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this should only be called on either input_rel or output_rel

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was wondering about that, since it is reasonable that only the output_rel would actually have any AggRefs given that this is the rel that is doing the aggregation. But I am just following what was there before (although I moved this code into a function to minimize duplication). Would be good to get some input from the person that wrote this code (looks like @cevian or @gayyappan according to git blame).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Anyway, I changed this to only act on the output_rel and also put a guard in planner.c to only call this in case of UPPERREL_GROUP_AGG.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JLockerman could you comment on this?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gayyappan I'd defer to sven on this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for taking a look @JLockerman. looped you in as you wrote the code.

Copy link
Contributor Author

@erimatnor erimatnor Dec 23, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am going to try and clarify the reasoning behind this change according to how I understand this code.

The input_rel is the rel that something was computed from and output_rel is the result. Since we want to partialize aggregates, we should only be interested in the stage that applies aggregations to an input_rel to produce an aggregate output_rel, i.e., the UPPERREL_GROUP_AGG stage. In this case, the input_rel is typically a base or join rel (no aggregates) and the output_rel the result of applying an aggregate on top of the base/join rel in input_rel.

In any later stage (after aggregates), the input_rel would already have aggregates applied (which we partialized in a previous call to this function), and no aggregates are computed in those stages. Therefore, there should never be a case where we'd have to partialize an input_rel (unless we somehow missed to do it on the output_rel in a previous step).

Please let me know if I am missing something.

}
}
10 changes: 8 additions & 2 deletions src/planner.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <utils/guc.h>
#include <miscadmin.h>
#include <nodes/makefuncs.h>
#include <nodes/relation.h>
#include <optimizer/var.h>
#include <optimizer/restrictinfo.h>
#include <utils/lsyscache.h>
Expand All @@ -27,6 +28,7 @@
#include <tcop/tcopprot.h>
#include <optimizer/plancat.h>
#include <nodes/nodeFuncs.h>

#include <parser/analyze.h>

#include <catalog/pg_constraint.h>
Expand Down Expand Up @@ -745,8 +747,11 @@ timescale_create_upper_paths_hook(PlannerInfo *root, UpperRelationKind stage, Re
if (output_rel->pathlist != NIL)
output_rel->pathlist = replace_hypertable_insert_paths(root, output_rel->pathlist);

/* modify aggregates that need to be partialized */
ts_plan_process_partialize_agg(root, input_rel, output_rel);
if (parse->hasAggs && stage == UPPERREL_GROUP_AGG)
{
/* modify aggregates that need to be partialized */
ts_plan_process_partialize_agg(root, input_rel, output_rel);
}
}

if (ts_guc_disable_optimizations || input_rel == NULL || IS_DUMMY_REL(input_rel))
Expand All @@ -758,6 +763,7 @@ timescale_create_upper_paths_hook(PlannerInfo *root, UpperRelationKind stage, Re
if (UPPERREL_GROUP_AGG == stage && output_rel != NULL)
{
ts_plan_add_hashagg(root, input_rel, output_rel);

if (parse->hasAggs)
ts_preprocess_first_last_aggregates(root, root->processed_tlist);
}
Expand Down
25 changes: 23 additions & 2 deletions tsl/test/expected/partialize_finalize.out
Original file line number Diff line number Diff line change
Expand Up @@ -243,10 +243,31 @@ select length(_timescaledb_internal.partialize_agg(min(a+1))) from foo;

\set ON_ERROR_STOP 0
select length(_timescaledb_internal.partialize_agg(1+min(a))) from foo;
ERROR: The input to partialize must be an aggregate
ERROR: the input to partialize must be an aggregate
select length(_timescaledb_internal.partialize_agg(min(a)+min(a))) from foo;
ERROR: The input to partialize must be an aggregate
ERROR: the input to partialize must be an aggregate
--non-trivial HAVING clause not allowed with partialize_agg
select time_bucket('1 hour', b) as b, _timescaledb_internal.partialize_agg(avg(a))
from foo
group by 1
having avg(a) > 3;
ERROR: cannot partialize aggregate with HAVING clause
--mixing partialized and non-partialized aggs is not allowed
select time_bucket('1 hour', b) as b, _timescaledb_internal.partialize_agg(avg(a)), sum(a)
from foo
group by 1;
ERROR: cannot mix partialized and non-partialized aggregates in the same statement
\set ON_ERROR_STOP 1
--partializing works with HAVING when the planner can effectively
--reduce it. In this case to a simple filter.
select time_bucket('1 hour', b) as b, toastval, _timescaledb_internal.partialize_agg(avg(a))
from foo
group by b, toastval
having toastval LIKE 'does not exist';
b | toastval | partialize_agg
---+----------+----------------
(0 rows)

--
-- TEST FINALIZEFUNC_EXTRA
--
Expand Down
16 changes: 16 additions & 0 deletions tsl/test/sql/partialize_finalize.sql
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,24 @@ select length(_timescaledb_internal.partialize_agg(min(a+1))) from foo;
\set ON_ERROR_STOP 0
select length(_timescaledb_internal.partialize_agg(1+min(a))) from foo;
select length(_timescaledb_internal.partialize_agg(min(a)+min(a))) from foo;
--non-trivial HAVING clause not allowed with partialize_agg
select time_bucket('1 hour', b) as b, _timescaledb_internal.partialize_agg(avg(a))
from foo
group by 1
having avg(a) > 3;
--mixing partialized and non-partialized aggs is not allowed
select time_bucket('1 hour', b) as b, _timescaledb_internal.partialize_agg(avg(a)), sum(a)
from foo
group by 1;
\set ON_ERROR_STOP 1

--partializing works with HAVING when the planner can effectively
--reduce it. In this case to a simple filter.
select time_bucket('1 hour', b) as b, toastval, _timescaledb_internal.partialize_agg(avg(a))
from foo
group by b, toastval
having toastval LIKE 'does not exist';
erimatnor marked this conversation as resolved.
Show resolved Hide resolved

--
-- TEST FINALIZEFUNC_EXTRA
--
Expand Down