Skip to content
Find file
Fetching contributors…
Cannot retrieve contributors at this time
562 lines (464 sloc) 17.1 KB
/*
* count_distinct.c - alternative to COUNT(DISTINCT ...)
* Copyright (C) Tomas Vondra, 2013
*
*/
#include <assert.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>
#include <limits.h>
#include "postgres.h"
#include "utils/datum.h"
#include "utils/array.h"
#include "utils/lsyscache.h"
#include "utils/numeric.h"
#include "utils/builtins.h"
#include "catalog/pg_type.h"
#include "nodes/execnodes.h"
#include "access/tupmacs.h"
#include "utils/pg_crc.h"
#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif
/* if set to 1, the table resize will be profiled */
#define DEBUG_PROFILE 0
#define DEBUG_HISTOGRAM 0 /* prints bucket size histogram */
#if (PG_VERSION_NUM >= 90000)
#define GET_AGG_CONTEXT(fname, fcinfo, aggcontext) \
if (! AggCheckCallContext(fcinfo, &aggcontext)) { \
elog(ERROR, "%s called in non-aggregate context", fname); \
}
#define CHECK_AGG_CONTEXT(fname, fcinfo) \
if (! AggCheckCallContext(fcinfo, NULL)) { \
elog(ERROR, "%s called in non-aggregate context", fname); \
}
#elif (PG_VERSION_NUM >= 80400)
#define GET_AGG_CONTEXT(fname, fcinfo, aggcontext) \
if (fcinfo->context && IsA(fcinfo->context, AggState)) { \
aggcontext = ((AggState *) fcinfo->context)->aggcontext; \
} else if (fcinfo->context && IsA(fcinfo->context, WindowAggState)) { \
aggcontext = ((WindowAggState *) fcinfo->context)->wincontext; \
} else { \
elog(ERROR, "%s called in non-aggregate context", fname); \
aggcontext = NULL; \
}
#define CHECK_AGG_CONTEXT(fname, fcinfo) \
if (!(fcinfo->context && \
(IsA(fcinfo->context, AggState) || \
IsA(fcinfo->context, WindowAggState)))) \
{ \
elog(ERROR, "%s called in non-aggregate context", fname); \
}
#else
#define GET_AGG_CONTEXT(fname, fcinfo, aggcontext) \
if (fcinfo->context && IsA(fcinfo->context, AggState)) { \
aggcontext = ((AggState *) fcinfo->context)->aggcontext; \
} else { \
elog(ERROR, "%s called in non-aggregate context", fname); \
aggcontext = NULL; \
}
#define CHECK_AGG_CONTEXT(fname, fcinfo) \
if (!(fcinfo->context && \
(IsA(fcinfo->context, AggState)))) \
{ \
elog(ERROR, "%s called in non-aggregate context", fname); \
}
/* backward compatibility with 8.3 (macros copied mostly from src/include/access/tupmacs.h) */
#if SIZEOF_DATUM == 8
#define fetch_att(T,attbyval,attlen) \
( \
(attbyval) ? \
( \
(attlen) == (int) sizeof(Datum) ? \
*((Datum *)(T)) \
: \
( \
(attlen) == (int) sizeof(int32) ? \
Int32GetDatum(*((int32 *)(T))) \
: \
( \
(attlen) == (int) sizeof(int16) ? \
Int16GetDatum(*((int16 *)(T))) \
: \
( \
AssertMacro((attlen) == 1), \
CharGetDatum(*((char *)(T))) \
) \
) \
) \
) \
: \
PointerGetDatum((char *) (T)) \
)
#else /* SIZEOF_DATUM != 8 */
#define fetch_att(T,attbyval,attlen) \
( \
(attbyval) ? \
( \
(attlen) == (int) sizeof(int32) ? \
Int32GetDatum(*((int32 *)(T))) \
: \
( \
(attlen) == (int) sizeof(int16) ? \
Int16GetDatum(*((int16 *)(T))) \
: \
( \
AssertMacro((attlen) == 1), \
CharGetDatum(*((char *)(T))) \
) \
) \
) \
: \
PointerGetDatum((char *) (T)) \
)
#endif /* SIZEOF_DATUM == 8 */
#define att_addlength_pointer(cur_offset, attlen, attptr) \
( \
((attlen) > 0) ? \
( \
(cur_offset) + (attlen) \
) \
: (((attlen) == -1) ? \
( \
(cur_offset) + VARSIZE_ANY(attptr) \
) \
: \
( \
AssertMacro((attlen) == -2), \
(cur_offset) + (strlen((char *) (attptr)) + 1) \
)) \
)
#define att_align_nominal(cur_offset, attalign) \
( \
((attalign) == 'i') ? INTALIGN(cur_offset) : \
(((attalign) == 'c') ? (long) (cur_offset) : \
(((attalign) == 'd') ? DOUBLEALIGN(cur_offset) : \
( \
AssertMacro((attalign) == 's'), \
SHORTALIGN(cur_offset) \
))) \
)
#endif
#define COMPUTE_CRC32(hash, value, length) \
INIT_CRC32(hash); \
COMP_CRC32(hash, value, length); \
FIN_CRC32(hash);
/* This count_distinct implementation uses a simple, partially sorted array.
*
* It's considerably simpler than the hash-table based version, and the main
* goals of this design is to:
*
* (a) minimize the palloc overhead - the whole array is allocated as a whole,
* and thus has a single palloc header (while in the hash table, each
* bucket had at least one such header)
*
* (b) optimal L2/L3 cache utilization - once the hash table can't fit into
* the CPU caches, it get's considerably slower because of cache misses,
* and it's impossible to improve the hash implementation (because for
* large hash tables it naturally leads to cache misses)
*
* Hash tables are great when you need to immediately query the structure
* (e.g. to immediately check whether the key is already in the table), but
* in count_distint it's not really necessary. We can accumulate some elements
* first (into a buffer), and then process all of them at once - this approach
* improves the CPU cache hit ratios. Also, the palloc overhead is much lower.
*
* The data array is split into three sections - sorted items, unsorted items,
* and unused.
*
* ----------------------------------------------
* | sorted | unsorted | free |
* ----------------------------------------------
*
* Initially, the sorted / unsorted sections are empty, of course.
*
* ----------------------------------------------
* | free |
* ----------------------------------------------
*
* New values are simply accumulated into the unsorted section, which grows.
*
* ----------------------------------------------
* | unsorted --> | free |
* ----------------------------------------------
*
* Once there's no more space for new items, the unsorted items are 'compacted'
* which means the values are sorted, duplicates are removed and the result
* is merged into the sorted section (unless it's empty). The 'merge' is just
* a simple 'merge-sort' of the two sorted inputs, with removal of duplicates.
*
* Once the compaction completes, it's checked whether enough space was freed,
* where 'enough' means ~20% of the array needs to be free. Using low values
* (e.g. space for at least one value) might cause 'oscillation' - imagine
* compaction that removes a single item, causing compaction on the very next
* addition. Using non-trivial threshold (like the 20%) should prevent such
* frequent compactions - which is quite expensive operation.
*
* If there's not enough free space, the array grows (twice the size).
*
* The compaction needs to be performed at the very end, when computing the
* actual result of the aggregate (distinct value in the array).
*
*/
#define ARRAY_INIT_SIZE 32 /* initial size of the array (in bytes) */
#define ARRAY_FREE_FRACT 0.2 /* we want >= 20% free space after compaction */
/* A hash table - a collection of buckets. */
typedef struct element_set_t {
uint32 item_size; /* length of the value (depends on the actual data type) */
uint32 nsorted; /* number of items in the sorted part (distinct) */
uint32 nall; /* number of all items (unsorted part may contain duplicates) */
uint32 nbytes; /* number of bytes in the data array */
/* aggregation memory context (reference, so we don't need to do lookups repeatedly) */
MemoryContext aggctx;
/* elements */
char * data; /* nsorted items first, then (nall - nsorted) unsorted items */
} element_set_t;
/* prototypes */
PG_FUNCTION_INFO_V1(count_distinct_append);
PG_FUNCTION_INFO_V1(count_distinct);
Datum count_distinct_append(PG_FUNCTION_ARGS);
Datum count_distinct(PG_FUNCTION_ARGS);
static void add_element(element_set_t * eset, char * value);
static element_set_t *init_set(int item_size, MemoryContext ctx);
static int compare_items(const void * a, const void * b, void * size);
static void compact_set(element_set_t * eset, bool need_space);
#if DEBUG_PROFILE
static void print_set_stats(element_set_t * eset);
#endif
Datum
count_distinct_append(PG_FUNCTION_ARGS)
{
element_set_t *eset;
/* info for anyelement */
Oid element_type = get_fn_expr_argtype(fcinfo->flinfo, 1);
Datum element = PG_GETARG_DATUM(1);
/* memory contexts */
MemoryContext oldcontext;
MemoryContext aggcontext;
/* OK, we do want to skip NULL values altogether */
if (PG_ARGISNULL(1) && PG_ARGISNULL(0)) /* both values are NULL*/
PG_RETURN_NULL(); /* no state, no value -> just keep NULL */
else if (PG_ARGISNULL(1))
/* if there already is a state accumulated, don't forget it */
PG_RETURN_DATUM(PG_GETARG_DATUM(0));
/* we can be sure the value is not null (see the check above) */
/* switch to the per-group hash-table memory context */
GET_AGG_CONTEXT("count_distinct_append", fcinfo, aggcontext);
oldcontext = MemoryContextSwitchTo(aggcontext);
/* init the hash table, if needed */
if (PG_ARGISNULL(0))
{
int16 typlen;
bool typbyval;
char typalign;
/* get type information for the second parameter (anyelement item) */
get_typlenbyvalalign(element_type, &typlen, &typbyval, &typalign);
/* we can't handle varlena types yet or values passed by reference */
if ((typlen == -1) || (! typbyval))
elog(ERROR, "count_distinct handles only fixed-length types passed by value");
eset = init_set(typlen, aggcontext);
} else
eset = (element_set_t *)PG_GETARG_POINTER(0);
/* add the value into the set */
add_element(eset, (char*)&element);
MemoryContextSwitchTo(oldcontext);
PG_RETURN_POINTER(eset);
}
Datum
count_distinct(PG_FUNCTION_ARGS)
{
element_set_t * eset;
CHECK_AGG_CONTEXT("count_distinct", fcinfo);
if (PG_ARGISNULL(0))
PG_RETURN_NULL();
eset = (element_set_t *)PG_GETARG_POINTER(0);
/* do the compaction */
compact_set(eset, false);
#if DEBUG_PROFILE
print_set_stats(eset);
#endif
PG_RETURN_INT64(eset->nall);
}
/* performs compaction of the set
*
* Sorts the unsorted data, removes duplicate values and then merges it
* into the already sorted part (skipping duplicate values).
*
* Finally, it checks whether at least ARRAY_FREE_FRACT (20%) of the array
* is empty, and if not then resizes it.
*/
static void
compact_set(element_set_t * eset, bool need_space)
{
char *base = eset->data + (eset->nsorted * eset->item_size);
char *last = base;
char *curr;
int i;
int cnt = 1;
double free_fract;
Assert(eset->nsorted + eset->nall > 0);
Assert(eset->data != NULL);
Assert(eset->nsorted <= eset->nall);
Assert(eset->nall * eset->item_size <= eset->nbytes);
/* if there are no new (unsorted) items, we're done */
if (eset->nall == eset->nsorted)
return;
/* sort the new items
*
* TODO Consider replacing this insert-sort for small number of items (for <64 items
* it might be faster than qsort)
*/
qsort_arg(eset->data + eset->nsorted * eset->item_size,
eset->nall - eset->nsorted, eset->item_size,
compare_items, &eset->item_size);
/*
* Remove duplicities from the sorted array. That is - walk through the array,
* compare each item with the previous one, and only keep it if it's different.
*/
for (i = 1; i < eset->nall - eset->nsorted; i++)
{
curr = base + (i * eset->item_size);
/* items differ (keep the item) */
if (memcmp(last, curr, eset->item_size) != 0)
{
last += eset->item_size;
cnt += 1;
/* only copy if really needed */
if (last != curr)
memcpy(last, curr, eset->item_size);
}
}
/* duplicities removed -> update the number of items in this part */
eset->nall = eset->nsorted + cnt;
/* If this is the first sorted part, we can just use it as the 'sorted' part. */
if (eset->nsorted == 0)
eset->nsorted = eset->nall;
/* TODO Another optimization opportunity is that we don't really need to merge the
* arrays, if we freed enough space by processing the new items. We may postpone
* that until the last call (when finalizing the aggregate). OTOH if that happens,
* it shouldn't be that expensive to merge because the number of new items will
* be small (as we've removed a enough duplicities).
*/
/* If a merge is needed, walk through the arrays and keep unique values. */
if (eset->nsorted < eset->nall)
{
MemoryContext oldctx = MemoryContextSwitchTo(eset->aggctx);
/* allocate new array for the result */
char * data = palloc0(eset->nbytes);
char * ptr = data;
/* already sorted array */
char * a = eset->data;
char * a_max = eset->data + eset->nsorted * eset->item_size;
/* the new array */
char * b = eset->data + (eset->nsorted * eset->item_size);
char * b_max = eset->data + eset->nall * eset->item_size;
MemoryContextSwitchTo(oldctx);
/* TODO There's a possibility for optimization - if we get already sorted
* items (e.g. because of a subplan), we can just copy the arrays.
* The check is as simple as (a_first > b_last) || (a_last < b_first).
* OTOH this might be pointless in practice.
*/
while (true)
{
int r = memcmp(a, b, eset->item_size);
/*
* If both values are the same, copy one of them into the result and increment
* both. Otherwise, increment only the smaller value.
*/
if (r == 0)
{
memcpy(ptr, a, eset->item_size);
a += eset->item_size;
b += eset->item_size;
}
else if (r < 0)
{
memcpy(ptr, a, eset->item_size);
a += eset->item_size;
}
else
{
memcpy(ptr, b, eset->item_size);
b += eset->item_size;
}
ptr += eset->item_size;
/*
* If we reached the end of (at least) one of the arrays, copy all the remaining
* items and we're done.
*/
if ((a == a_max) || (b == b_max))
{
if (a != a_max) /* b ended -> copy rest of a */
{
memcpy(ptr, a, a_max - a);
ptr += (a_max - a);
}
else if (b != b_max) /* a ended -> copy rest of b */
{
memcpy(ptr, b, b_max - b);
ptr += (b_max - b);
}
break;
}
}
/* update the counts */
eset->nsorted = (ptr - data) / eset->item_size;
eset->nall = eset->nsorted;
pfree(eset->data);
eset->data = data;
}
/* free space as a fraction of the total size */
free_fract = (eset->nbytes - eset->nall * eset->item_size) * 1.0 / eset->nbytes;
/*
* When we need space for more items (e.g. not when finalizing the aggregate
* result) we need to check that, and enlarge the array when needed. We
* require ARRAY_FREE_FRACT of the space to be free.
*/
if (need_space && (free_fract < ARRAY_FREE_FRACT))
{
eset->nbytes *= 2;
eset->data = repalloc(eset->data, eset->nbytes);
}
}
static void
add_element(element_set_t * eset, char * value)
{
/* if there's not enough space for another item, perform compaction */
if (eset->item_size * (eset->nall + 1) > eset->nbytes)
compact_set(eset, true);
/* there needs to be space for at least one more value (thanks to the compaction) */
Assert(eset->nbytes >= eset->item_size * (eset->nall + 1));
/* now we're sure there's enough space */
memcpy(eset->data + (eset->item_size * eset->nall), value, eset->item_size);
eset->nall += 1;
}
/* XXX make sure the whole method is called within the aggregate context */
static element_set_t *
init_set(int item_size, MemoryContext ctx)
{
element_set_t * eset = (element_set_t *)palloc0(sizeof(element_set_t));
eset->item_size = item_size;
eset->nsorted = 0;
eset->nall = 0;
eset->nbytes = ARRAY_INIT_SIZE;
eset->aggctx = ctx;
/* the memory is zeroed */
eset->data = palloc0(eset->nbytes);
return eset;
}
#if DEBUG_PROFILE
static void
print_set_stats(element_set_t * eset)
{
elog(WARNING, "bytes=%d item=%d all=%d sorted=%d", eset->nbytes, eset->item_size, eset->nall, eset->nsorted);
}
#endif
/* just compare the data directly */
static int
compare_items(const void * a, const void * b, void * size)
{
return memcmp(a, b, *(int*)size);
}
Jump to Line
Something went wrong with that request. Please try again.