Skip to content

Commit

Permalink
Implement stable sort for events
Browse files Browse the repository at this point in the history
  • Loading branch information
rasky committed Jul 2, 2016
1 parent 18070e2 commit 332f27b
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 21 deletions.
18 changes: 2 additions & 16 deletions src/tdb_encode.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "tdb_huffman.h"
#include "tdb_error.h"
#include "tdb_io.h"
#include "tdb_sort.h"

#define EDGE_INCREMENT 1000000
#define GROUPBUF_INCREMENT 1000000
Expand All @@ -42,18 +43,6 @@ struct jm_fold_state{
tdb_error ret;
};

/* qsort() comparator: orders two tdb_grouped_event records by ascending
   timestamp. Yields 1 when the first is later, -1 when it is earlier and
   0 when both timestamps are equal. */
static int compare(const void *p1, const void *p2)
{
    const struct tdb_grouped_event *lhs = p1;
    const struct tdb_grouped_event *rhs = p2;

    /* each relational test evaluates to 0 or 1, so the difference is
       exactly -1, 0 or 1 and cannot overflow */
    return (lhs->timestamp > rhs->timestamp) -
           (lhs->timestamp < rhs->timestamp);
}

static void *groupby_uuid_handle_one_trail(
__uint128_t uuid __attribute__((unused)),
Word_t *value,
Expand Down Expand Up @@ -99,10 +88,7 @@ static void *groupby_uuid_handle_one_trail(
num_events = j;

/* sort events of this trail by time */
/* TODO make this stable sort */
/* TODO this could really benefit from Timsort since raw data
is often partially sorted */
qsort(s->buf, num_events, sizeof(struct tdb_grouped_event), compare);
events_sort(s->buf, num_events);

/* delta-encode timestamps */
uint64_t prev_timestamp = s->min_timestamp;
Expand Down
57 changes: 57 additions & 0 deletions src/tdb_sort.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#define _DEFAULT_SOURCE /* mkstemp */
#define _GNU_SOURCE

#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <stdlib.h>
#include <string.h>

#include "tdb_internal.h"

/* Three-way comparison of two timestamps: returns 1 if t1 > t2,
   -1 if t1 < t2 and 0 if they are equal. */
static int compare(uint64_t t1, uint64_t t2)
{
    /* each relational test evaluates to 0 or 1, so the difference is
       exactly -1, 0 or 1 without any risk of overflow */
    return (t1 > t2) - (t1 < t2);
}

#define SORT_TYPE struct tdb_grouped_event
#define SORT_NAME _events
#define SORT_CMP(x,y) compare((x).timestamp, (y).timestamp)
#include "sort/sort.h"

/*
 * Sort the `num_events` entries of `buf` in place by ascending timestamp.
 * Entries that share a timestamp end up in the reverse of their input order.
 *
 * buf:        array of events to sort (modified in place)
 * num_events: number of entries in `buf`; 0 and 1 are valid and are no-ops
 */
void events_sort(struct tdb_grouped_event *buf, uint64_t num_events)
{
    uint64_t i = 0;

    /* Fewer than two events are already sorted. This guard is also needed
       for correctness: num_events is unsigned, so computing
       `num_events - 1` for an empty buffer would wrap around and make the
       scan below read past the end of `buf`. */
    if (num_events < 2)
        return;

    /* We need to do a reversed stable sort. The textbook version would be
       to first reverse the whole buffer, and then apply a stable sort.
       But the underlying dataset can often be semi-sorted, and we use
       timsort, which is very good at exploiting partially sorted datasets;
       if we reversed the whole buffer first, we would basically leave
       lots of performance on the table.
       So, instead, we do a stable sort first, and then go through the array
       and reverse sub-sequences of elements with the same timestamp; this
       keeps timsort happier, and in the (also likely) case of sequences
       with no duplicated timestamps, we don't even do a single swap
       operation (after the sort). */
    _events_tim_sort(buf, num_events);

    /* Indices are uint64_t to match num_events: no signed/unsigned
       comparison and no overflow on very large buffers. */
    while (i + 1 < num_events){
        uint64_t j = i + 1;

        /* extend j to one past the end of the run that starts at i */
        while (j < num_events && buf[j].timestamp == buf[i].timestamp)
            j++;

        /* buf[i .. j-1] all share one timestamp: reverse the run in place
           (a run of length 1 performs no swaps) */
        for (uint64_t lo = i, hi = j - 1; lo < hi; lo++, hi--){
            struct tdb_grouped_event tmp = buf[lo];
            buf[lo] = buf[hi];
            buf[hi] = tmp;
        }

        /* continue with the first element after the run */
        i = j;
    }
}
11 changes: 11 additions & 0 deletions src/tdb_sort.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

/* Declarations for the event sorting helper implemented in tdb_sort.c. */
#ifndef __TDB_SORT_H__
#define __TDB_SORT_H__

#include "tdb_types.h"

/* Forward declaration: only a pointer to the struct is needed here. */
struct tdb_grouped_event;

/*
 * Sort the `num_events` entries of `buf` in place by ascending timestamp.
 * The implementation runs a stable sort and then reverses every run of
 * equal timestamps, so entries sharing a timestamp come out in the reverse
 * of their input order.
 */
void events_sort(struct tdb_grouped_event *buf, uint64_t num_events);

#endif /* __TDB_SORT_H__ */
23 changes: 18 additions & 5 deletions wscript
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def configure(cnf):

cnf.define("DSFMT_MEXP", 521)
cnf.define("HAVE_ARCHIVE_H", 1)
cnf.env.append_value("CFLAGS", "-std=c99")
cnf.env.append_value("CFLAGS", "-std=gnu99")
cnf.env.append_value("CFLAGS", "-O3")
cnf.env.append_value("CFLAGS", "-g")

Expand Down Expand Up @@ -57,6 +57,18 @@ def options(opt):
opt.load("compiler_c")

def build(bld, test_build=False):

# Third-party code is compiled without extra warning flags, as it has not been cleaned up for them
# NOTE: tdb_sort.c is special-cased because it includes third-party sort.h
bld.objects(source=bld.path.ant_glob("src/dsfmt/*.c"), target="dsfmt")
bld.objects(source=bld.path.ant_glob("src/xxhash/*.c"), target="xxhash")
bld.objects(source=bld.path.ant_glob("src/tdb_sort.c"), target="tdbsort")
bld.objects(source=bld.path.ant_glob("src/dsfmt/*.c"), cflags=["-fPIC"], target="dsfmt-so")
bld.objects(source=bld.path.ant_glob("src/xxhash/*.c"), cflags=["-fPIC"], target="xxhash-so")
bld.objects(source=bld.path.ant_glob("src/tdb_sort.c"), cflags=["-fPIC"], target="tdbsort-so")

tdbsources = bld.path.ant_glob("src/*.c", excl="src/tdb_sort.c")

tdbcflags = [
"-Wextra",
"-Wconversion",
Expand All @@ -75,16 +87,16 @@ def build(bld, test_build=False):
"-DEVENTS_ARENA_INCREMENT=100",
"-fprofile-arcs",
"-ftest-coverage",
"--coverage",
"-fPIC",
])
else:
tdbcflags.append("-fvisibility=hidden")

bld.stlib(
target = "traildb",
source = bld.path.ant_glob("src/**/*.c"),
source = tdbsources,
cflags = tdbcflags,
use = ["dsfmt", "xxhash", "tdbsort"],
uselib = ["ARCHIVE", "JUDY"],
install_path = "${PREFIX}/lib", # opt-in to have .a installed
)
Expand All @@ -103,7 +115,7 @@ def build(bld, test_build=False):
target = os.path.splitext(testname)[0],
source = [test],
includes = "src",
cflags = ["-fprofile-arcs", "-ftest-coverage", "-fPIC", "--coverage"],
cflags = ["-fprofile-arcs", "-ftest-coverage", "-fPIC"],
ldflags = ["-fprofile-arcs"],
use = ["traildb"],
uselib = ["ARCHIVE", "JUDY"],
Expand All @@ -119,8 +131,9 @@ def build(bld, test_build=False):

bld.shlib(
target = "traildb",
source = bld.path.ant_glob("src/**/*.c"),
source = tdbsources,
cflags = tdbcflags,
use = ["dsfmt-so", "xxhash-so", "tdbsort-so"],
uselib = ["ARCHIVE", "JUDY"],
vnum = "0", # .so versioning
)
Expand Down

0 comments on commit 332f27b

Please sign in to comment.