Permalink
Browse files

wire in postings list headers

Right now these simply add a count in addition to an offset within
the postings list.

In the future, this change will enable:
a) quick results for counts of simple (single-term or single-label) queries,
   which actually is a common use case
b) a much more space-efficient postings structure; and
c) much quicker, skiplist-style queries for phrases (which right now are
   pretty slow).

Bump the segment version number to account for this.
  • Loading branch information...
1 parent 258a950 commit 050204ce2635de1850ecbfc282bae152d93e5b30 @wmorgan committed Apr 14, 2012
Showing with 141 additions and 79 deletions.
  1. +11 −5 dump.c
  2. +8 −3 search.c
  3. +65 −33 segment.c
  4. +1 −1 snippeter.c
  5. +16 −17 termhash.c
  6. +16 −4 termhash.h
  7. +24 −16 test-termhash.c
View
16 dump.c
@@ -5,11 +5,14 @@
#define isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
#define KEY(h, i) &(h->pool[h->keys[i]])
-RAISING_STATIC(dump_posting_list(wp_segment* s, uint32_t offset)) {
+RAISING_STATIC(dump_posting_list(wp_segment* s, posting_list_header* plh)) {
posting po;
docid_t last_doc_id = 0;
int started = 0;
+ uint32_t offset = plh->next_offset;
+ printf("[%u entries]\n", plh->count);
+
while(offset != OFFSET_NONE) {
RELAY_ERROR(wp_segment_read_posting(s, offset, &po, 1));
@@ -28,11 +31,14 @@ RAISING_STATIC(dump_posting_list(wp_segment* s, uint32_t offset)) {
return NO_ERROR;
}
-RAISING_STATIC(dump_label_posting_list(wp_segment* s, uint32_t offset)) {
+RAISING_STATIC(dump_label_posting_list(wp_segment* s, posting_list_header* plh)) {
posting po;
docid_t last_doc_id = 0;
int started = 0;
+ uint32_t offset = plh->next_offset;
+ printf("[%u entries]\n", plh->count);
+
while(offset != OFFSET_NONE) {
RELAY_ERROR(wp_segment_read_label(s, offset, &po));
@@ -56,7 +62,7 @@ RAISING_STATIC(dump(wp_segment* segment)) {
uint32_t* thflags = TERMHASH_FLAGS(th);
term* thkeys = TERMHASH_KEYS(th);
- uint32_t* thvals = TERMHASH_VALS(th);
+ posting_list_header* thvals = TERMHASH_VALS(th);
for(uint32_t i = 0; i < th->n_buckets; i++) {
if(isempty(thflags, i)); // do nothing
@@ -72,13 +78,13 @@ RAISING_STATIC(dump(wp_segment* segment)) {
const char* label = stringmap_int_to_string(sh, sp, t.word_s);
printf("%u: ~%s\n", i, label);
}
- RELAY_ERROR(dump_label_posting_list(segment, thvals[i]));
+ RELAY_ERROR(dump_label_posting_list(segment, &thvals[i]));
}
else {
const char* field = stringmap_int_to_string(sh, sp, t.field_s);
const char* word = stringmap_int_to_string(sh, sp, t.word_s);
printf("%u: %s:'%s'\n", i, field, word);
- RELAY_ERROR(dump_posting_list(segment, thvals[i]));
+ RELAY_ERROR(dump_posting_list(segment, &thvals[i]));
}
}
}
View
@@ -195,8 +195,14 @@ static wp_error* term_init_search_state(wp_query* q, wp_segment* seg) {
t.word_s = stringmap_string_to_int(sh, sp, q->word);
- uint32_t offset = termhash_get_val(th, t);
- if(offset == (uint32_t)-1) offset = OFFSET_NONE;
+ uint32_t offset;
+ posting_list_header* plh = termhash_get_val(th, t);
+
+ DEBUG("posting list header for %s:%s (-> %u:%u) is %p", q->field, q->word, t.field_s, t.word_s, plh);
+ if(plh == NULL) offset = OFFSET_NONE;
+ else offset = plh->next_offset;
+
+ if(plh) DEBUG("posting list header has count=%u next_offset=%u", plh->count, plh->next_offset);
if(offset == OFFSET_NONE) state->done = 1; // no entry in term hash
else {
@@ -784,7 +790,6 @@ static wp_error* every_advance_to_doc(wp_query* q, wp_segment* seg, docid_t doc_
wp_error* wp_search_run_query_on_segment(struct wp_query* q, struct wp_segment* s, uint32_t max_num_results, uint32_t* num_results, search_result* results) {
int done;
-
*num_results = 0;
#ifdef DEBUG
View
@@ -3,13 +3,16 @@
#include <unistd.h>
#include "whistlepig.h"
-#define POSTINGS_REGION_TYPE_IMMUTABLE_VBE 1
+#define POSTINGS_REGION_TYPE_IMMUTABLE_VBE 1
#define POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS 2 // bigger, mutable
-#define SEGMENT_VERSION 3
+#define SEGMENT_VERSION 4
#define wp_segment_label_posting_at(posting_region, offset) ((label_posting*)(posting_region->postings + offset))
+static posting_list_header blank_plh = { .count = 0, .next_offset = OFFSET_NONE };
+static term dead_term = { .field_s = 0, .word_s = 0 };
+
wp_error* wp_segment_grab_readlock(wp_segment* seg) {
segment_info* si = MMAP_OBJ(seg->seginfo, segment_info);
RELAY_ERROR(wp_lock_grab(&si->lock, WP_LOCK_READLOCK));
@@ -449,7 +452,7 @@ wp_error* wp_segment_add_posting(wp_segment* s, const char* field, const char* w
RELAY_ERROR(bump_stringpool(s, &success));
RELAY_ERROR(bump_termhash(s, &success));
- DEBUG("adding posting for %s:%s and doc %u", field, word, doc_id);
+ DEBUG("adding posting for %s:%s and doc %u with %u positions", field, word, doc_id, num_positions);
postings_region* pr = MMAP_OBJ(s->postings, postings_region);
stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
@@ -461,26 +464,38 @@ wp_error* wp_segment_add_posting(wp_segment* s, const char* field, const char* w
RELAY_ERROR(stringmap_add(sh, sp, field, &t.field_s));
RELAY_ERROR(stringmap_add(sh, sp, word, &t.word_s));
+ DEBUG("%s:%s maps to %u:%u", field, word, t.field_s, t.word_s);
+
// find the offset of the next posting
+ posting_list_header* plh = termhash_get_val(th, t);
+ if(plh == NULL) {
+ RELAY_ERROR(termhash_put_val(th, t, &blank_plh));
+ plh = termhash_get_val(th, t);
+ }
+ DEBUG("posting list header for %s:%s is at %p", field, word, plh);
+
posting po;
- uint32_t next_offset = termhash_get_val(th, t);
- if(next_offset == (uint32_t)-1) next_offset = OFFSET_NONE;
- if(next_offset != OFFSET_NONE) { // TODO remove this check for speed once happy
+ uint32_t next_offset = plh->next_offset;
+
+ if(next_offset != OFFSET_NONE) { // TODO remove this check for speed once happy [PERFORMANCE]
RELAY_ERROR(wp_segment_read_posting(s, next_offset, &po, 0));
if(po.doc_id >= doc_id) RAISE_ERROR("cannot add a doc_id out of sorted order");
}
// write the entry to the postings region
uint32_t entry_offset = pr->postings_head;
- //DEBUG("entry will be at offset %u, prev offset is %u and next offset is %u", entry_offset, prev_offset, next_offset);
+ DEBUG("writing posting at offset %u. next offset is %u.", entry_offset, next_offset);
+
po.doc_id = doc_id;
po.next_offset = next_offset;
po.num_positions = num_positions;
RELAY_ERROR(write_posting(s, &po, positions)); // prev_docid is 0 for th
- DEBUG("postings list head now at %u", pr->postings_head);
+ DEBUG("posting list head now at %u", pr->postings_head);
// really finally, update the tail pointer so that readers can access this posting
- RELAY_ERROR(termhash_put_val(th, t, entry_offset));
+ plh->count++;
+ plh->next_offset = entry_offset;
+ DEBUG("posting list header for %s:%s now reads count=%u offset=%u", field, word, plh->count, plh->next_offset);
return NO_ERROR;
}
@@ -537,22 +552,25 @@ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id)
// find the previous and next label postings, between which we'll insert this
// posting
- uint32_t prev_offset = OFFSET_NONE;
- uint32_t next_offset = termhash_get_val(th, t);
+ posting_list_header* plh = termhash_get_val(th, t);
+ if(plh == NULL) {
+ RELAY_ERROR(termhash_put_val(th, t, &blank_plh));
+ plh = termhash_get_val(th, t);
+ }
+
+ uint32_t next_offset = plh->next_offset;
docid_t last_docid = DOCID_NONE;
+ uint32_t prev_offset = OFFSET_NONE;
- if(next_offset == (uint32_t)-1) next_offset = OFFSET_NONE;
DEBUG("start offset is %u (none is %u)", next_offset, OFFSET_NONE);
while(next_offset != OFFSET_NONE) {
label_posting* lp = wp_segment_label_posting_at(pr, next_offset);
- if((last_docid != DOCID_NONE) && (lp->doc_id >= last_docid)) {
+ if((last_docid != DOCID_NONE) && (lp->doc_id >= last_docid))
RAISE_ERROR("whistlepig index corruption! lp %u has docid %u but last docid at lp %u was %u", next_offset, lp->doc_id, prev_offset, last_docid);
- }
- else {
+ else
last_docid = lp->doc_id;
- }
DEBUG("got doc id %u next_offset %u at offset %u (looking for doc id %u)", lp->doc_id, lp->next_offset, next_offset, doc_id);
if(lp->doc_id == doc_id) {
@@ -567,18 +585,23 @@ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id)
// find a space for the posting by first checking for a free postings in the
// dead list. the dead list is the list stored under the sentinel term with
// field 0 and word 0.
- term dead_term = { .field_s = 0, .word_s = 0 };
+ posting_list_header* dead_plh = termhash_get_val(th, dead_term);
+ if(dead_plh == NULL) {
+ RELAY_ERROR(termhash_put_val(th, dead_term, &blank_plh));
+ dead_plh = termhash_get_val(th, t);
+ }
+
uint32_t entry_offset;
- uint32_t dead_offset = termhash_get_val(th, dead_term);
- if(dead_offset == (uint32_t)-1) dead_offset = OFFSET_NONE;
+ uint32_t dead_offset = dead_plh->next_offset;
if(dead_offset == OFFSET_NONE) { // make a new posting
entry_offset = pr->postings_head;
}
else { // we'll use this one; remove it from the linked list
DEBUG("offset from dead list is %u, using it for the new posting!", dead_offset);
- entry_offset = dead_offset;
- RELAY_ERROR(termhash_put_val(th, dead_term, wp_segment_label_posting_at(pr, dead_offset)->next_offset));
+ entry_offset = dead_plh->next_offset;
+ dead_plh->next_offset = wp_segment_label_posting_at(pr, dead_offset)->next_offset;
+ dead_plh->count--;
}
// finally, write the entry to the label postings region
@@ -588,11 +611,12 @@ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id)
po->next_offset = next_offset;
pr->postings_head += (uint32_t)sizeof(label_posting);
- DEBUG("label postings list head now at %u", pr->postings_head);
+ DEBUG("label posting list head now at %u", pr->postings_head);
// really finally, update either the previous offset or the tail pointer
// for this label so that readers can access this posting
- if(prev_offset == OFFSET_NONE) RELAY_ERROR(termhash_put_val(th, t, entry_offset));
+ plh->count++;
+ if(prev_offset == OFFSET_NONE) plh->next_offset = entry_offset;
else wp_segment_label_posting_at(pr, prev_offset)->next_offset = entry_offset;
return NO_ERROR;
@@ -615,13 +639,16 @@ wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_
t.word_s = stringmap_string_to_int(sh, sp, label); // will be -1 if not there
// find the posting and the previous posting in the list, if any
- uint32_t prev_offset = OFFSET_NONE;
- uint32_t offset = termhash_get_val(th, t);
docid_t last_docid = DOCID_NONE;
+ uint32_t prev_offset = OFFSET_NONE;
+ posting_list_header* plh = termhash_get_val(th, t);
+ if(plh == NULL) {
+ DEBUG("no such label %s", label);
+ return NO_ERROR;
+ }
- if(offset == (uint32_t)-1) offset = OFFSET_NONE;
+ uint32_t offset = plh->next_offset;
label_posting* lp = NULL;
-
while(offset != OFFSET_NONE) {
lp = wp_segment_label_posting_at(pr, offset);
@@ -646,17 +673,22 @@ wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_
}
// we've found the posting; now remove it from the list
- if(prev_offset == OFFSET_NONE) RELAY_ERROR(termhash_put_val(th, t, lp->next_offset));
+ if(prev_offset == OFFSET_NONE) plh->next_offset = lp->next_offset;
else wp_segment_label_posting_at(pr, prev_offset)->next_offset = lp->next_offset;
+ plh->count--;
// now add it to the dead list for later reclamation
- term dead_term = { .field_s = 0, .word_s = 0 };
- uint32_t dead_offset = termhash_get_val(th, dead_term);
- if(dead_offset == (uint32_t)-1) dead_offset = OFFSET_NONE;
+ posting_list_header* dead_plh = termhash_get_val(th, dead_term);
+ if(dead_plh == NULL) {
+ RELAY_ERROR(termhash_put_val(th, dead_term, &blank_plh));
+ dead_plh = termhash_get_val(th, t);
+ }
- lp->next_offset = dead_offset;
DEBUG("adding dead label posting %u to head of deadlist with next_offset %u", offset, lp->next_offset);
- RELAY_ERROR(termhash_put_val(th, dead_term, offset));
+
+ uint32_t dead_offset = dead_plh->next_offset;
+ lp->next_offset = dead_offset;
+ dead_plh->next_offset = offset;
return NO_ERROR;
}
View
@@ -87,7 +87,7 @@ RAISING_STATIC(snippetize_from_lexer(wp_query* query, lexinfo* charpos, yyscan_t
RARRAY_INIT(pword, words);
while(yylex(*scanner) != TOK_DONE) {
- pword pw = { strdup(yyget_text(*scanner)), charpos->start, charpos->end };
+ pword pw = { .token = strdup(yyget_text(*scanner)), .start = charpos->start, .end = charpos->end };
RARRAY_ADD(pword, words, pw);
}
View
@@ -106,15 +106,15 @@ wp_error* termhash_bump_size(termhash *h) {
// get pointers to the old locations
term* oldkeys = TERMHASH_KEYS(h);
- uint32_t* oldvals = TERMHASH_VALS(h);
+ posting_list_header* oldvals = TERMHASH_VALS(h);
// set pointers to the new locations
uint32_t* newflags = (uint32_t*)h->boundary;
term* newkeys = (term*)(newflags + ((new_n_buckets >> 4) + 1));
- uint32_t* newvals = (uint32_t*)(newkeys + new_n_buckets);
+ posting_list_header* newvals = (posting_list_header*)(newkeys + new_n_buckets);
// move the vals and keys
- memmove(newvals, oldvals, h->n_buckets * sizeof(uint32_t));
+ memmove(newvals, oldvals, h->n_buckets * sizeof(posting_list_header));
memmove(newkeys, oldkeys, h->n_buckets * sizeof(term));
// clear the new flags
@@ -124,8 +124,7 @@ wp_error* termhash_bump_size(termhash *h) {
for (unsigned int j = 0; j != h->n_buckets; ++j) {
if (iseither(flagbaks, j) == 0) {
term key = newkeys[j];
- uint32_t val;
- val = newvals[j];
+ posting_list_header val = newvals[j];
set_isdel_true(flagbaks, j);
while (1) {
uint32_t inc, k, i;
@@ -139,7 +138,7 @@ wp_error* termhash_bump_size(termhash *h) {
set_isempty_false(newflags, i);
if (i < h->n_buckets && iseither(flagbaks, i) == 0) {
{ term tmp = newkeys[i]; newkeys[i] = key; key = tmp; }
- { uint32_t tmp = newvals[i]; newvals[i] = val; val = tmp; }
+ { posting_list_header tmp = newvals[i]; newvals[i] = val; val = tmp; }
set_isdel_true(flagbaks, i);
} else {
newkeys[i] = key;
@@ -235,20 +234,20 @@ void termhash_del(termhash *h, uint32_t x) {
}
}
-uint32_t termhash_get_val(termhash* h, term t) {
- uint32_t* vals = TERMHASH_VALS(h);
+posting_list_header* termhash_get_val(termhash* h, term t) {
+ posting_list_header* vals = TERMHASH_VALS(h);
uint32_t idx = termhash_get(h, t);
- if(idx == h->n_buckets) return (uint32_t)-1;
- return vals[idx];
+ if(idx == h->n_buckets) return NULL;
+ return &vals[idx];
}
-wp_error* termhash_put_val(termhash* h, term t, uint32_t val) {
+wp_error* termhash_put_val(termhash* h, term t, posting_list_header* val) {
int status;
- uint32_t* vals = TERMHASH_VALS(h);
+ posting_list_header* vals = TERMHASH_VALS(h);
uint32_t loc = termhash_put(h, t, &status);
DEBUG("put(%u,%u) has status %d and loc %u (error val is %u)", t.field_s, t.word_s, status, loc, h->n_buckets);
if(status == -1) RAISE_ERROR("out of space in hash");
- vals[loc] = val;
+ memcpy(&vals[loc], val, sizeof(posting_list_header));
return NO_ERROR;
}
@@ -257,22 +256,22 @@ int termhash_needs_bump(termhash* h) {
}
// returns the total size in bytes
-// memory layout: termhash, then:
+// memory layout: termhash struct, then:
// ((n_buckets >> 4) + 1) uint32_t's for the flags
// n_buckets terms for the keys
-// n_buckets uint32_t's for the vals (offsets into postings lists)
+// n_buckets posting_list_header for the vals (offsets into postings lists)
// Total size in bytes of a termhash with n_buckets buckets.
// Layout: the termhash struct itself, followed by
//   ((n_buckets >> 4) + 1) uint32_t's of flags,
//   n_buckets terms (keys), and
//   n_buckets posting_list_headers (vals).
static uint32_t size(uint32_t n_buckets) {
  uint32_t flag_bytes = ((n_buckets >> 4) + 1) * (uint32_t)sizeof(uint32_t);
  uint32_t key_bytes = n_buckets * (uint32_t)sizeof(term);
  uint32_t val_bytes = n_buckets * (uint32_t)sizeof(posting_list_header);
  uint32_t total = (uint32_t)sizeof(termhash) + flag_bytes + key_bytes + val_bytes;

  DEBUG("size of a termhash with %u buckets is %lu + %lu + %lu + %lu = %u",
    n_buckets,
    (long)sizeof(termhash),
    (long)(((n_buckets >> 4) + 1) * sizeof(uint32_t)),
    (long)(n_buckets * sizeof(term)),
    (long)(n_buckets * sizeof(posting_list_header)),
    total);

  return total;
}
Oops, something went wrong.

0 comments on commit 050204c

Please sign in to comment.