Skip to content

Commit

Permalink
WT-2231: this is a replay of the changes for the previous WT-2231 branch
Browse files Browse the repository at this point in the history
(see pull request #2325) for the individual commit details. There's an
additional bug fix over and above those commits, fixing a clang sanitizer
error.

In summary: check the leaf page's parent keys before doing the full
binary search of the leaf page, avoiding the binary search entirely
when the cursor is being re-positioned, at the cost of two additional
searches when the cursor is not being re-positioned. Additionally,
do some work to improve WT_REF page hints.
  • Loading branch information
keithbostic committed Dec 10, 2015
1 parent 6bd151a commit b62ee8a
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 17 deletions.
8 changes: 6 additions & 2 deletions src/btree/bt_page.c
Expand Up @@ -272,7 +272,7 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
const WT_PAGE_HEADER *dsk;
WT_PAGE_INDEX *pindex;
WT_REF **refp, *ref;
uint32_t i;
uint32_t hint, i;

btree = S2BT(session);
dsk = page->dsk;
Expand All @@ -284,9 +284,11 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
*/
pindex = WT_INTL_INDEX_GET_SAFE(page);
refp = pindex->index;
hint = 0;
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
ref = *refp++;
ref->home = page;
ref->pindex_hint = hint++;

__wt_cell_unpack(cell, unpack);
ref->addr = cell;
Expand Down Expand Up @@ -404,7 +406,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
const WT_PAGE_HEADER *dsk;
WT_PAGE_INDEX *pindex;
WT_REF *ref, **refp;
uint32_t i;
uint32_t hint, i;
bool overflow_keys;

btree = S2BT(session);
Expand All @@ -421,9 +423,11 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
pindex = WT_INTL_INDEX_GET_SAFE(page);
refp = pindex->index;
overflow_keys = false;
hint = 0;
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
ref = *refp;
ref->home = page;
ref->pindex_hint = hint++;

__wt_cell_unpack(cell, unpack);
switch (unpack->type) {
Expand Down
40 changes: 26 additions & 14 deletions src/btree/bt_split.c
Expand Up @@ -404,12 +404,12 @@ __split_child_block_evict_and_split(WT_PAGE *child)
*/
static int
__split_ref_move_final(
WT_SESSION_IMPL *session, WT_REF **refp, uint32_t entries)
WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
{
WT_DECL_RET;
WT_PAGE *child;
WT_REF *ref, *child_ref;
uint32_t i;
uint32_t i, j;

/*
* The WT_REF structures moved to newly allocated child pages reference
Expand All @@ -420,8 +420,11 @@ __split_ref_move_final(
* happens the thread waits for the reference's home page to be updated,
* which we do here: walk the children and fix them up.
*/
for (i = 0; i < entries; ++i, ++refp) {
ref = *refp;
for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) {
ref = pindex->index[i];

/* Update the WT_REF's page-index hint. */
ref->pindex_hint = i;

/*
* We don't hold hazard pointers on created pages, they cannot
Expand Down Expand Up @@ -451,6 +454,7 @@ __split_ref_move_final(
* know about that flag; use the standard macros to ensure that
* reading the child's page index structure is safe.
*/
j = 0;
WT_ENTER_PAGE_INDEX(session);
WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
/*
Expand All @@ -459,10 +463,11 @@ __split_ref_move_final(
* disk pages may have been read in since then, and
* those pages would have correct parent references.
*/
if (child_ref->home != child) {
if (child_ref->home != child)
child_ref->home = child;
child_ref->pindex_hint = 0;
}

/* Update the WT_REF's page-index hint. */
child_ref->pindex_hint = j++;
} WT_INTL_FOREACH_END;
WT_LEAVE_PAGE_INDEX(session);

Expand Down Expand Up @@ -638,8 +643,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
__split_verify_intl_key_order(session, root));
#endif
/* Fix up the moved WT_REF structures. */
WT_ERR(__split_ref_move_final(
session, alloc_index->index, alloc_index->entries));
WT_ERR(__split_ref_move_final(session, alloc_index, false));

/* We've installed the allocated page-index, ensure error handling. */
alloc_index = NULL;
Expand Down Expand Up @@ -700,7 +704,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
WT_REF **alloc_refp, *next_ref;
size_t parent_decr, size;
uint64_t split_gen;
uint32_t i, j;
uint32_t hint, i, j;
uint32_t deleted_entries, parent_entries, result_entries;
uint32_t *deleted_refs;
bool complete, empty_parent;
Expand Down Expand Up @@ -768,22 +772,31 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
* Allocate and initialize a new page index array for the parent, then
* copy references from the original index array, plus references from
* the newly created split array, into place.
*
* Update the WT_REF's page-index hint as we go. This can race with a
* thread setting the hint based on an older page-index, and the change
* isn't backed out in the case of an error, so there are ways for the hint
* to be wrong; OK because it's just a hint.
*/
size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *);
WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
parent_incr += size;
alloc_index->index = (WT_REF **)(alloc_index + 1);
alloc_index->entries = result_entries;
for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) {
for (alloc_refp = alloc_index->index,
hint = i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
if (next_ref == ref)
for (j = 0; j < new_entries; ++j) {
ref_new[j]->home = parent;
ref_new[j]->pindex_hint = hint++;
*alloc_refp++ = ref_new[j];
}
else if (next_ref->state != WT_REF_SPLIT)
else if (next_ref->state != WT_REF_SPLIT) {
/* Skip refs we have marked for deletion. */
next_ref->pindex_hint = hint++;
*alloc_refp++ = next_ref;
}
}

/* Check that we filled in all the entries. */
Expand Down Expand Up @@ -1128,8 +1141,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
#endif

/* Fix up the moved WT_REF structures. */
WT_ERR(__split_ref_move_final(
session, alloc_index->index + 1, alloc_index->entries - 1));
WT_ERR(__split_ref_move_final(session, alloc_index, true));

/*
* We don't care about the page-index we allocated, all we needed was
Expand Down
79 changes: 78 additions & 1 deletion src/btree/row_srch.c
Expand Up @@ -131,6 +131,62 @@ __wt_search_insert(
return (0);
}

/*
 * __check_leaf_key_range --
 *	Use the keys stored in the leaf's parent to decide, without searching
 *	the leaf itself, whether the search key can be on the leaf page.
 *
 *	The verdict is left in cbt->compare:
 *	     1	the search key sorts before the parent's key for this leaf
 *		(page keys > search key);
 *	    -1	the search key sorts at or after the parent's key for the
 *		next page (page keys < search key);
 *	     0	neither test excluded the key, or the upper-bound test could
 *		not be made because the page-index hint didn't check out.
 *
 *	Returns 0 unless a key comparison fails, in which case WT_RET hands
 *	that failure back to the caller and cbt->compare is left untouched.
 *	cbt->tmp is used as scratch space for the parent keys.
 */
static inline int
__check_leaf_key_range(WT_SESSION_IMPL *session,
    WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_COLLATOR *collator;
	WT_ITEM *bound;
	WT_PAGE_INDEX *pindex;
	uint32_t slot;
	int order;

	btree = S2BT(session);
	collator = btree->collator;
	bound = cbt->tmp;

	/*
	 * Lower bound: the key the parent holds for this leaf. A search key
	 * sorting before it cannot be on the page.
	 *
	 * NOTE(review): this comparison is made whatever slot the leaf
	 * occupies in its parent; confirm the parent's key for its first
	 * child is always a valid key to compare against.
	 */
	__wt_ref_key(leaf->home, leaf, &bound->data, &bound->size);
	WT_RET(__wt_compare(session, collator, srch_key, bound, &order));
	if (order < 0) {
		cbt->compare = 1;	/* page keys > search key */
		return (0);
	}

	/*
	 * Upper bound: the key the parent holds for the following page.
	 *
	 * !!!
	 * The hint is only a hint and may be stale. Test that "slot + 1" is
	 * inside the page index before anything else: that also proves "slot"
	 * is inside it, which has to be known before the slot is read to see
	 * if it really references this leaf. When either test fails there is
	 * no usable upper bound, so report the key as possibly on the page.
	 */
	WT_INTL_INDEX_GET(session, leaf->home, pindex);
	slot = leaf->pindex_hint;
	if (slot + 1 >= pindex->entries || pindex->index[slot] != leaf) {
		cbt->compare = 0;
		return (0);
	}

	__wt_ref_key(leaf->home,
	    pindex->index[slot + 1], &bound->data, &bound->size);
	WT_RET(__wt_compare(session, collator, srch_key, bound, &order));

	/* At or past the next page's starting key: page keys < search key. */
	cbt->compare = order >= 0 ? -1 : 0;
	return (0);
}

/*
* __wt_row_search --
* Search a row-store tree for a specific key.
Expand Down Expand Up @@ -179,8 +235,29 @@ __wt_row_search(WT_SESSION_IMPL *session,
append_check = insert && cbt->append_tree;
descend_right = true;

/* We may only be searching a single leaf page, not the full tree. */
/*
* We may be searching only a single leaf page, not the full tree. In
* the normal case where the page links to a parent, check the page's
* parent keys before doing the full search, it's faster when the
* cursor is being re-positioned. (One case where the page doesn't
* have a parent is if it is being re-instantiated in memory as part
* of a split).
*/
if (leaf != NULL) {
if (leaf->home != NULL) {
WT_RET(__check_leaf_key_range(
session, srch_key, leaf, cbt));
if (cbt->compare != 0) {
/*
* !!!
* WT_CURSOR.search_near uses the slot value to
* decide if there was an on-page match.
*/
cbt->slot = 0;
return (0);
}
}

current = leaf;
goto leaf_only;
}
Expand Down

0 comments on commit b62ee8a

Please sign in to comment.