Skip to content

Commit

Permalink
WT-6257 rows out-of-order in history store file (#5780)
Browse files Browse the repository at this point in the history
* Don't assert the handle is set, we're going to drop core in the next instruction anyway.

We use WT_DBG.key in cases other than HS dumps, always allocate it.

* Reset skiphigh for leaf pages, if the page is extended during the search, the high count can be
wrong for the child.

* We allocate a tombstone in all paths, don't do it in two different places.

* The act of searching the tree is what configures future order checks, move the initialization to
the correct place.

Fix a comment that documents behavior that's no longer present.

Clear the cursor order checks on reset. I don't think this is actually necessary, but it's
cleaner, and the HS code calls reset explicitly.

* Now that we clear the cursor-order checks on reset, they may never have been turned on, so
we have to check before doing the work.

* clang-format

* Clang analyzer, skiphigh set but never read.
  • Loading branch information
keithbostic committed Jun 9, 2020
1 parent c31b965 commit 3d0b126
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 39 deletions.
3 changes: 2 additions & 1 deletion src/btree/bt_curnext.c
Expand Up @@ -548,7 +548,8 @@ __wt_cursor_key_order_reset(WT_CURSOR_BTREE *cbt)
/*
* Clear the last-key returned, it doesn't apply.
*/
cbt->lastkey->size = 0;
if (cbt->lastkey != NULL)
cbt->lastkey->size = 0;
cbt->lastrecno = WT_RECNO_OOB;
}
#endif
Expand Down
4 changes: 1 addition & 3 deletions src/btree/bt_debug.c
Expand Up @@ -337,8 +337,6 @@ __wt_debug_addr(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size,
WT_DECL_ITEM(buf);
WT_DECL_RET;

WT_ASSERT(session, S2BT_SAFE(session) != NULL);

bm = S2BT(session)->bm;

WT_RET(__wt_scr_alloc(session, 1024, &buf));
Expand Down Expand Up @@ -1022,10 +1020,10 @@ __debug_page(WT_DBG *ds, WT_REF *ref, uint32_t flags)
WT_SESSION_IMPL *session;

session = ds->session;
WT_RET(__wt_scr_alloc(session, 100, &ds->key));

/* Set up history store support. */
if (!WT_IS_HS(S2BT(session))) {
WT_RET(__wt_scr_alloc(session, 100, &ds->key));
WT_RET(__wt_scr_alloc(session, 0, &ds->hs_key));
WT_RET(__wt_scr_alloc(session, 0, &ds->hs_value));
if (session->hs_cursor == NULL) {
Expand Down
23 changes: 16 additions & 7 deletions src/btree/row_srch.c
Expand Up @@ -242,9 +242,9 @@ __wt_row_search(WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool insert, WT_REF *le
* In some cases we expect we're comparing more than a few keys with matching prefixes, so it's
* faster to avoid the memory fetches by skipping over those prefixes. That's done by tracking
* the length of the prefix match for the lowest and highest keys we compare as we descend the
* tree.
* tree. The high boundary is reset on each new page, the lower boundary is maintained.
*/
skiphigh = skiplow = 0;
skiplow = 0;

/*
* If a cursor repeatedly appends to the tree, compare the search key against the last key on
Expand Down Expand Up @@ -281,7 +281,7 @@ __wt_row_search(WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool insert, WT_REF *le
* Discard the currently held page and restart the search from the root.
*/
WT_RET(__wt_page_release(session, current, 0));
skiphigh = skiplow = 0;
skiplow = 0;
}

/* Search the internal pages of the tree. */
Expand Down Expand Up @@ -352,8 +352,7 @@ __wt_row_search(WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool insert, WT_REF *le
* parent, the child page's key space will have been truncated, and the values from the
* parent's search may be wrong for the child. We only need to reset the high count
* because the split-page algorithm truncates the end of the internal page's key space,
* the low count is still correct. We also don't need to clear either count when
* transitioning to a leaf page, a leaf page's key space can't change in flight.
* the low count is still correct.
*/
skiphigh = 0;

Expand Down Expand Up @@ -501,7 +500,17 @@ __wt_row_search(WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool insert, WT_REF *le
} else if (cmp == 0)
goto leaf_match;
}
else if (collator == NULL)
else if (collator == NULL) {
/*
* Reset the skipped prefix counts; we'd normally expect the parent's skipped prefix values
* to be larger than the child's values and so we'd only increase them as we walk down the
* tree (in other words, if we can skip N bytes on the parent, we can skip at least N bytes
* on the child). However, leaf pages at the end of the tree can be extended, causing the
* parent's search to be wrong for the child. We only need to reset the high count, the page
* can only be extended so the low count is still correct.
*/
skiphigh = 0;

for (; limit != 0; limit >>= 1) {
indx = base + (limit >> 1);
rip = page->pg_row + indx;
Expand All @@ -518,7 +527,7 @@ __wt_row_search(WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool insert, WT_REF *le
else
goto leaf_match;
}
else
} else
for (; limit != 0; limit >>= 1) {
indx = base + (limit >> 1);
rip = page->pg_row + indx;
Expand Down
30 changes: 11 additions & 19 deletions src/history/hs.c
Expand Up @@ -293,6 +293,9 @@ __hs_row_search(WT_CURSOR_BTREE *hs_cbt, WT_ITEM *srch_key, bool insert)

WT_WITH_BTREE(CUR2S(hs_cbt), CUR2BT(hs_cbt),
ret = __wt_row_search(hs_cbt, srch_key, insert, NULL, false, NULL));
#ifdef HAVE_DIAGNOSTIC
WT_TRET(__wt_cursor_key_order_init(hs_cbt));
#endif
return (ret);
}

Expand Down Expand Up @@ -420,10 +423,7 @@ __hs_insert_record_with_btree_int(WT_SESSION_IMPL *session, WT_CURSOR *cursor, W
else
hs_upd = upd_local;

/*
* Search the page and insert the updates. We expect there will be no existing data: assert that
* we don't find a matching key.
*/
/* Search the page and insert the updates. */
WT_WITH_PAGE_INDEX(session, ret = __hs_row_search(cbt, &cursor->key, true));
WT_ERR(ret);
WT_ERR(__wt_hs_modify(cbt, hs_upd));
Expand Down Expand Up @@ -507,23 +507,15 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT
if (upd->start_ts != WT_TS_NONE)
goto done;

/*
* If we inserted an update with no timestamp, we need to delete all history records for that key
* that are further in the history table than us (the key is lexicographically greater). For
* timestamped tables that are occasionally getting a non-timestamped update, that means that all
* timestamped updates should get removed. In the case of non-timestamped tables, that means that
* all updates with higher transaction ids will get removed (which could happen at some more relaxed
* isolation levels).
*/
#ifdef HAVE_DIAGNOSTIC
/*
* We need to initialize the last searched key so that we can do key comparisons when we
* begin iterating over the history store. This needs to be done otherwise the subsequent
* "next" calls will blow up.
* If we inserted an update with no timestamp, we need to delete all history records for that
* key that are further in the history table than us (the key is lexicographically greater). For
* timestamped tables that are occasionally getting a non-timestamped update, that means that
* all timestamped updates should get removed. In the case of non-timestamped tables, that means
* that all updates with higher transaction ids will get removed (which could happen at some
* more relaxed isolation levels). We're pointing at the newly inserted update, iterate once
* more to avoid deleting it.
*/
WT_ERR(__wt_cursor_key_order_init((WT_CURSOR_BTREE *)cursor));
#endif
/* We're pointing at the newly inserted update. Iterate once more to avoid deleting it. */
WT_ERR_NOTFOUND_OK(cursor->next(cursor), true);

/* No records to delete. */
Expand Down
9 changes: 4 additions & 5 deletions src/include/cursor.i
Expand Up @@ -192,6 +192,9 @@ __cursor_reset(WT_CURSOR_BTREE *cbt)
cursor = &cbt->iface;
session = CUR2S(cbt);

#ifdef HAVE_DIAGNOSTIC
__wt_cursor_key_order_reset(cbt); /* Clear key-order checks. */
#endif
__cursor_pos_clear(cbt);

/* If the cursor was active, deactivate it. */
Expand Down Expand Up @@ -385,12 +388,8 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter)

session = CUR2S(cbt);

if (reenter) {
#ifdef HAVE_DIAGNOSTIC
__wt_cursor_key_order_reset(cbt);
#endif
if (reenter)
WT_RET(__cursor_reset(cbt));
}

/*
* Any old insert position is now invalid. We rely on this being cleared to detect if a new
Expand Down
6 changes: 2 additions & 4 deletions src/txn/txn.c
Expand Up @@ -765,8 +765,8 @@ __txn_fixup_prepared_update(
* If the history update already has a stop time point and we are committing the prepared update
* there is no work to do.
*/
WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL));
if (commit) {
WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL));
hs_upd->start_ts = txn->commit_timestamp;
hs_upd->durable_ts = txn->durable_timestamp;
hs_upd->txnid = txn->id;
Expand All @@ -784,9 +784,7 @@ __txn_fixup_prepared_update(
hs_upd->next->durable_ts = fix_upd->durable_ts;
hs_upd->next->start_ts = fix_upd->start_ts;
hs_upd->next->txnid = fix_upd->txnid;
} else
/* Remove the restored update from history store. */
WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL));
}

WT_ERR(__wt_hs_modify(hs_cbt, hs_upd));

Expand Down

0 comments on commit 3d0b126

Please sign in to comment.