Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WT-7757 Skip internal and leaf pages with content obsolete to the reader #6842

Merged
merged 20 commits into from Aug 3, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
b3aa9c2
Skip pages not needed to be read at higher level, not just leaf pages.
sulabhM Jul 22, 2021
2415448
Inline the function __wt_btcur_skip_page
sulabhM Jul 22, 2021
d8605a3
Replace skipped pages next/prev statistics with a single stat
sulabhM Jul 26, 2021
c829d3e
Check for 50% page skips instead of 80%
sulabhM Jul 26, 2021
9fe1c11
Move dirty-page-check to after locking a page
sulabhM Jul 27, 2021
9e4cf56
Change the python test to check for at least 1 skipped page.
sulabhM Jul 28, 2021
64d38dd
Don't use cached copy of the page pointer, in-memory splits could change
sulabhM Jul 28, 2021
c1d46f1
Remove the statistic counting the number of pages skipped.
sulabhM Jul 29, 2021
58038ba
Fix a comment: from not- found to not-found
sulabhM Jul 29, 2021
c9ca2c8
Redo the skip pages function, pull out certain conditions outside it.
sulabhM Jul 29, 2021
29eed49
Merge remote-tracking branch 'origin/develop' into wt-7757-reduce-int…
sulabhM Jul 29, 2021
52de7ef
Fixed a comment about tombstones
sulabhM Jul 29, 2021
833f6ec
Merge remote-tracking branch 'origin/develop' into wt-7757-reduce-int…
sulabhM Jul 29, 2021
9565158
Switch to locking the page instead of using try lock.
sulabhM Jul 29, 2021
bcb8ba9
Allow skipping pages without reading them or racing with eviction.
sulabhM Aug 2, 2021
bc713f8
Forgo modified check if the page is not in memory
sulabhM Aug 2, 2021
ddb471f
Fix matching typo in cursor.prev code.
keithbostic Aug 2, 2021
b6770fd
Add __wt_btcur_skip_page() to the list of functions that can return v…
keithbostic Aug 2, 2021
d132e47
Don't bother checking if the passed-in WT_REF is NULL, that can't hap…
keithbostic Aug 2, 2021
f0509b4
Add safety and future-proof WT_REF.state around the if condition.
sulabhM Aug 3, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions dist/s_void
Expand Up @@ -60,6 +60,7 @@ func_ok()
-e '/int __wt_block_manager_size$/d' \
-e '/int __wt_block_tiered_load$/d' \
-e '/int __wt_block_write_size$/d' \
-e '/int __wt_btcur_skip_page$/d' \
-e '/int __wt_buf_catfmt$/d' \
-e '/int __wt_buf_fmt$/d' \
-e '/int __wt_count_birthmarks$/d' \
Expand Down
2 changes: 0 additions & 2 deletions dist/stat_data.py
Expand Up @@ -811,13 +811,11 @@ def __init__(self, name, desc, flags=''):
CursorStat('cursor_next_hs_tombstone', 'cursor next calls that skip due to a globally visible history store tombstone'),
CursorStat('cursor_next_skip_ge_100', 'cursor next calls that skip greater than or equal to 100 entries'),
CursorStat('cursor_next_skip_lt_100', 'cursor next calls that skip less than 100 entries'),
CursorStat('cursor_next_skip_page_count', 'Total number of pages skipped without reading by cursor next calls'),
CursorStat('cursor_next_skip_total', 'Total number of entries skipped by cursor next calls'),
CursorStat('cursor_open_count', 'open cursor count', 'no_clear,no_scale'),
CursorStat('cursor_prev_hs_tombstone', 'cursor prev calls that skip due to a globally visible history store tombstone'),
CursorStat('cursor_prev_skip_ge_100', 'cursor prev calls that skip greater than or equal to 100 entries'),
CursorStat('cursor_prev_skip_lt_100', 'cursor prev calls that skip less than 100 entries'),
CursorStat('cursor_prev_skip_page_count', 'Total number of pages skipped without reading by cursor prev calls'),
CursorStat('cursor_prev_skip_total', 'Total number of entries skipped by cursor prev calls'),
CursorStat('cursor_search_near_prefix_fast_paths', 'Total number of times a search near has exited due to prefix config'),
CursorStat('cursor_skip_hs_cur_position', 'Total number of entries skipped to position the history store cursor'),
Expand Down
33 changes: 16 additions & 17 deletions src/btree/bt_curnext.c
Expand Up @@ -644,13 +644,12 @@ __wt_btcur_next_prefix(WT_CURSOR_BTREE *cbt, WT_ITEM *prefix, bool truncating)
WT_DECL_RET;
WT_PAGE *page;
WT_SESSION_IMPL *session;
size_t pages_skipped_count, total_skipped, skipped;
size_t total_skipped, skipped;
uint32_t flags;
bool newpage, restart;

cursor = &cbt->iface;
session = CUR2S(cbt);
pages_skipped_count = 0;
total_skipped = 0;

WT_STAT_CONN_DATA_INCR(session, cursor_next);
Expand All @@ -670,24 +669,14 @@ __wt_btcur_next_prefix(WT_CURSOR_BTREE *cbt, WT_ITEM *prefix, bool truncating)
__wt_btcur_iterate_setup(cbt);

/*
* Walk any page we're holding until the underlying call returns not- found. Then, move to the
* Walk any page we're holding until the underlying call returns not-found. Then, move to the
* next page, until we reach the end of the file.
*/
restart = F_ISSET(cbt, WT_CBT_ITERATE_RETRY_NEXT);
F_CLR(cbt, WT_CBT_ITERATE_RETRY_NEXT);
for (newpage = false;; newpage = true, restart = false) {
page = cbt->ref == NULL ? NULL : cbt->ref->page;

/*
sulabhM marked this conversation as resolved.
Show resolved Hide resolved
* Determine if all records on the page have been deleted and all the tombstones are visible
* to our transaction. If so, we can avoid reading the records on the page and move to the
* next page.
*/
if (__wt_btcur_skip_page(cbt)) {
pages_skipped_count++;
goto skip_page;
}

if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
/* The page cannot be NULL if the above flag is set. */
WT_ASSERT(session, page != NULL);
Expand Down Expand Up @@ -759,16 +748,26 @@ __wt_btcur_next_prefix(WT_CURSOR_BTREE *cbt, WT_ITEM *prefix, bool truncating)
WT_STAT_CONN_INCR(session, cache_eviction_force_delete);
}
cbt->page_deleted_count = 0;
skip_page:

if (F_ISSET(cbt, WT_CBT_READ_ONCE))
LF_SET(WT_READ_WONT_NEED);
WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));

/*
* If we are running with snapshot isolation, and not interested in returning tombstones, we
* could potentially skip pages. The skip function looks at the aggregated timestamp
* information to determine if something is visible on the page. If nothing is, the page is
* skipped.
*/
if (session->txn->isolation == WT_ISO_SNAPSHOT &&
!F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE))
hbokhari marked this conversation as resolved.
Show resolved Hide resolved
WT_ERR(
__wt_tree_walk_custom_skip(session, &cbt->ref, __wt_btcur_skip_page, NULL, flags));
else
WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND, false);
}

err:
WT_STAT_CONN_DATA_INCRV(session, cursor_next_skip_page_count, pages_skipped_count);

if (total_skipped < 100)
WT_STAT_CONN_DATA_INCR(session, cursor_next_skip_lt_100);
else
Expand Down
33 changes: 16 additions & 17 deletions src/btree/bt_curprev.c
Expand Up @@ -590,13 +590,12 @@ __wt_btcur_prev_prefix(WT_CURSOR_BTREE *cbt, WT_ITEM *prefix, bool truncating)
WT_DECL_RET;
WT_PAGE *page;
WT_SESSION_IMPL *session;
size_t pages_skipped_count, total_skipped, skipped;
size_t total_skipped, skipped;
uint32_t flags;
bool newpage, restart;

cursor = &cbt->iface;
session = CUR2S(cbt);
pages_skipped_count = 0;
total_skipped = 0;

WT_STAT_CONN_DATA_INCR(session, cursor_prev);
Expand All @@ -617,24 +616,14 @@ __wt_btcur_prev_prefix(WT_CURSOR_BTREE *cbt, WT_ITEM *prefix, bool truncating)
__wt_btcur_iterate_setup(cbt);

/*
* Walk any page we're holding until the underlying call returns not- found. Then, move to the
* Walk any page we're holding until the underlying call returns not-found. Then, move to the
* previous page, until we reach the start of the file.
*/
restart = F_ISSET(cbt, WT_CBT_ITERATE_RETRY_PREV);
F_CLR(cbt, WT_CBT_ITERATE_RETRY_PREV);
for (newpage = false;; newpage = true, restart = false) {
page = cbt->ref == NULL ? NULL : cbt->ref->page;

/*
* Determine if all records on the page have been deleted and all the tombstones are visible
* to our transaction. If so, we can avoid reading the records on the page and move to the
* next page.
*/
if (__wt_btcur_skip_page(cbt)) {
pages_skipped_count++;
goto skip_page;
}

/*
* Column-store pages may have appended entries. Handle it separately from the usual cursor
* code, it's in a simple format.
Expand Down Expand Up @@ -707,16 +696,26 @@ __wt_btcur_prev_prefix(WT_CURSOR_BTREE *cbt, WT_ITEM *prefix, bool truncating)
WT_STAT_CONN_INCR(session, cache_eviction_force_delete);
}
cbt->page_deleted_count = 0;
skip_page:

if (F_ISSET(cbt, WT_CBT_READ_ONCE))
LF_SET(WT_READ_WONT_NEED);
WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));

/*
* If we are running with snapshot isolation, and not interested in returning tombstones, we
* could potentially skip pages. The skip function looks at the aggregated timestamp
* information to determine if something is visible on the page. If nothing is, the page is
* skipped.
*/
if (session->txn->isolation == WT_ISO_SNAPSHOT &&
!F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE))
WT_ERR(
__wt_tree_walk_custom_skip(session, &cbt->ref, __wt_btcur_skip_page, NULL, flags));
else
WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND, false);
}

err:
WT_STAT_CONN_DATA_INCRV(session, cursor_prev_skip_page_count, pages_skipped_count);

if (total_skipped < 100)
WT_STAT_CONN_DATA_INCR(session, cursor_prev_skip_lt_100);
else
Expand Down
50 changes: 18 additions & 32 deletions src/include/btree_inline.h
Expand Up @@ -2024,50 +2024,36 @@ __wt_bt_col_var_cursor_walk_txn_read(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *
* Return if the cursor is pointing to a page with deleted records and can be skipped for cursor
* traversal.
*/
static inline bool
__wt_btcur_skip_page(WT_CURSOR_BTREE *cbt)
static inline int
__wt_btcur_skip_page(WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp)
{
WT_ADDR_COPY addr;
WT_PAGE *page;
WT_REF *ref;
WT_SESSION_IMPL *session;
uint8_t previous_state;
bool can_skip;

session = CUR2S(cbt);
ref = cbt->ref;
page = cbt->ref == NULL ? NULL : cbt->ref->page;
WT_UNUSED(context);

if (page == NULL)
return false;

previous_state = ref->state;
can_skip = false;
*skipp = false; /* Default to reading */

/*
* Determine if all records on the page have been deleted and all the tombstones are visible to
* our transaction. If so, we can avoid reading the records on the page and move to the next
* page. We base this decision on the aggregate stop point added to the page during the last
* reconciliation. We can skip this test if the page has been modified since it was reconciled
* or the underlying cursor is configured to ignore tombstones.
* reconciliation. We can skip this test if the page has been modified since it was reconciled.
*
* We are making these decisions while holding a lock for the page as checkpoint or eviction can
* make changes to the data structures (i.e., aggregate timestamps) we are reading.
* make changes to the data structures (i.e., aggregate timestamps) we are reading. It is okay
* if the page is not in memory, or gets evicted before we lock it. In such a case, we can forgo
* checking if the page has been modified. So, only do a page modified check if the page was in
* memory before locking.
*/
if (session->txn->isolation == WT_ISO_SNAPSHOT && !__wt_page_is_modified(page) &&
!F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE) && previous_state == WT_REF_MEM) {

/* We only try to lock the page once. */
if (!WT_REF_CAS_STATE(session, ref, previous_state, WT_REF_LOCKED))
return false;
WT_REF_LOCK(session, ref, &previous_state);
if ((previous_state == WT_REF_DISK || previous_state == WT_REF_DELETED ||
(previous_state == WT_REF_MEM && !__wt_page_is_modified(ref->page))) &&
__wt_ref_addr_copy(session, ref, &addr) &&
__wt_txn_visible(session, addr.ta.newest_stop_txn, addr.ta.newest_stop_ts) &&
__wt_txn_visible(session, addr.ta.newest_stop_txn, addr.ta.newest_stop_durable_ts))
*skipp = true;
WT_REF_UNLOCK(ref, previous_state);

if (__wt_ref_addr_copy(session, ref, &addr) &&
__wt_txn_visible(session, addr.ta.newest_stop_txn, addr.ta.newest_stop_ts) &&
__wt_txn_visible(session, addr.ta.newest_stop_txn, addr.ta.newest_stop_durable_ts))
can_skip = true;

WT_REF_SET_STATE(ref, previous_state);
}

return (can_skip);
return (0);
}
4 changes: 2 additions & 2 deletions src/include/extern.h
Expand Up @@ -1880,8 +1880,6 @@ static inline WT_IKEY *__wt_ref_key_instantiated(WT_REF *ref)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline WT_VISIBLE_TYPE __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_btcur_skip_page(WT_CURSOR_BTREE *cbt)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_btree_can_evict_dirty(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_btree_dominating_cache(WT_SESSION_IMPL *session, WT_BTREE *btree)
Expand Down Expand Up @@ -1968,6 +1966,8 @@ static inline double __wt_eviction_dirty_target(WT_CACHE *cache)
static inline int __wt_bt_col_var_cursor_walk_txn_read(WT_SESSION_IMPL *session,
WT_CURSOR_BTREE *cbt, WT_PAGE *page, WT_CELL_UNPACK_KV *unpack, WT_COL *cip)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline int __wt_btcur_skip_page(WT_SESSION_IMPL *session, WT_REF *ref, void *context,
bool *skipp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline int __wt_btree_block_free(WT_SESSION_IMPL *session, const uint8_t *addr,
size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline int __wt_buf_extend(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
Expand Down
4 changes: 0 additions & 4 deletions src/include/stat.h
Expand Up @@ -503,8 +503,6 @@ struct __wt_connection_stats {
int64_t cursor_next_skip_total;
int64_t cursor_prev_skip_total;
int64_t cursor_skip_hs_cur_position;
int64_t cursor_next_skip_page_count;
int64_t cursor_prev_skip_page_count;
int64_t cursor_search_near_prefix_fast_paths;
int64_t cursor_cached_count;
int64_t cursor_insert_bulk;
Expand Down Expand Up @@ -932,8 +930,6 @@ struct __wt_dsrc_stats {
int64_t cursor_next_skip_total;
int64_t cursor_prev_skip_total;
int64_t cursor_skip_hs_cur_position;
int64_t cursor_next_skip_page_count;
int64_t cursor_prev_skip_page_count;
int64_t cursor_search_near_prefix_fast_paths;
int64_t cursor_insert_bulk;
int64_t cursor_reopen;
Expand Down