Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WT-8747 Reading between commit_timestamp and durable_timestamp can produce inconsistency #7485

Closed
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
1400caf
Prohibit reading for updates that are committed but not stable.
sauclovian-wt Jan 29, 2022
13732cf
Fixes.
sauclovian-wt Feb 1, 2022
293957a
Fix up assorted tests.
sauclovian-wt Feb 1, 2022
dcf4fdf
Fix up the RTS tests.
sauclovian-wt Feb 2, 2022
41dc3fd
Handle on-disk nondurable stop times as well as start times.
sauclovian-wt Feb 3, 2022
7854c8f
Resort enum and fix incomprehensible comment.
sauclovian-wt Feb 3, 2022
eb237a6
Merge branch 'develop' into wt-8747-read-nondurable
sauclovian-wt Feb 4, 2022
7f65022
Update the docs. Add a warning about the cases we still can't detect.
sauclovian-wt Feb 4, 2022
c18971d
Switch to returning WT_PREPARE_CONFLICT for this instead of WT_ROLLBACK.
sauclovian-wt Feb 8, 2022
4cd4818
Honor ignore_prepare=true. Test that it works.
sauclovian-wt Feb 16, 2022
1a9ce37
Merge branch 'develop' into wt-8747-read-nondurable
sauclovian-wt Feb 16, 2022
9af2e1f
Fix tests that failed in the last test run, for posterity.
sauclovian-wt Feb 23, 2022
ba8837f
Merge branch 'develop' into wt-8747-read-nondurable
sauclovian-wt Feb 23, 2022
b5a8b44
Take a different approach for WT-8747.
sauclovian-wt Feb 24, 2022
9c2105d
WT-8866 Disable LSM performance tests in Evergreen (#7576)
lukech Feb 23, 2022
0fb96bc
WT-8708 Fix timestamp usage error in test/checkpoint (#7582)
keithbostic Feb 23, 2022
5907987
WT-8706 Increase the cache size in all test_checkpoint scenarios (#7577)
etienneptl Feb 23, 2022
978cae0
WT-6422 Document expectation around ignore_prepare (#7568)
keithbostic Feb 23, 2022
8d63999
WT-8695 Remove file_close_sync config (#7583)
quchenhao Feb 24, 2022
2065eff
Initial merge with the WT-8170 change was no good; try again.
sauclovian-wt Feb 24, 2022
64d6d3c
Rewrite the new durable_ts04 for this approach.
sauclovian-wt Feb 24, 2022
c4c3e75
Adjust durable_ts04 to avoid tripping on an unrelated issue.
sauclovian-wt Feb 25, 2022
7a54e23
Merge branch 'develop' into wt-8747-read-nondurable
sauclovian-wt Feb 26, 2022
55d2ff0
Fix up merge botches.
sauclovian-wt Feb 26, 2022
39fa982
Merge branch 'develop' into wt-8747-read-nondurable
sauclovian-wt Feb 26, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions dist/s_string.ok
Expand Up @@ -1180,6 +1180,7 @@ noclear
nocrypto
nolock
nonces
nondurable
nonliteral
nontransactionally
noop
Expand Down
6 changes: 6 additions & 0 deletions src/include/extern.h
Expand Up @@ -1961,10 +1961,16 @@ static inline bool __wt_session_can_wait(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_split_descent_race(WT_SESSION_IMPL *session, WT_REF *ref,
WT_PAGE_INDEX *saved_pindex) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_txn_ts_nondurable(WT_SESSION_IMPL *session, uint64_t txnid,
uint64_t commit_ts, uint64_t durable_ts) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_txn_tw_start_nondurable(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_txn_tw_start_visible(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_txn_tw_start_visible_all(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_txn_tw_stop_nondurable(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_txn_tw_stop_visible(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline bool __wt_txn_tw_stop_visible_all(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw)
Expand Down
8 changes: 5 additions & 3 deletions src/include/txn.h
Expand Up @@ -21,6 +21,7 @@
*/
#define WT_TXN_ROLLBACK_REASON_CACHE "oldest pinned transaction ID rolled back for eviction"
#define WT_TXN_ROLLBACK_REASON_CONFLICT "conflict between concurrent operations"
#define WT_TXN_ROLLBACK_REASON_NONDURABLE "read conflict with committed but non-durable value"

/* AUTOMATIC FLAG VALUE GENERATION START 0 */
#define WT_TXN_LOG_CKPT_CLEANUP 0x01u
Expand All @@ -42,9 +43,10 @@
/* AUTOMATIC FLAG VALUE GENERATION STOP 32 */

typedef enum {
WT_VISIBLE_FALSE = 0, /* Not a visible update */
WT_VISIBLE_PREPARE = 1, /* Prepared update */
WT_VISIBLE_TRUE = 2 /* A visible update */
WT_VISIBLE_FALSE = 0, /* Not a visible update */
WT_VISIBLE_PREPARE = 1, /* Prepared update */
WT_VISIBLE_NONDURABLE = 2, /* Committed but not durable update */
WT_VISIBLE_TRUE = 3 /* A visible update */
keithbostic marked this conversation as resolved.
Show resolved Hide resolved
} WT_VISIBLE_TYPE;

/*
Expand Down
87 changes: 85 additions & 2 deletions src/include/txn_inline.h
Expand Up @@ -606,6 +606,41 @@ __wt_txn_upd_value_visible_all(WT_SESSION_IMPL *session, WT_UPDATE_VALUE *upd_va
__wt_txn_visible_all(session, upd_value->tw.start_txn, upd_value->tw.durable_start_ts));
}

/*
* __wt_txn_ts_nondurable --
* Is the given timestamp not yet durable? It is durable unless we are reading between the
* commit time the durable time *and* the global stable timestamp has not yet advanced past the
* durable time. (Note that it is not sufficient to just check that the global stable timestamp
* has advanced past the commit time; prepared transactions are allowed to commit earlier than
* stable if they were prepared after stable and stable has since moved up.) Assumes caller has
* already checked that the commit time is visible and skips checks that makes redundant.
*/
keithbostic marked this conversation as resolved.
Show resolved Hide resolved
static inline bool
__wt_txn_ts_nondurable(
WT_SESSION_IMPL *session, uint64_t txnid, uint64_t commit_ts, uint64_t durable_ts)
{
WT_TXN_GLOBAL *txn_global;

txn_global = &S2C(session)->txn_global;

/* If there is no gap between commit time and durable time, we can't read between them. */
if (durable_ts == commit_ts)
return (false);

/*
* If the durable time is stable, the value might or might not actually be durable (depending on
* whether a checkpoint has been completed) but we (the caller) cannot become durable before it,
* so for us it counts as durable. The purpose of this check is to make sure we can't read a
* nondurable value, then commit and become durable before it, because that violates consistency
* expectations.
*/
if (durable_ts <= txn_global->stable_timestamp)
return (false);

/* Otherwise, if the durable time is not visible, we are reading before it. */
return (!__wt_txn_visible(session, txnid, durable_ts));
}

/*
* __wt_txn_tw_stop_visible --
* Is the given stop time window visible?
Expand All @@ -617,6 +652,18 @@ __wt_txn_tw_stop_visible(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw)
__wt_txn_visible(session, tw->stop_txn, tw->stop_ts));
}

/*
* __wt_txn_tw_stop_nondurable --
* Is the given stop time window not yet durable? See notes about the start time below. Assumes
* the caller has already checked that the stop time is visible, so among other things we know
* that the time window *does* have a stop time.
*/
static inline bool
__wt_txn_tw_stop_nondurable(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw)
{
return (__wt_txn_ts_nondurable(session, tw->stop_txn, tw->stop_ts, tw->durable_stop_ts));
}

/*
* __wt_txn_tw_start_visible --
* Is the given start time window visible?
Expand All @@ -635,6 +682,17 @@ __wt_txn_tw_start_visible(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw)
__wt_txn_visible(session, tw->start_txn, tw->start_ts));
}

/*
* __wt_txn_tw_start_nondurable --
* Is the given start time window not yet durable? Assumes caller has already checked that the
* start time is visible.
*/
static inline bool
__wt_txn_tw_start_nondurable(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw)
{
return (__wt_txn_ts_nondurable(session, tw->start_txn, tw->start_ts, tw->durable_start_ts));
}

/*
* __wt_txn_tw_start_visible_all --
* Is the given start time window visible to all (possible) readers?
Expand Down Expand Up @@ -763,9 +821,12 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id, wt_timestamp_t timestamp
static inline WT_VISIBLE_TYPE
__wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
WT_TXN_GLOBAL *txn_global;
uint8_t prepare_state, previous_state;
bool upd_visible;

txn_global = &S2C(session)->txn_global;

for (;; __wt_yield()) {
/* Prepare state change is in progress, yield and try again. */
WT_ORDERED_READ(prepare_state, upd->prepare_state);
Expand All @@ -778,6 +839,12 @@ __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd)
return (WT_VISIBLE_TRUE);

upd_visible = __wt_txn_visible(session, upd->txnid, upd->start_ts);
if (upd_visible && upd->durable_ts > upd->start_ts &&
upd->durable_ts > txn_global->stable_timestamp) {
upd_visible = __wt_txn_visible(session, upd->txnid, upd->durable_ts);
if (!upd_visible)
return (WT_VISIBLE_NONDURABLE);
}

/*
* The visibility check is only valid if the update does not change state. If the state does
Expand Down Expand Up @@ -912,6 +979,9 @@ __wt_txn_read_upd_list_internal(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
if (upd_visible == WT_VISIBLE_TRUE)
break;

if (upd_visible == WT_VISIBLE_NONDURABLE)
return (__wt_txn_rollback_required(session, WT_TXN_ROLLBACK_REASON_NONDURABLE));

/*
* Save the prepared update to help us detect if we race with prepared commit or rollback
* irrespective of update visibility.
Expand Down Expand Up @@ -981,10 +1051,11 @@ __wt_txn_read(
{
WT_TIME_WINDOW tw;
WT_UPDATE *prepare_upd, *restored_upd;
bool have_stop_tw, retry;
bool have_stop_tw, retry, return_onpage_value;

prepare_upd = restored_upd = NULL;
retry = true;
return_onpage_value = false;

retry:
WT_RET(__wt_txn_read_upd_list_internal(session, cbt, upd, &prepare_upd, &restored_upd));
Expand Down Expand Up @@ -1026,6 +1097,9 @@ __wt_txn_read(
*/
if (!have_stop_tw && __wt_txn_tw_stop_visible(session, &tw) &&
!F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE)) {
if (__wt_txn_tw_stop_nondurable(session, &tw))
/* Disallow reading if we can see it but it isn't durable yet. */
return (__wt_txn_rollback_required(session, WT_TXN_ROLLBACK_REASON_NONDURABLE));
cbt->upd_value->buf.data = NULL;
cbt->upd_value->buf.size = 0;
cbt->upd_value->tw.durable_stop_ts = tw.durable_stop_ts;
Expand All @@ -1049,7 +1123,16 @@ __wt_txn_read(
* 1. The record is from the history store.
* 2. It is visible to the reader.
*/
if (WT_IS_HS(session->dhandle) || __wt_txn_tw_start_visible(session, &tw)) {
if (WT_IS_HS(session->dhandle))
return_onpage_value = true;
else if (__wt_txn_tw_start_visible(session, &tw)) {
if (__wt_txn_tw_start_nondurable(session, &tw))
/* Disallow reading if we can see it but it isn't durable yet. */
return (__wt_txn_rollback_required(session, WT_TXN_ROLLBACK_REASON_NONDURABLE));
return_onpage_value = true;
}

if (return_onpage_value) {
if (cbt->upd_value->skip_buf) {
cbt->upd_value->buf.data = NULL;
cbt->upd_value->buf.size = 0;
Expand Down
2 changes: 1 addition & 1 deletion test/suite/test_durable_rollback_to_stable.py
Expand Up @@ -100,7 +100,7 @@ def test_durable_rollback_to_stable(self):

# Read the first update value with timestamp.
self.assertEquals(cursor.reset(), 0)
session.begin_transaction('read_timestamp=' + self.timestamp_str(200))
session.begin_transaction('read_timestamp=' + self.timestamp_str(220))
self.assertEquals(cursor.next(), 0)
for i in range(1, 50):
self.assertEquals(cursor.get_value(), ds.value(111))
Expand Down
2 changes: 1 addition & 1 deletion test/suite/test_durable_ts01.py
Expand Up @@ -99,7 +99,7 @@ def test_durable_ts01(self):

# Read the first update value with timestamp.
self.assertEquals(cursor.reset(), 0)
session.begin_transaction('read_timestamp=' + self.timestamp_str(200))
session.begin_transaction('read_timestamp=' + self.timestamp_str(220))
self.assertEquals(cursor.next(), 0)
for i in range(1, 50):
self.assertEquals(cursor.get_value(), ds.value(111))
Expand Down
10 changes: 9 additions & 1 deletion test/suite/test_durable_ts03.py
Expand Up @@ -97,9 +97,17 @@ def test_durable_ts03(self):
self.assertEqual(value, valueA)
session.commit_transaction()

# Read the updated data to confirm that it is visible.
# Check that the updated data cannot be read while it is not yet durable.
self.assertEquals(cursor.reset(), 0)
session.begin_transaction('read_timestamp=' + self.timestamp_str(210))
cursor.set_key(1)
self.assertRaisesException(wiredtiger.WiredTigerError,
lambda: cursor.search(), '/WT_ROLLBACK/', False)
session.rollback_transaction()

# Read the updated data to confirm that it is visible.
self.assertEquals(cursor.reset(), 0)
session.begin_transaction('read_timestamp=' + self.timestamp_str(220))
for key, value in cursor:
self.assertEqual(value, valueB)
session.commit_transaction()
Expand Down