Skip to content

Commit

Permalink
vinyl: fix secondary index divergence on update
Browse files Browse the repository at this point in the history
If an UPDATE request doesn't touch key parts of a secondary index, we
don't need to write it to the index memory level or dump it to disk, as
this would only increase IO load. Historically, we use column mask set
by the UPDATE operation to skip secondary indexes that are not affected
by the operation on commit. However, there's a problem here: the column
mask isn't precise - it may have a bit set even if the corresponding
column doesn't really get updated, e.g. consider {'+', 2, 0}. Not taking
this into account may result in appearance of phantom tuples on disk as
the write iterator assumes that statements that have no effect aren't
written to secondary indexes (this is needed to apply INSERT+DELETE
"annihilation" optimization). We fixed that by clearing column mask bits
in vy_tx_set in case we detect that the key isn't changed, for more
details see #3607 and commit e72867c ("vinyl: fix appearance of
phantom tuple in secondary index after update"). It was rather an ugly
hack, but it worked.

However, it turned out that apart from looking hackish this code has
a nasty bug that may lead to tuples missing from secondary indexes.
Consider the following example:

  s = box.schema.space.create('test', {engine = 'vinyl'})
  s:create_index('pk')
  s:create_index('sk', {parts = {2, 'unsigned'}})
  s:insert{1, 1, 1}

  box.begin()
  s:update(1, {{'=', 2, 2}})
  s:update(1, {{'=', 3, 2}})
  box.commit()

The first update operation writes DELETE{1,1} and REPLACE{2,1} to the
secondary index write set. The second update replaces REPLACE{2,1} with
DELETE{2,1} and then with REPLACE{2,1}. When replacing DELETE{2,1} with
REPLACE{2,1} in the write set, we assume that the update doesn't modify
secondary index key parts and clear the column mask so as not to commit
a pointless request, see vy_tx_set. As a result, we skip the first
update too and get key {2,1} missing from the secondary index.

Actually, it was a dumb idea to use column mask to skip statements in
the first place, as there's a much easier way to filter out statements
that have no effect for secondary indexes. The thing is every DELETE
statement inserted into a secondary index write set acts as a "single
DELETE", i.e. there's exactly one older statement it is supposed to
purge. This is, because in contrast to the primary index we don't write
DELETE statements blindly - we always look up the tuple overwritten in
the primary index first. This means that REPLACE+DELETE for the same key
is basically a no-op and can be safely skip. Moreover, DELETE+REPLACE
can be treated as no-op, too, because secondary indexes don't store full
tuples hence all REPLACE statements for the same key are equivalent.
By marking such pair of statements as no-op in vy_tx_set, we guarantee
that no-op statements don't make it to secondary index memory or disk
levels.

Closes #4242
  • Loading branch information
locker committed May 24, 2019
1 parent d07e3ac commit 8e4175c
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 66 deletions.
7 changes: 3 additions & 4 deletions src/box/vinyl.c
Expand Up @@ -1870,7 +1870,7 @@ vy_perform_update(struct vy_env *env, struct vy_tx *tx, struct txn_stmt *stmt,

vy_stmt_set_flags(stmt->new_tuple, VY_STMT_UPDATE);

if (vy_tx_set_with_colmask(tx, pk, stmt->new_tuple, column_mask) != 0)
if (vy_tx_set(tx, pk, stmt->new_tuple) != 0)
return -1;
if (space->index_count == 1)
return 0;
Expand All @@ -1884,10 +1884,9 @@ vy_perform_update(struct vy_env *env, struct vy_tx *tx, struct txn_stmt *stmt,
struct vy_lsm *lsm = vy_lsm(space->index[i]);
if (vy_is_committed(env, lsm))
continue;
if (vy_tx_set_with_colmask(tx, lsm, delete, column_mask) != 0)
if (vy_tx_set(tx, lsm, delete) != 0)
goto error;
if (vy_tx_set_with_colmask(tx, lsm, stmt->new_tuple,
column_mask) != 0)
if (vy_tx_set(tx, lsm, stmt->new_tuple) != 0)
goto error;
}
tuple_unref(delete);
Expand Down
70 changes: 21 additions & 49 deletions src/box/vy_tx.c
Expand Up @@ -51,7 +51,6 @@
#include "trigger.h"
#include "trivia/util.h"
#include "tuple.h"
#include "column_mask.h"
#include "vy_cache.h"
#include "vy_lsm.h"
#include "vy_mem.h"
Expand Down Expand Up @@ -212,8 +211,7 @@ tx_manager_destroy_read_view(struct tx_manager *xm,
}

static struct txv *
txv_new(struct vy_tx *tx, struct vy_lsm *lsm,
struct vy_entry entry, uint64_t column_mask)
txv_new(struct vy_tx *tx, struct vy_lsm *lsm, struct vy_entry entry)
{
struct tx_manager *xm = tx->xm;
struct txv *v = mempool_alloc(&xm->txv_mempool);
Expand All @@ -227,9 +225,9 @@ txv_new(struct vy_tx *tx, struct vy_lsm *lsm,
v->entry = entry;
tuple_ref(entry.stmt);
v->region_stmt = NULL;
v->column_mask = column_mask;
v->tx = tx;
v->is_first_insert = false;
v->is_nop = false;
v->is_overwritten = false;
v->overwritten = NULL;
xm->write_set_size += tuple_size(entry.stmt);
Expand Down Expand Up @@ -631,8 +629,7 @@ vy_tx_handle_deferred_delete(struct vy_tx *tx, struct txv *v)
struct vy_lsm *lsm = vy_lsm(space->index[i]);
struct vy_entry entry;
vy_stmt_foreach_entry(entry, delete_stmt, lsm->cmp_def) {
struct txv *delete_txv = txv_new(tx, lsm, entry,
UINT64_MAX);
struct txv *delete_txv = txv_new(tx, lsm, entry);
if (delete_txv == NULL) {
rc = -1;
break;
Expand Down Expand Up @@ -704,13 +701,7 @@ vy_tx_prepare(struct vy_tx *tx)
assert(lsm->space_id == current_space_id);

/* Do not save statements that was overwritten by the same tx */
if (v->is_overwritten)
continue;

/* Skip statements which don't change this secondary key. */
if (lsm->index_id > 0 &&
key_update_can_be_skipped(lsm->key_def->column_mask,
v->column_mask))
if (v->is_overwritten || v->is_nop)
continue;

if (lsm->index_id > 0 && repsert == NULL && delete == NULL) {
Expand Down Expand Up @@ -1038,8 +1029,7 @@ vy_tx_track_point(struct vy_tx *tx, struct vy_lsm *lsm, struct vy_entry entry)
* are multiple entries of a single statement in a single index.
*/
static int
vy_tx_set_entry(struct vy_tx *tx, struct vy_lsm *lsm,
struct vy_entry entry, uint64_t column_mask)
vy_tx_set_entry(struct vy_tx *tx, struct vy_lsm *lsm, struct vy_entry entry)
{
assert(vy_stmt_type(entry.stmt) != 0);
/**
Expand Down Expand Up @@ -1071,7 +1061,7 @@ vy_tx_set_entry(struct vy_tx *tx, struct vy_lsm *lsm,
}

/* Allocate a MVCC container. */
struct txv *v = txv_new(tx, lsm, entry, column_mask);
struct txv *v = txv_new(tx, lsm, entry);
if (applied.stmt != NULL)
tuple_unref(applied.stmt);
if (v == NULL)
Expand All @@ -1089,38 +1079,21 @@ vy_tx_set_entry(struct vy_tx *tx, struct vy_lsm *lsm,
if (old == NULL && vy_stmt_type(entry.stmt) == IPROTO_INSERT)
v->is_first_insert = true;

if (old != NULL) {
/*
* Inherit the column mask of the overwritten statement
* so as not to skip both statements on commit.
*/
v->column_mask |= old->column_mask;
}

if (lsm->index_id > 0 && vy_stmt_type(entry.stmt) == IPROTO_REPLACE &&
old != NULL && vy_stmt_type(old->entry.stmt) == IPROTO_DELETE) {
if (lsm->index_id > 0 && old != NULL && !old->is_nop) {
/*
* The column mask of an update operation may have a bit
* set even if the corresponding field doesn't actually
* get updated, because a column mask is generated
* only from update operations, before applying them.
* E.g. update(1, {{'+', 2, 0}}) doesn't modify the
* second field, but the column mask will say it does.
*
* To discard DELETE statements in the write iterator
* (see optimization #5), we turn a REPLACE into an
* INSERT in case the REPLACE was generated by an
* update that changed secondary key fields. So we
* can't tolerate inaccuracy in a column mask.
*
* So if the update didn't actually modify secondary
* key fields, i.e. DELETE and REPLACE generated by the
* update have the same key fields, we forcefully clear
* key bits in the column mask to ensure that no REPLACE
* statement will be written for this secondary key.
* In a secondary index write set, DELETE statement purges
* exactly one older statement so REPLACE + DELETE is no-op.
* Moreover, DELETE + REPLACE can be treated as no-op, too,
* because secondary indexes don't store full tuples hence
* all REPLACE statements for the same key are equivalent.
* Therefore we can zap DELETE + REPLACE as there must be
* an older REPLACE for the same key stored somewhere in the
* index data.
*/
if (v->column_mask != UINT64_MAX)
v->column_mask &= ~lsm->cmp_def->column_mask;
enum iproto_type type = vy_stmt_type(entry.stmt);
enum iproto_type old_type = vy_stmt_type(old->entry.stmt);
if ((type == IPROTO_DELETE) != (old_type == IPROTO_DELETE))
v->is_nop = true;
}

v->overwritten = old;
Expand All @@ -1133,12 +1106,11 @@ vy_tx_set_entry(struct vy_tx *tx, struct vy_lsm *lsm,
}

int
vy_tx_set_with_colmask(struct vy_tx *tx, struct vy_lsm *lsm,
struct tuple *stmt, uint64_t column_mask)
vy_tx_set(struct vy_tx *tx, struct vy_lsm *lsm, struct tuple *stmt)
{
struct vy_entry entry;
vy_stmt_foreach_entry(entry, stmt, lsm->cmp_def) {
if (vy_tx_set_entry(tx, lsm, entry, column_mask) != 0)
if (vy_tx_set_entry(tx, lsm, entry) != 0)
return -1;
}
return 0;
Expand Down
17 changes: 6 additions & 11 deletions src/box/vy_tx.h
Expand Up @@ -87,8 +87,6 @@ struct txv {
struct vy_entry entry;
/** Statement allocated on vy_mem->allocator. */
struct tuple *region_stmt;
/** Mask of columns modified by this operation. */
uint64_t column_mask;
/** Next in the transaction log. */
struct stailq_entry next_in_log;
/** Member the transaction write set. */
Expand All @@ -100,6 +98,11 @@ struct txv {
* last committed statement is DELETE.
*/
bool is_first_insert;
/**
* True if the statement has no effect and can be safely
* skipped on commit.
*/
bool is_nop;
/**
* True if the txv was overwritten by another txv of
* the same transaction.
Expand Down Expand Up @@ -397,20 +400,12 @@ vy_tx_track_point(struct vy_tx *tx, struct vy_lsm *lsm, struct vy_entry entry);
* @param tx Transaction.
* @param lsm LSM tree the statement is for.
* @param stmt Statement.
* @param column_mask Mask of fields modified by the statement.
*
* @retval 0 Success
* @retval -1 Memory allocation error.
*/
int
vy_tx_set_with_colmask(struct vy_tx *tx, struct vy_lsm *lsm,
struct tuple *stmt, uint64_t column_mask);

static inline int
vy_tx_set(struct vy_tx *tx, struct vy_lsm *lsm, struct tuple *stmt)
{
return vy_tx_set_with_colmask(tx, lsm, stmt, UINT64_MAX);
}
vy_tx_set(struct vy_tx *tx, struct vy_lsm *lsm, struct tuple *stmt);

/**
* Iterator over the write set of a transaction.
Expand Down
30 changes: 30 additions & 0 deletions test/engine/update.result
Expand Up @@ -758,3 +758,33 @@ s:upsert({2, 666}, {{'=', 2, 666}})
s:drop()
---
...
--
-- gh-4242 Tuple is missing from secondary index after update.
--
s = box.schema.space.create('test', {engine = engine})
---
...
pk = s:create_index('pk')
---
...
sk = s:create_index('sk', {parts = {2, 'unsigned'}})
---
...
s:insert{1, 1, 1}
---
- [1, 1, 1]
...
box.begin() s:update(1, {{'=', 2, 2}}) s:update(1, {{'=', 3, 2}}) box.commit()
---
...
pk:select()
---
- - [1, 2, 2]
...
sk:select()
---
- - [1, 2, 2]
...
s:drop()
---
...
12 changes: 12 additions & 0 deletions test/engine/update.test.lua
Expand Up @@ -122,3 +122,15 @@ box.space.tst_sample:get(2).VAL
-- invalid upsert
s:upsert({2, 666}, {{'=', 2, 666}})
s:drop()

--
-- gh-4242 Tuple is missing from secondary index after update.
--
s = box.schema.space.create('test', {engine = engine})
pk = s:create_index('pk')
sk = s:create_index('sk', {parts = {2, 'unsigned'}})
s:insert{1, 1, 1}
box.begin() s:update(1, {{'=', 2, 2}}) s:update(1, {{'=', 3, 2}}) box.commit()
pk:select()
sk:select()
s:drop()
2 changes: 1 addition & 1 deletion test/vinyl/write_iterator.result
Expand Up @@ -767,7 +767,7 @@ box.snapshot()
- ok
...
-- Some padding to trigger minor compaction.
for i = 1001, 1000 + PAD2 do s:replace{i, i} end
for i = 1001, 1000 + PAD2 do s:delete{i} end
---
...
box.snapshot()
Expand Down
2 changes: 1 addition & 1 deletion test/vinyl/write_iterator.test.lua
Expand Up @@ -328,7 +328,7 @@ PAD2 = 15
for i = 1001, 1000 + PAD1 do s:replace{i, i} end
box.snapshot()
-- Some padding to trigger minor compaction.
for i = 1001, 1000 + PAD2 do s:replace{i, i} end
for i = 1001, 1000 + PAD2 do s:delete{i} end
box.snapshot()
-- Generate some INSERT statements and dump them to disk.
_ = s:insert{1, 1} -- insert
Expand Down

0 comments on commit 8e4175c

Please sign in to comment.