Skip to content

Commit

Permalink
mutation_partition_v2: Store range tombstones together with rows
Browse files Browse the repository at this point in the history
This patch changes mutation_partition_v2 to store range tombstone
information together with rows.

This mainly affects the version merging algorithm,
mutation_partition_v2::apply_monotonically().

Setting continuity can no longer drop a dummy entry unconditionally,
since the entry may be the boundary of a range tombstone.

Memtable/cache is not switched yet.

Refs scylladb#10587
Refs scylladb#3288
  • Loading branch information
tgrabiec committed Jan 12, 2023
1 parent ebb32a8 commit ffa6bd0
Show file tree
Hide file tree
Showing 4 changed files with 437 additions and 301 deletions.
13 changes: 13 additions & 0 deletions mutation_partition.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1190,6 +1190,7 @@ bool
rows_entry::equal(const schema& s, const rows_entry& other, const schema& other_schema) const {
    // Two entries compare equal when all of the following hold, checked in
    // this order with an early exit on the first mismatch:
    //   1. their positions are equal under schema s,
    //   2. their range tombstones are equal,
    //   3. their rows are equal on regular columns (each row interpreted
    //      with its own schema: s for this entry, other_schema for other).
    position_in_partition::equal_compare same_position(s);
    if (!same_position(position(), other.position())) {
        return false;
    }
    if (!(_range_tombstone == other._range_tombstone)) {
        return false;
    }
    return row().equal(column_kind::regular_column, s, other.row(), other_schema);
}

Expand Down Expand Up @@ -1544,12 +1545,24 @@ rows_entry::rows_entry(rows_entry&& o) noexcept
, _link(std::move(o._link))
, _key(std::move(o._key))
, _row(std::move(o._row))
, _range_tombstone(std::move(o._range_tombstone))
, _flags(std::move(o._flags))
{
}

void rows_entry::compact(const schema& s, tombstone t) {
can_gc_fn never_gc = [] (tombstone) { return false; };
row().compact_and_expire(s,
t + _range_tombstone,
gc_clock::time_point::min(), // no TTL expiration
never_gc, // no GC
gc_clock::time_point::min()); // no GC
// FIXME: Purge redundant _range_tombstone
}

// Replaces the contents of this entry with those of o, consuming o.
// NOTE(review): swap() is defined elsewhere; presumably it does not exchange
// _range_tombstone and _row, which is why they are moved explicitly
// afterwards — confirm against swap()'s definition.
void rows_entry::replace_with(rows_entry&& o) noexcept {
swap(o);
// Take over o's range tombstone and row by move-assignment.
_range_tombstone = std::move(o._range_tombstone);
_row = std::move(o._row);
}

Expand Down
21 changes: 21 additions & 0 deletions mutation_partition.hh
Original file line number Diff line number Diff line change
Expand Up @@ -903,6 +903,16 @@ class rows_entry final : public evictable {
intrusive_b::member_hook _link;
clustering_key _key;
deletable_row _row;

// Given p is the preceding rows_entry&,
// this tombstone applies to the range (p.position(), position()] if continuous()
// and to [position(), position()] if !continuous().
// So the tombstone applies only to the continuous interval, to the left.
// On top of that, _row.deleted_at() may still apply new information.
// So it's not deoverlapped with the row tombstone.
// Set only when in mutation_partition_v2.
tombstone _range_tombstone;

struct flags {
// _before_ck and _after_ck encode position_in_partition::weight
bool _before_ck : 1;
Expand Down Expand Up @@ -944,6 +954,7 @@ public:
// Constructs a copy of entry e.
// The key, range tombstone and flags are copied member-wise; the row is
// copied through deletable_row's (schema, row) constructor using s.
// _link is not mentioned in the initializer list, so it is
// default-initialized (presumably leaving the new entry unlinked — confirm).
rows_entry(const schema& s, const rows_entry& e)
: _key(e._key)
, _row(s, e._row)
, _range_tombstone(e._range_tombstone)
, _flags(e._flags)
{ }
// Valid only if !dummy()
Expand All @@ -967,6 +978,8 @@ public:
// Returns the continuity flag of this entry as an is_continuous value.
is_continuous continuous() const {
    return is_continuous(_flags._continuous);
}
// Stores value as the continuity flag of this entry.
void set_continuous(bool value) {
    _flags._continuous = value;
}
// Overload taking is_continuous: converts to bool and delegates to
// set_continuous(bool).
void set_continuous(is_continuous value) {
    set_continuous(static_cast<bool>(value));
}
// Replaces the range tombstone stored in this entry with t.
void set_range_tombstone(tombstone t) {
    _range_tombstone = t;
}
// Returns a copy of the range tombstone stored in this entry.
tombstone range_tombstone() const {
    return _range_tombstone;
}
// Returns the dummy flag of this entry as an is_dummy value.
is_dummy dummy() const {
    return is_dummy(_flags._dummy);
}
// Returns the value of the _last_dummy flag of this entry.
bool is_last_dummy() const {
    return _flags._last_dummy;
}
// Stores value as the dummy flag of this entry.
void set_dummy(bool value) {
    _flags._dummy = value;
}
Expand Down Expand Up @@ -1019,6 +1032,8 @@ public:
void on_evicted(cache_tracker&) noexcept;
void on_evicted() noexcept override;

void compact(const schema&, tombstone);

class printer {
const schema& _schema;
const rows_entry& _rows_entry;
Expand Down Expand Up @@ -1103,6 +1118,12 @@ struct apply_resume {
// Builds an apply_resume in the stage::done stage, positioned at the
// partition start.
static apply_resume done() {
    return apply_resume{
        stage::done,
        position_in_partition::for_partition_start()
    };
}

// Sets _pos to an owning copy of pos. The copy and the assignment are
// executed under standard_allocator().
// NOTE(review): presumably _pos must always be owned by the standard
// allocator regardless of the caller's current allocator — confirm.
void set_position(position_in_partition_view pos) {
    auto assign_owned_copy = [this, pos] {
        _pos = position_in_partition(pos);
    };
    with_allocator(standard_allocator(), assign_owned_copy);
}
};

// Represents a set of writes made to a single partition.
Expand Down

0 comments on commit ffa6bd0

Please sign in to comment.