replication: fix extraneous split-brain alerting
The current split-brain detector implementation raises an error each
time a CONFIRM or ROLLBACK entry is received from the previous
synchronous transaction queue owner. It assumes that the new queue
owner must have witnessed all the previous CONFIRMs and, besides, that
according to Raft a ROLLBACK should never happen.

Actually, there is a case when a CONFIRM from an old term is legal: it
is possible that during a leader transition the old leader writes a
CONFIRM for the same transaction that is confirmed by the new leader's
PROMOTE. If the PROMOTE and CONFIRM lsns match, there is nothing wrong
with such a situation.

Symmetrically, when the old leader issues a ROLLBACK with the lsn right
after the new leader's PROMOTE lsn, it is not a split-brain either.

Allow such cases by tracking the last confirmed lsn for each
synchronous transaction queue owner and silently nopifying CONFIRMs
with an lsn not greater than the recorded one and ROLLBACKs with an lsn
greater than it.
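
In short, the decision boils down to comparing the request's lsn with
the owner's last known confirmed lsn. Below is a minimal sketch of that
rule, detached from the applier internals; the enum and function names
are illustrative only, while the real logic lives in
applier_synchro_filter_tx() in src/box/applier.cc.

#include <stdbool.h>
#include <stdint.h>

enum old_term_action {
	OLD_TERM_NOPIFY,	/* silently turn the request into a NOP */
	OLD_TERM_CHECK,		/* let txn_limbo run the split-brain checks */
};

/*
 * Decide what to do with a CONFIRM or ROLLBACK coming from an old term.
 * confirmed_lsn is the last lsn known to be confirmed on behalf of the
 * request's queue owner.
 */
static enum old_term_action
old_term_synchro_action(bool is_confirm, int64_t req_lsn,
			int64_t confirmed_lsn)
{
	if (is_confirm) {
		/* Everything up to confirmed_lsn is already confirmed. */
		return req_lsn <= confirmed_lsn ?
		       OLD_TERM_NOPIFY : OLD_TERM_CHECK;
	}
	/* ROLLBACK: only lsns above confirmed_lsn may still be rolled back. */
	return req_lsn > confirmed_lsn ? OLD_TERM_NOPIFY : OLD_TERM_CHECK;
}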

Closes #9138

NO_DOC=bugfix

(cherry picked from commit ffa6ac1)
sergepetrenko committed Dec 2, 2023
1 parent bcbe923 commit 718aeb1
Showing 4 changed files with 204 additions and 14 deletions.
6 changes: 6 additions & 0 deletions changelogs/unreleased/gh-9138-relax-split-brain-check.md
@@ -0,0 +1,6 @@
## bugfix/replication

* Fixed a false-positive split-brain error when an old synchronous transaction
queue owner confirmed the same transactions which were already confirmed by
the new queue owner, or rolled back the same transactions which were rolled
back by the new queue owner (gh-9138).
44 changes: 36 additions & 8 deletions src/box/applier.cc
@@ -1517,14 +1517,14 @@ applier_synchro_filter_tx(struct stailq *rows)
* It may happen that we receive the instance's rows via some third
* node, so cannot check for applier->instance_id here.
*/
row = &stailq_first_entry(rows, struct applier_tx_row, next)->row;
row = &stailq_last_entry(rows, struct applier_tx_row, next)->row;
uint64_t term = txn_limbo_replica_term(&txn_limbo, row->replica_id);
assert(term <= txn_limbo.promote_greatest_term);
if (term == txn_limbo.promote_greatest_term)
return;

/*
* We do not nopify promotion/demotion and confirm/rollback.
* We do not nopify promotion/demotion and most of confirm/rollback.
* Such synchronous requests should be filtered by txn_limbo to detect
* possible split brain situations.
*
@@ -1535,14 +1535,10 @@ applier_synchro_filter_tx(struct stailq *rows)
* claimed by someone is a marker of split-brain by itself: consider it
* a synchronous transaction, which is committed with quorum 1.
*/
struct xrow_header *last_row =
&stailq_last_entry(rows, struct applier_tx_row, next)->row;
struct applier_tx_row *item;
if (!last_row->wait_sync) {
if (!iproto_type_is_dml(last_row->type) ||
txn_limbo.owner_id == REPLICA_ID_NIL) {
if (iproto_type_is_dml(row->type) && !row->wait_sync) {
if (txn_limbo.owner_id == REPLICA_ID_NIL)
return;
}
stailq_foreach_entry(item, rows, next) {
row = &item->row;
if (row->type == IPROTO_NOP)
@@ -1551,6 +1547,38 @@
"got an async transaction from an old term");
}
return;
} else if (iproto_type_is_synchro_request(row->type)) {
item = stailq_last_entry(rows, typeof(*item), next);
struct synchro_request req = item->req.synchro;
/* Note! Might be different from row->replica_id. */
uint32_t owner_id = req.replica_id;
int64_t confirmed_lsn =
txn_limbo_replica_confirmed_lsn(&txn_limbo, owner_id);
/*
* A CONFIRM with lsn <= known confirm lsn for this replica may
* be nopified without a second thought. The transactions it's
* going to confirm were already confirmed by one of the
* PROMOTE/DEMOTE requests in a new term.
*
* Same about a ROLLBACK with lsn > known confirm lsn.
* These requests, although being out of date, do not contradict
* anything, so we may silently skip them.
*/
switch (row->type) {
case IPROTO_RAFT_PROMOTE:
case IPROTO_RAFT_DEMOTE:
return;
case IPROTO_RAFT_CONFIRM:
if (req.lsn > confirmed_lsn)
return;
break;
case IPROTO_RAFT_ROLLBACK:
if (req.lsn <= confirmed_lsn)
return;
break;
default:
unreachable();
}
}
stailq_foreach_entry(item, rows, next) {
row = &item->row;
10 changes: 10 additions & 0 deletions src/box/txn_limbo.h
@@ -272,6 +272,16 @@ txn_limbo_replica_term(const struct txn_limbo *limbo, uint32_t replica_id)
return vclock_get(&limbo->promote_term_map, replica_id);
}

/**
* Return the latest confirmed lsn for the replica with id @replica_id.
*/
static inline int64_t
txn_limbo_replica_confirmed_lsn(const struct txn_limbo *limbo,
uint32_t replica_id)
{
return vclock_get(&limbo->confirmed_vclock, replica_id);
}

/**
* Return the last synchronous transaction in the limbo or NULL when it is
* empty.
158 changes: 152 additions & 6 deletions test/replication-luatest/gh_5295_split_brain_test.lua
@@ -24,7 +24,7 @@ g.before_all(function(cg)
replication_synchro_quorum = 1,
replication_synchro_timeout = 0.01,
election_timeout = 0.1,
election_fencing_enabled = false,
election_fencing_mode = 'off',
log_level = 6,
}

@@ -112,6 +112,15 @@ local function reconnect_and_check_split_brain(cg)
end)
end

local function reconnect_and_check_no_split_brain(cg)
local srv = cg.split_replica
srv:exec(update_replication, {cg.main.net_box_uri})
t.helpers.retrying({}, srv.exec, srv, function()
local upstream = box.info.replication[1].upstream
t.assert_equals(upstream.status, 'follow', 'no split-brain')
end)
end

local function write_promote()
t.assert_not_equals(box.info.synchro.queue.owner, box.info.id,
"Promoting a follower")
@@ -140,26 +149,80 @@ g.test_async_old_term = function(cg)
reconnect_and_check_split_brain(cg)
end

-- Any unseen sync transaction confirmation from an obsolete term means a
-- A conflicting sync transaction confirmation from an obsolete term means a
-- split-brain.
g.test_confirm_old_term = function(cg)
g.test_bad_confirm_old_term = function(cg)
partition_replica(cg)
cg.split_replica:exec(write_promote)
cg.main:exec(function() box.space.sync:replace{1} end)
reconnect_and_check_split_brain(cg)
end

-- Any unseen sync transaction rollback from an obsolete term means a
-- Obsolete sync transaction confirmation might be fine when it doesn't
-- contradict local history.
g.test_good_confirm_old_term = function(cg)
t.tarantool.skip_if_not_debug()
-- Delay confirmation on the old leader, so that the transaction is included
-- into new leader's PROMOTE, but the old leader writes a CONFIRM for it.
cg.main:exec(function()
local lsn = box.info.lsn
box.error.injection.set('ERRINJ_WAL_DELAY_COUNTDOWN', 1)
require('fiber').new(function() box.space.sync:replace{1} end)
t.helpers.retrying({}, function()
t.assert_equals(box.info.lsn, lsn + 1)
end)
end)
partition_replica(cg)
cg.split_replica:exec(write_promote)
cg.main:exec(function()
local lsn = box.info.lsn
box.error.injection.set('ERRINJ_WAL_DELAY', false)
t.helpers.retrying({}, function()
t.assert_equals(box.info.lsn, lsn + 1)
end)
end)
reconnect_and_check_no_split_brain(cg)
end

-- A conflicting sync transaction rollback from an obsolete term means a
-- split-brain.
g.test_rollback_old_term = function(cg)
g.test_bad_rollback_old_term = function(cg)
t.tarantool.skip_if_not_debug()
-- Delay rollback on the old leader, so that the transaction is included
-- into new leader's PROMOTE, but the old leader writes a ROLLBACK for it.
cg.main:exec(function()
local lsn = box.info.lsn
box.cfg{replication_synchro_quorum = 31}
box.error.injection.set('ERRINJ_WAL_DELAY_COUNTDOWN', 1)
require('fiber').new(function() box.space.sync:replace{1} end)
t.helpers.retrying({}, function()
t.assert_equals(box.info.lsn, lsn + 1)
end)
end)
partition_replica(cg)
cg.split_replica:exec(write_promote)
cg.main:exec(function()
local lsn = box.info.lsn
box.error.injection.set('ERRINJ_WAL_DELAY', false)
t.helpers.retrying({}, function()
t.assert_equals(box.info.lsn, lsn + 1)
end)
box.cfg{replication_synchro_quorum = 1}
end)
reconnect_and_check_split_brain(cg)
end

-- Obsolete sync transaction rollback might be fine when it doesn't contradict
-- local history.
g.test_good_rollback_old_term = function(cg)
partition_replica(cg)
cg.split_replica:exec(write_promote)
cg.main:exec(function()
box.cfg{replication_synchro_quorum = 31}
pcall(box.space.sync.replace, box.space.sync, {1})
box.cfg{replication_synchro_quorum = 1}
end)
reconnect_and_check_split_brain(cg)
reconnect_and_check_no_split_brain(cg)
end

-- Conflicting demote for the same term is a split-brain.
@@ -234,3 +297,86 @@ g.test_promote_new_term_conflicting_queue = function(cg)
fill_queue_and_write(cg.split_replica)
reconnect_and_check_split_brain(cg)
end

local g_very_old_term = t.group('test-confirm-very-old-term')

g_very_old_term.before_each(function(cg)
t.tarantool.skip_if_not_debug()
cg.cluster = cluster:new{}
cg.box_cfg = {
replication_timeout = 0.1,
replication_synchro_quorum = 1,
replication = {
server.build_listen_uri('server1', cg.cluster.id),
server.build_listen_uri('server2', cg.cluster.id),
server.build_listen_uri('server3', cg.cluster.id),
},
election_mode = 'voter',
}
cg.servers = {}
for i = 1, 3 do
cg.servers[i] = cg.cluster:build_and_add_server{
alias = 'server' .. i,
box_cfg = cg.box_cfg,
}
end
cg.servers[1].box_cfg.election_mode = 'manual'
cg.cluster:start()
cg.cluster:wait_for_fullmesh()
cg.servers[1]:exec(function()
local s = box.schema.space.create('sync', {is_sync = true})
s:create_index('pk')
end)
end)

g_very_old_term.after_each(function(cg)
cg.cluster:drop()
end)

g_very_old_term.test_confirm = function(cg)
-- Create a sync transaction and block its confirmation.
cg.servers[1]:exec(function()
local lsn = box.info.lsn
box.error.injection.set('ERRINJ_WAL_DELAY_COUNTDOWN', 1)
require('fiber').new(function()
box.space.sync:insert{1}
end)
t.helpers.retrying({}, function()
t.assert_equals(box.info.lsn, lsn + 1)
end)
end)
-- Make sure the transaction reaches other servers and disconnect them.
cg.servers[1]:update_box_cfg({replication = ""})
local new_cfg = table.deepcopy(cg.box_cfg)
table.remove(new_cfg.replication, 1)
new_cfg.election_mode = 'manual'
for i = 2, 3 do
cg.servers[i]:wait_for_vclock_of(cg.servers[1])
cg.servers[i]:update_box_cfg(new_cfg)
end
cg.servers[1]:exec(function()
local lsn = box.info.lsn
box.error.injection.set('ERRINJ_WAL_DELAY', false)
t.helpers.retrying({}, function()
t.assert_equals(box.info.lsn, lsn + 1)
end)
end)
-- Perform a few cycles: promote either server2 or server3, let it write
-- some synchronous transaction.
for j = 1, 4 do
local leader = cg.servers[2 + j % 2]
local replica = cg.servers[2 + (j - 1) % 2]
leader:exec(function(n)
box.ctl.promote()
box.ctl.wait_rw()
box.space.sync:insert{n}
end, {j + 1})
replica:wait_for_vclock_of(leader)
end
for i = 2, 3 do
cg.servers[i]:exec(function() box.snapshot() end)
cg.servers[i]:restart()
end
cg.servers[1]:update_box_cfg({replication = cg.box_cfg.replication})
cg.cluster:wait_for_fullmesh()
end
