-
Notifications
You must be signed in to change notification settings - Fork 376
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve election stability and reproducibility #8699
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
## bugfix/replication | ||
|
||
* Fixed a possible failure to promote the desired node by `box.ctl.promote()` on | ||
a cluster with nodes configured with `election_mode = "candidate"` (gh-8497). |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
## bugfix/replication | ||
|
||
* Fixed nodes configured with `election_mode = 'candidate'` spuriously detecting | ||
a split-vote when another candidate should win with exactly a quorum of votes | ||
for it (gh-8698). |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -740,6 +740,15 @@ raft_worker_handle_io(struct raft *raft) | |
assert(raft->volatile_term >= raft->term); | ||
if (raft->volatile_vote == 0) | ||
goto do_dump; | ||
/* | ||
* Skip self. When vote is issued, own vclock can be smaller, | ||
* but that doesn't matter. Can always vote for self. Not having | ||
* this special case still works if the node is configured as a | ||
* candidate, but the node might log that it canceled a vote for | ||
* self, which is confusing. | ||
*/ | ||
if (raft->volatile_vote == raft->self) | ||
goto do_dump_with_vote; | ||
/* | ||
* Vote and term bumps are persisted separately. This serves as | ||
* a flush of all transactions going to WAL right now so as the | ||
|
@@ -750,15 +759,6 @@ raft_worker_handle_io(struct raft *raft) | |
*/ | ||
if (raft->volatile_term > raft->term) | ||
goto do_dump; | ||
/* | ||
* Skip self. When vote was issued, own vclock could be smaller, | ||
* but that doesn't matter. Can always vote for self. Not having | ||
* this special case still works if the node is configured as a | ||
* candidate, but the node might log that it canceled a vote for | ||
* self, which is confusing. | ||
*/ | ||
if (raft->volatile_vote == raft->self) | ||
goto do_dump_with_vote; | ||
if (!raft_can_vote_for(raft, &raft->candidate_vclock)) { | ||
say_info("RAFT: vote request for %u is canceled - the " | ||
"vclock is not acceptable anymore", | ||
|
@@ -935,7 +935,7 @@ raft_sm_schedule_new_vote(struct raft *raft, uint32_t candidate_id, | |
assert(!raft->votes[raft->self].did_vote); | ||
raft->volatile_vote = candidate_id; | ||
vclock_copy(&raft->candidate_vclock, candidate_vclock); | ||
raft_add_vote(raft, raft->self, raft->self); | ||
raft_add_vote(raft, raft->self, candidate_id); | ||
raft_sm_pause_and_dump(raft); | ||
/* Nothing visible is changed - no broadcast. */ | ||
} | ||
|
@@ -1211,6 +1211,7 @@ raft_promote(struct raft *raft) | |
return; | ||
raft_sm_schedule_new_term(raft, raft->volatile_term + 1); | ||
raft_start_candidate(raft); | ||
raft_sm_schedule_new_vote(raft, raft->self, raft->vclock); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can't we use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not really. We have to do On the other hand I can do this, because diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c
index f31cee5b1..e0b073b5c 100644
--- a/src/lib/raft/raft.c
+++ b/src/lib/raft/raft.c
@@ -959,8 +959,7 @@ raft_sm_schedule_new_election(struct raft *raft)
say_info("RAFT: begin new election round");
assert(raft->is_cfg_candidate);
/* Everyone is a follower until its vote for self is persisted. */
- raft_sm_schedule_new_term(raft, raft->volatile_term + 1);
- raft_sm_schedule_new_vote(raft, raft->self, raft->vclock);
+ raft_promote(raft);
}
static void There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you for explanation, I see now, that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, I don't like this hunk either. Let's revert it then: diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c
index e0b073b5c..f31cee5b1 100644
--- a/src/lib/raft/raft.c
+++ b/src/lib/raft/raft.c
@@ -959,7 +959,8 @@ raft_sm_schedule_new_election(struct raft *raft)
say_info("RAFT: begin new election round");
assert(raft->is_cfg_candidate);
/* Everyone is a follower until its vote for self is persisted. */
- raft_promote(raft);
+ raft_sm_schedule_new_term(raft, raft->volatile_term + 1);
+ raft_sm_schedule_new_vote(raft, raft->self, raft->vclock);
}
static void |
||
} | ||
|
||
void | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
local t = require('luatest') | ||
local server = require('luatest.server') | ||
local replica_set = require('luatest.replica_set') | ||
|
||
local g = t.group('gh-8497-atomic-promote') | ||
|
||
-- Per-test setup: builds a two-node replica set where server1 is created
-- with election_mode = 'candidate' and server2 with election_mode = 'voter',
-- then waits until server1 is the election leader.
g.before_each(function(cg)
    -- The test body relies on error injections, so skip on non-debug builds.
    t.tarantool.skip_if_not_debug()

    local rs = replica_set:new({})
    cg.replica_set = rs

    local cfg = {
        replication_timeout = 0.1,
        replication = {
            server.build_listen_uri('server1', rs.id),
            server.build_listen_uri('server2', rs.id),
        },
    }
    cg.box_cfg = cfg

    -- The very same table is handed to both servers; only election_mode is
    -- flipped in between.
    -- NOTE(review): presumably each build call captures the config at call
    -- time, otherwise both servers would end up as voters - confirm.
    cfg.election_mode = 'candidate'
    cg.server1 = rs:build_and_add_server({
        alias = 'server1',
        box_cfg = cfg,
    })
    cfg.election_mode = 'voter'
    cg.server2 = rs:build_and_add_server({
        alias = 'server2',
        box_cfg = cfg,
    })

    rs:start()
    rs:wait_for_fullmesh()
    cg.server1:wait_for_election_leader()
end)
|
||
-- Per-test teardown: drops the replica set created in before_each.
g.after_each(function(cg)
    local rs = cg.replica_set
    rs:drop()
end)
|
||
-- Checks that box.ctl.promote() on server2 completes with exactly one term
-- bump on both nodes, even though a WAL delay is injected while the promote
-- is in progress.
function g.test_election_promote_finishes_in_one_term(cg)
    local leader, promoted = cg.server1, cg.server2

    promoted:update_box_cfg({election_mode = 'candidate'})

    local initial_term = leader:get_election_term()
    t.assert_equals(initial_term, promoted:get_election_term(),
                    'The cluster is stable')

    -- The function below runs on server2, so it must not capture any
    -- upvalues from this test.
    local ok, err = promoted:exec(function()
        local fiber = require('fiber')
        local errinj = box.error.injection
        -- NOTE(review): presumably the countdown lets one WAL write pass
        -- and stalls the following ones - confirm against the errinj docs.
        errinj.set('ERRINJ_WAL_DELAY_COUNTDOWN', 1)
        local promote_fiber = fiber.new(box.ctl.promote)
        promote_fiber:set_joinable(true)
        -- Keep the WAL delayed for two replication timeouts before
        -- releasing it.
        fiber.sleep(2 * box.cfg.replication_timeout)
        errinj.set('ERRINJ_WAL_DELAY', false)
        return promote_fiber:join()
    end)
    t.assert_equals({ok, err}, {true, nil}, 'No error in promote')

    promoted:wait_for_election_leader()

    local expected_term = initial_term + 1
    t.assert_equals(expected_term, leader:get_election_term(),
                    'The term is bumped once')
    t.assert_equals(expected_term, promoted:get_election_term(),
                    'The term is bumped once')
end
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This change would be better placed in the previous commit.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for noticing! Fixed.