[reparentutil] ERS should not attempt to WaitForRelayLogsToApply on primary tablets that were not running replication #7523

Merged · 4 commits · Feb 24, 2021
33 changes: 30 additions & 3 deletions go/vt/vtctl/reparentutil/emergency_reparenter.go
@@ -351,17 +351,44 @@ func (erp *EmergencyReparenter) waitForAllRelayLogsToApply(
groupCtx, groupCancel := context.WithTimeout(ctx, opts.WaitReplicasTimeout)
defer groupCancel()

waiterCount := 0

for candidate := range validCandidates {
// When we called StopReplicationAndBuildStatusMaps, we got back two
// maps: (1) the StopReplicationStatus of any replicas that actually
// stopped replication; and (2) the MasterStatus of anything that
// returned ErrNotReplica, which is a tablet that is either the current
// primary or is stuck thinking it is a MASTER but is not in actuality.
//
// If we have a tablet in the validCandidates map that does not appear
// in the statusMap, then we have either (a) the current primary, which
// is not replicating, so it is not applying relay logs; or (b) a tablet
// that is stuck thinking it is MASTER but is not in actuality. In that
// second case - (b) - we will most likely find that the stuck MASTER
// does not have a winning position, and fail the ERS. If, on the other
// hand, it does have a winning position, we are trusting the operator
// to know what they are doing by emergency-reparenting onto that
// tablet. In either case, it does not make sense to wait for relay logs
// to apply on a tablet that was never applying relay logs in the first
// place, so we skip it, and log that we did.
status, ok := statusMap[candidate]
if !ok {
erp.logger.Infof("EmergencyReparent candidate %v not in replica status map; this means it was not running replication (because it was formerly MASTER), so skipping WaitForRelayLogsToApply step for this candidate", candidate)
continue
}

go func(alias string) {
var err error
defer func() { errCh <- err }()
err = WaitForRelayLogsToApply(groupCtx, erp.tmc, tabletMap[alias], statusMap[alias])
err = WaitForRelayLogsToApply(groupCtx, erp.tmc, tabletMap[alias], status)
}(candidate)

waiterCount++
}

errgroup := concurrency.ErrorGroup{
NumGoroutines: len(validCandidates),
NumRequiredSuccesses: len(validCandidates),
NumGoroutines: waiterCount,
NumRequiredSuccesses: waiterCount,
NumAllowedErrors: 0,
}
rec := errgroup.Wait(groupCancel, errCh)
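The change above does two things in tandem: candidates with no entry in statusMap (the current primary, or a tablet stuck claiming MASTER) are skipped before any goroutine is launched, and the ErrorGroup is sized by waiterCount instead of len(validCandidates), so the collector only waits for results that will actually arrive. The following standalone sketch illustrates the same pattern using only the Go standard library; the names (applyRelayLogs, statusMap, the zone1-* aliases) are illustrative stand-ins, not the Vitess APIs.

package main

import (
    "context"
    "fmt"
    "time"
)

// applyRelayLogs is a stand-in for WaitForRelayLogsToApply: it pretends the
// relay-log catch-up finishes quickly, or aborts when the context expires.
func applyRelayLogs(ctx context.Context, alias string) error {
    select {
    case <-time.After(10 * time.Millisecond):
        return nil
    case <-ctx.Done():
        return ctx.Err()
    }
}

func main() {
    candidates := []string{"zone1-100", "zone1-101", "zone1-102"}
    // zone1-100 plays the old primary: it returned ErrNotReplica, so it has
    // no entry in the status map and must be skipped.
    statusMap := map[string]bool{"zone1-101": true, "zone1-102": true}

    ctx, cancel := context.WithTimeout(context.Background(), time.Second)
    defer cancel()

    errCh := make(chan error)
    waiterCount := 0

    for _, candidate := range candidates {
        if !statusMap[candidate] {
            fmt.Printf("%s was not replicating; skipping relay-log wait\n", candidate)
            continue
        }

        // Only candidates that were replicating get a waiter goroutine.
        go func(alias string) {
            errCh <- applyRelayLogs(ctx, alias)
        }(candidate)

        waiterCount++
    }

    // Collect exactly waiterCount results. Sizing this loop by
    // len(candidates) would block forever in this sketch, because the
    // skipped primary never sends on errCh.
    for i := 0; i < waiterCount; i++ {
        if err := <-errCh; err != nil {
            fmt.Println("relay-log wait failed:", err)
        }
    }
}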
105 changes: 105 additions & 0 deletions go/vt/vtctl/reparentutil/emergency_reparenter_test.go
@@ -338,6 +338,111 @@ func TestEmergencyReparenter_reparentShardLocked(t *testing.T) {
},
shouldErr: false,
},
{
name: "success with existing primary",
ts: memorytopo.NewServer("zone1"),
tmc: &emergencyReparenterTestTMClient{
DemoteMasterResults: map[string]struct {
Status *replicationdatapb.MasterStatus
Error error
}{
"zone1-0000000100": {
Status: &replicationdatapb.MasterStatus{
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21",
},
},
},
PopulateReparentJournalResults: map[string]error{
"zone1-0000000102": nil,
},
PromoteReplicaResults: map[string]struct {
Result string
Error error
}{
"zone1-0000000102": {
Result: "ok",
Error: nil,
},
},
SetMasterResults: map[string]error{
"zone1-0000000100": nil,
"zone1-0000000101": nil,
},
StopReplicationAndGetStatusResults: map[string]struct {
Status *replicationdatapb.Status
StopStatus *replicationdatapb.StopReplicationStatus
Error error
}{
"zone1-0000000100": { // This tablet claims MASTER, so is not running replication.
Error: mysql.ErrNotReplica,
},
"zone1-0000000101": {
StopStatus: &replicationdatapb.StopReplicationStatus{
Before: &replicationdatapb.Status{},
After: &replicationdatapb.Status{
MasterUuid: "3E11FA47-71CA-11E1-9E33-C80AA9429562",
RelayLogPosition: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21",
},
},
},
"zone1-0000000102": {
StopStatus: &replicationdatapb.StopReplicationStatus{
Before: &replicationdatapb.Status{},
After: &replicationdatapb.Status{
MasterUuid: "3E11FA47-71CA-11E1-9E33-C80AA9429562",
RelayLogPosition: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-26",
},
},
},
},
WaitForPositionResults: map[string]map[string]error{
"zone1-0000000101": {
"MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21": nil,
},
"zone1-0000000102": {
"MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-26": nil,
},
},
},
shards: []*vtctldatapb.Shard{
{
Keyspace: "testkeyspace",
Name: "-",
},
},
tablets: []*topodatapb.Tablet{
{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
Keyspace: "testkeyspace",
Shard: "-",
Type: topodatapb.TabletType_MASTER,
},
{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
Keyspace: "testkeyspace",
Shard: "-",
},
{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 102,
},
Keyspace: "testkeyspace",
Shard: "-",
Hostname: "most up-to-date position, wins election",
},
},
keyspace: "testkeyspace",
shard: "-",
opts: EmergencyReparentOptions{},
shouldErr: false,
},
{
name: "shard not found",
ts: memorytopo.NewServer("zone1"),