Merge pull request #7523 from tinyspeck/am_ers_delete_current_primary
[reparentutil] ERS should not attempt to WaitForRelayLogsToApply on primary tablets that were not running replication
deepthi committed Feb 24, 2021
2 parents 5da35cd + 2400c5b commit 6403489
Showing 2 changed files with 135 additions and 3 deletions.
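
The core of the fix is visible in the first diff below: indexing a Go map with a missing key returns the value type's zero value, so with a pointer-valued status map the old statusMap[alias] lookup would pass WaitForRelayLogsToApply a nil status for any candidate that never stopped replication, such as the current primary. A minimal, self-contained sketch of that semantics, using an illustrative stand-in type rather than the Vitess protobuf types:

package main

import "fmt"

// StopStatus is an illustrative stand-in for the pointer-valued entries of
// the replica status map; it is not the Vitess replicationdata proto type.
type StopStatus struct {
    RelayLogPosition string
}

func main() {
    statusMap := map[string]*StopStatus{
        "zone1-0000000101": {RelayLogPosition: "pos-a"},
        "zone1-0000000102": {RelayLogPosition: "pos-b"},
        // "zone1-0000000100", the former primary, never stopped replication,
        // so it has no entry at all.
    }

    // Plain indexing silently yields nil for the missing key, which is what
    // the old statusMap[alias] lookup would forward to the wait step.
    fmt.Println(statusMap["zone1-0000000100"] == nil) // true

    // The comma-ok form used by the fix distinguishes "missing" from
    // "present" and lets the caller skip the wait entirely.
    if _, ok := statusMap["zone1-0000000100"]; !ok {
        fmt.Println("skipping relay-log wait for non-replicating tablet")
    }
}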
33 changes: 30 additions & 3 deletions go/vt/vtctl/reparentutil/emergency_reparenter.go
@@ -351,17 +351,44 @@ func (erp *EmergencyReparenter) waitForAllRelayLogsToApply(
     groupCtx, groupCancel := context.WithTimeout(ctx, opts.WaitReplicasTimeout)
     defer groupCancel()
 
+    waiterCount := 0
+
     for candidate := range validCandidates {
+        // When we called StopReplicationAndBuildStatusMaps, we got back two
+        // maps: (1) the StopReplicationStatus of any replicas that actually
+        // stopped replication; and (2) the MasterStatus of anything that
+        // returned ErrNotReplica, which is a tablet that is either the current
+        // primary or is stuck thinking it is a MASTER but is not in actuality.
+        //
+        // If we have a tablet in the validCandidates map that does not appear
+        // in the statusMap, then we have either (a) the current primary, which
+        // is not replicating, so it is not applying relay logs; or (b) a tablet
+        // that is stuck thinking it is MASTER but is not in actuality. In that
+        // second case - (b) - we will most likely find that the stuck MASTER
+        // does not have a winning position, and fail the ERS. If, on the other
+        // hand, it does have a winning position, we are trusting the operator
+        // to know what they are doing by emergency-reparenting onto that
+        // tablet. In either case, it does not make sense to wait for relay logs
+        // to apply on a tablet that was never applying relay logs in the first
+        // place, so we skip it, and log that we did.
+        status, ok := statusMap[candidate]
+        if !ok {
+            erp.logger.Infof("EmergencyReparent candidate %v not in replica status map; this means it was not running replication (because it was formerly MASTER), so skipping WaitForRelayLogsToApply step for this candidate", candidate)
+            continue
+        }
+
         go func(alias string) {
             var err error
             defer func() { errCh <- err }()
-            err = WaitForRelayLogsToApply(groupCtx, erp.tmc, tabletMap[alias], statusMap[alias])
+            err = WaitForRelayLogsToApply(groupCtx, erp.tmc, tabletMap[alias], status)
         }(candidate)
+
+        waiterCount++
     }
 
     errgroup := concurrency.ErrorGroup{
-        NumGoroutines:        len(validCandidates),
-        NumRequiredSuccesses: len(validCandidates),
+        NumGoroutines:        waiterCount,
+        NumRequiredSuccesses: waiterCount,
         NumAllowedErrors:     0,
     }
     rec := errgroup.Wait(groupCancel, errCh)
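
Stripped of the Vitess types, the new control flow reduces to: skip any candidate without a replication status, start a waiter goroutine only for the rest, and collect exactly as many results as goroutines started, which is why the ErrorGroup sizing moves from len(validCandidates) to waiterCount. A simplified, self-contained sketch of that pattern (the fake wait and string-keyed map are stand-ins, not the real WaitForRelayLogsToApply or concurrency.ErrorGroup):

package main

import (
    "context"
    "fmt"
    "time"
)

// waitForApplied mirrors the shape of waitForAllRelayLogsToApply after the
// change: candidates missing from statusMap are skipped, and the result
// collection is sized by waiterCount, not by len(candidates).
func waitForApplied(ctx context.Context, candidates []string, statusMap map[string]string) error {
    errCh := make(chan error, len(candidates))
    waiterCount := 0

    for _, candidate := range candidates {
        relayLogPos, ok := statusMap[candidate]
        if !ok {
            // Former primary (or a tablet stuck thinking it is MASTER): it was
            // never applying relay logs, so there is nothing to wait for.
            fmt.Printf("skipping %s: not in replica status map\n", candidate)
            continue
        }

        go func(alias, pos string) {
            // Stand-in for WaitForRelayLogsToApply: pretend to wait until the
            // tablet has applied its relay logs through pos.
            fmt.Printf("waiting for %s to apply relay logs through %s\n", alias, pos)
            select {
            case <-time.After(10 * time.Millisecond):
                errCh <- nil
            case <-ctx.Done():
                errCh <- ctx.Err()
            }
        }(candidate, relayLogPos)

        waiterCount++
    }

    // Collect exactly one result per goroutine actually launched; waiting for
    // len(candidates) results here would block forever once anything is skipped.
    for i := 0; i < waiterCount; i++ {
        if err := <-errCh; err != nil {
            return err
        }
    }
    return nil
}

func main() {
    ctx, cancel := context.WithTimeout(context.Background(), time.Second)
    defer cancel()

    err := waitForApplied(ctx,
        []string{"zone1-0000000100", "zone1-0000000101", "zone1-0000000102"},
        map[string]string{
            "zone1-0000000101": "pos-a",
            "zone1-0000000102": "pos-b",
        })
    fmt.Println("all relay logs applied:", err == nil)
}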
105 changes: 105 additions & 0 deletions go/vt/vtctl/reparentutil/emergency_reparenter_test.go
@@ -336,6 +336,111 @@ func TestEmergencyReparenter_reparentShardLocked(t *testing.T) {
},
shouldErr: false,
},
{
name: "success with existing primary",
ts: memorytopo.NewServer("zone1"),
tmc: &emergencyReparenterTestTMClient{
DemoteMasterResults: map[string]struct {
Status *replicationdatapb.MasterStatus
Error error
}{
"zone1-0000000100": {
Status: &replicationdatapb.MasterStatus{
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21",
},
},
},
PopulateReparentJournalResults: map[string]error{
"zone1-0000000102": nil,
},
PromoteReplicaResults: map[string]struct {
Result string
Error error
}{
"zone1-0000000102": {
Result: "ok",
Error: nil,
},
},
SetMasterResults: map[string]error{
"zone1-0000000100": nil,
"zone1-0000000101": nil,
},
StopReplicationAndGetStatusResults: map[string]struct {
Status *replicationdatapb.Status
StopStatus *replicationdatapb.StopReplicationStatus
Error error
}{
"zone1-0000000100": { // This tablet claims MASTER, so is not running replication.
Error: mysql.ErrNotReplica,
},
"zone1-0000000101": {
StopStatus: &replicationdatapb.StopReplicationStatus{
Before: &replicationdatapb.Status{},
After: &replicationdatapb.Status{
MasterUuid: "3E11FA47-71CA-11E1-9E33-C80AA9429562",
RelayLogPosition: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21",
},
},
},
"zone1-0000000102": {
StopStatus: &replicationdatapb.StopReplicationStatus{
Before: &replicationdatapb.Status{},
After: &replicationdatapb.Status{
MasterUuid: "3E11FA47-71CA-11E1-9E33-C80AA9429562",
RelayLogPosition: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-26",
},
},
},
},
WaitForPositionResults: map[string]map[string]error{
"zone1-0000000101": {
"MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21": nil,
},
"zone1-0000000102": {
"MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-26": nil,
},
},
},
shards: []*vtctldatapb.Shard{
{
Keyspace: "testkeyspace",
Name: "-",
},
},
tablets: []*topodatapb.Tablet{
{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
Keyspace: "testkeyspace",
Shard: "-",
Type: topodatapb.TabletType_MASTER,
},
{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
Keyspace: "testkeyspace",
Shard: "-",
},
{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 102,
},
Keyspace: "testkeyspace",
Shard: "-",
Hostname: "most up-to-date position, wins election",
},
},
keyspace: "testkeyspace",
shard: "-",
opts: EmergencyReparentOptions{},
shouldErr: false,
},
{
name: "shard not found",
ts: memorytopo.NewServer("zone1"),
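
The new test case's two relay-log positions decide the outcome: zone1-0000000100 is skipped because it returns ErrNotReplica, and zone1-0000000102 carries the furthest GTID set, so it is the expected promotion target (its hostname says as much). A small sketch of that comparison, assuming go/mysql's DecodePosition and Position.AtLeast helpers behave as their names suggest:

package main

import (
    "fmt"

    "vitess.io/vitess/go/mysql"
)

func main() {
    // Relay-log positions from the test fixtures above.
    pos101, err := mysql.DecodePosition("MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21")
    if err != nil {
        panic(err)
    }
    pos102, err := mysql.DecodePosition("MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-26")
    if err != nil {
        panic(err)
    }

    // zone1-0000000102's GTID set 1-26 covers zone1-0000000101's 1-21, so it
    // is the most advanced candidate and the one the test expects to win the
    // election (hence its PromoteReplica and PopulateReparentJournal entries).
    fmt.Println(pos102.AtLeast(pos101)) // true
    fmt.Println(pos101.AtLeast(pos102)) // false
}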
