Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Respect -disable_active_reparents in backup/restore #7576

Merged
merged 2 commits into from
Mar 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion go/vt/mysqlctl/builtinbackupengine.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ func (be *BuiltinBackupEngine) ExecuteBackup(ctx context.Context, params BackupP
replicaStatus, err := params.Mysqld.ReplicationStatus()
switch err {
case nil:
replicaStartRequired = replicaStatus.ReplicationRunning()
replicaStartRequired = replicaStatus.ReplicationRunning() && !*DisableActiveReparents
case mysql.ErrNotReplica:
// keep going if we're the master, might be a degenerate case
sourceIsMaster = true
Expand Down
30 changes: 28 additions & 2 deletions go/vt/vttablet/tabletmanager/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,28 @@ func (tm *TabletManager) RestoreData(ctx context.Context, logger logutil.Logger,
if tm.Cnf == nil {
return fmt.Errorf("cannot perform restore without my.cnf, please restart vttablet with a my.cnf file specified")
}
return tm.restoreDataLocked(ctx, logger, waitForBackupInterval, deleteBeforeRestore)
// Tell Orchestrator we're stopped on purpose for some Vitess task.
// Do this in the background, as it's best-effort.
go func() {
if tm.orc == nil {
return
}
if err := tm.orc.BeginMaintenance(tm.Tablet(), "vttablet has been told to Restore"); err != nil {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

from our experiment, tm.Tablet() seems to give an out dated state and we are getting cannot find mysql port error, we ended up using tm.tmState.Tablet(). Let me know if you also experience it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tm.Tablet() simply returns tm.tmState.Tablet() so they should be equivalent. I do see that it is possible for a race condition to occur between checkMysql and handleRestore.

log.Warningf("Orchestrator BeginMaintenance failed: %v", err)
}
}()
err := tm.restoreDataLocked(ctx, logger, waitForBackupInterval, deleteBeforeRestore)
// Tell Orchestrator we're no longer stopped on purpose.
// Do this in the background, as it's best-effort.
go func() {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we only EndMaintenance if restoreDataLocked returns no error.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That seems reasonable.

if tm.orc == nil {
return
}
if err := tm.orc.EndMaintenance(tm.Tablet()); err != nil {
log.Warningf("Orchestrator EndMaintenance failed: %v", err)
}
}()
return err
}

func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool) error {
Expand Down Expand Up @@ -470,10 +491,15 @@ func (tm *TabletManager) startReplication(ctx context.Context, pos mysql.Positio
}

// Set master and start replication.
if err := tm.MysqlDaemon.SetMaster(ctx, ti.Tablet.MysqlHostname, int(ti.Tablet.MysqlPort), false /* stopReplicationBefore */, true /* startReplicationAfter */); err != nil {
if err := tm.MysqlDaemon.SetMaster(ctx, ti.Tablet.MysqlHostname, int(ti.Tablet.MysqlPort), false /* stopReplicationBefore */, !*mysqlctl.DisableActiveReparents /* startReplicationAfter */); err != nil {
return vterrors.Wrap(err, "MysqlDaemon.SetMaster failed")
}

// If active reparents are disabled, we don't restart replication. So it makes no sense to wait for an update on the replica.
// Return immediately.
if !*mysqlctl.DisableActiveReparents {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would it be the opposite?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for pointing this out. It would have been a regression. I have opened #7703 to fix this.

return nil
}
// wait for reliable seconds behind master
// we have pos where we want to resume from
// if MasterPosition is the same, that means no writes
Expand Down
20 changes: 20 additions & 0 deletions go/vt/vttablet/tabletmanager/rpc_backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,16 @@ func (tm *TabletManager) Backup(ctx context.Context, concurrency int, logger log
if err := tm.changeTypeLocked(ctx, topodatapb.TabletType_BACKUP, DBActionNone); err != nil {
return err
}
// Tell Orchestrator we're stopped on purpose for some Vitess task.
// Do this in the background, as it's best-effort.
go func() {
if tm.orc == nil {
return
}
if err := tm.orc.BeginMaintenance(tm.Tablet(), "vttablet has been told to run an offline backup"); err != nil {
logger.Warningf("Orchestrator BeginMaintenance failed: %v", err)
}
}()
}
// create the loggers: tee to console and source
l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger)
Expand Down Expand Up @@ -124,6 +134,16 @@ func (tm *TabletManager) Backup(ctx context.Context, concurrency int, logger log
}
returnErr = err
}
// Tell Orchestrator we're no longer stopped on purpose.
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Zhannan what about here? Should this also be called only if err == nil?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense!

// Do this in the background, as it's best-effort.
go func() {
if tm.orc == nil {
return
}
if err := tm.orc.EndMaintenance(tm.Tablet()); err != nil {
logger.Warningf("Orchestrator EndMaintenance failed: %v", err)
}
}()
}

return returnErr
Expand Down
5 changes: 1 addition & 4 deletions go/vt/vttablet/tabletmanager/tm_init.go
Original file line number Diff line number Diff line change
Expand Up @@ -597,10 +597,7 @@ func (tm *TabletManager) handleRestore(ctx context.Context) (bool, error) {
return false, fmt.Errorf("you cannot enable -restore_from_backup without a my.cnf file")
}

// two cases then:
// - restoreFromBackup is set: we restore, then initHealthCheck, all
// in the background
// - restoreFromBackup is not set: we initHealthCheck right away
// Restore in the background
if *restoreFromBackup {
go func() {
// Open the state manager after restore is done.
Expand Down
151 changes: 151 additions & 0 deletions go/vt/wrangler/testlib/backup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -394,3 +394,154 @@ func TestRestoreUnreachableMaster(t *testing.T) {
assert.True(t, destTablet.FakeMysqlDaemon.Replicating)
assert.True(t, destTablet.FakeMysqlDaemon.Running)
}

func TestDisableActiveReparents(t *testing.T) {
*mysqlctl.DisableActiveReparents = true
delay := discovery.GetTabletPickerRetryDelay()
defer func() {
// When you mess with globals you must remember to reset them
*mysqlctl.DisableActiveReparents = false
discovery.SetTabletPickerRetryDelay(delay)
}()
discovery.SetTabletPickerRetryDelay(5 * time.Millisecond)

// Initialize our environment
ctx := context.Background()
db := fakesqldb.New(t)
defer db.Close()
ts := memorytopo.NewServer("cell1", "cell2")
wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient())
vp := NewVtctlPipe(t, ts)
defer vp.Close()

// Set up mock query results.
db.AddQuery("CREATE DATABASE IF NOT EXISTS _vt", &sqltypes.Result{})
db.AddQuery("BEGIN", &sqltypes.Result{})
db.AddQuery("COMMIT", &sqltypes.Result{})
db.AddQueryPattern(`SET @@session\.sql_log_bin = .*`, &sqltypes.Result{})
db.AddQueryPattern(`CREATE TABLE IF NOT EXISTS _vt\.shard_metadata .*`, &sqltypes.Result{})
db.AddQueryPattern(`CREATE TABLE IF NOT EXISTS _vt\.local_metadata .*`, &sqltypes.Result{})
db.AddQueryPattern(`ALTER TABLE _vt\.local_metadata .*`, &sqltypes.Result{})
db.AddQueryPattern(`ALTER TABLE _vt\.shard_metadata .*`, &sqltypes.Result{})
db.AddQueryPattern(`UPDATE _vt\.local_metadata SET db_name=.*`, &sqltypes.Result{})
db.AddQueryPattern(`UPDATE _vt\.shard_metadata SET db_name=.*`, &sqltypes.Result{})
db.AddQueryPattern(`INSERT INTO _vt\.local_metadata .*`, &sqltypes.Result{})

// Initialize our temp dirs
root, err := ioutil.TempDir("", "backuptest")
require.NoError(t, err)
defer os.RemoveAll(root)

// Initialize BackupStorage
fbsRoot := path.Join(root, "fbs")
*filebackupstorage.FileBackupStorageRoot = fbsRoot
*backupstorage.BackupStorageImplementation = "file"

// Initialize the fake mysql root directories
sourceInnodbDataDir := path.Join(root, "source_innodb_data")
sourceInnodbLogDir := path.Join(root, "source_innodb_log")
sourceDataDir := path.Join(root, "source_data")
sourceDataDbDir := path.Join(sourceDataDir, "vt_db")
for _, s := range []string{sourceInnodbDataDir, sourceInnodbLogDir, sourceDataDbDir} {
require.NoError(t, os.MkdirAll(s, os.ModePerm))
}
require.NoError(t, ioutil.WriteFile(path.Join(sourceInnodbDataDir, "innodb_data_1"), []byte("innodb data 1 contents"), os.ModePerm))
require.NoError(t, ioutil.WriteFile(path.Join(sourceInnodbLogDir, "innodb_log_1"), []byte("innodb log 1 contents"), os.ModePerm))
require.NoError(t, ioutil.WriteFile(path.Join(sourceDataDbDir, "db.opt"), []byte("db opt file"), os.ModePerm))

// create a master tablet, set its master position
master := NewFakeTablet(t, wr, "cell1", 0, topodatapb.TabletType_MASTER, db)
master.FakeMysqlDaemon.ReadOnly = false
master.FakeMysqlDaemon.Replicating = false
master.FakeMysqlDaemon.CurrentMasterPosition = mysql.Position{
GTIDSet: mysql.MariadbGTIDSet{
2: mysql.MariadbGTID{
Domain: 2,
Server: 123,
Sequence: 457,
},
},
}

// start master so that replica can fetch master position from it
master.StartActionLoop(t, wr)
defer master.StopActionLoop(t)

// create a single tablet, set it up so we can do backups
// set its position same as that of master so that backup doesn't wait for catchup
sourceTablet := NewFakeTablet(t, wr, "cell1", 1, topodatapb.TabletType_REPLICA, db)
sourceTablet.FakeMysqlDaemon.ReadOnly = true
sourceTablet.FakeMysqlDaemon.Replicating = true
sourceTablet.FakeMysqlDaemon.CurrentMasterPosition = mysql.Position{
GTIDSet: mysql.MariadbGTIDSet{
2: mysql.MariadbGTID{
Domain: 2,
Server: 123,
Sequence: 457,
},
},
}
sourceTablet.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{
"STOP SLAVE",
}
sourceTablet.StartActionLoop(t, wr)
defer sourceTablet.StopActionLoop(t)

sourceTablet.TM.Cnf = &mysqlctl.Mycnf{
DataDir: sourceDataDir,
InnodbDataHomeDir: sourceInnodbDataDir,
InnodbLogGroupHomeDir: sourceInnodbLogDir,
}

// run the backup
require.NoError(t, vp.Run([]string{"Backup", topoproto.TabletAliasString(sourceTablet.Tablet.Alias)}))

// verify the full status
require.NoError(t, sourceTablet.FakeMysqlDaemon.CheckSuperQueryList())
assert.False(t, sourceTablet.FakeMysqlDaemon.Replicating)
assert.True(t, sourceTablet.FakeMysqlDaemon.Running)

// create a destination tablet, set it up so we can do restores
destTablet := NewFakeTablet(t, wr, "cell1", 2, topodatapb.TabletType_REPLICA, db)
destTablet.FakeMysqlDaemon.ReadOnly = true
destTablet.FakeMysqlDaemon.Replicating = true
destTablet.FakeMysqlDaemon.CurrentMasterPosition = mysql.Position{
GTIDSet: mysql.MariadbGTIDSet{
2: mysql.MariadbGTID{
Domain: 2,
Server: 123,
Sequence: 457,
},
},
}
destTablet.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{
"STOP SLAVE",
"RESET SLAVE ALL",
"FAKE SET SLAVE POSITION",
"FAKE SET MASTER",
}
destTablet.FakeMysqlDaemon.FetchSuperQueryMap = map[string]*sqltypes.Result{
"SHOW DATABASES": {},
}
destTablet.FakeMysqlDaemon.SetReplicationPositionPos = sourceTablet.FakeMysqlDaemon.CurrentMasterPosition
destTablet.FakeMysqlDaemon.SetMasterInput = topoproto.MysqlAddr(master.Tablet)

destTablet.StartActionLoop(t, wr)
defer destTablet.StopActionLoop(t)

destTablet.TM.Cnf = &mysqlctl.Mycnf{
DataDir: sourceDataDir,
InnodbDataHomeDir: sourceInnodbDataDir,
InnodbLogGroupHomeDir: sourceInnodbLogDir,
BinLogPath: path.Join(root, "bin-logs/filename_prefix"),
RelayLogPath: path.Join(root, "relay-logs/filename_prefix"),
RelayLogIndexPath: path.Join(root, "relay-log.index"),
RelayLogInfoPath: path.Join(root, "relay-log.info"),
}

require.NoError(t, destTablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */))
// verify the full status
require.NoError(t, destTablet.FakeMysqlDaemon.CheckSuperQueryList(), "destTablet.FakeMysqlDaemon.CheckSuperQueryList failed")
assert.False(t, destTablet.FakeMysqlDaemon.Replicating)
assert.True(t, destTablet.FakeMysqlDaemon.Running)
}