Emergency Reparent Refactor #6449

Merged
merged 27 commits on Aug 20, 2020
Commits (27)
1839e73
Finished strategy steps 1-4 from main refactor issue.
PrismaPhonic Jul 16, 2020
35c85a6
Updated test for new io_thread only logic.
PrismaPhonic Jul 16, 2020
62950ff
Some tests are failing because master is not actually stopped. We sho…
PrismaPhonic Jul 16, 2020
70f7273
Finally passing. Hallelujah.
PrismaPhonic Jul 17, 2020
2af7875
Finished all major aspects of design, other than
PrismaPhonic Jul 20, 2020
88b0788
Added ignore_unreachable_replicas flag along with a test to check tha…
PrismaPhonic Jul 23, 2020
0b4f29c
Silly commit because CICD seems to still have an old version.
PrismaPhonic Jul 23, 2020
5696a12
Switch error nil checks to NoError checks per review suggestion.
PrismaPhonic Jul 27, 2020
08f018c
Added explicit checking of error string.
PrismaPhonic Jul 27, 2020
a69e6a0
Trivial fixes per review suggestions.
PrismaPhonic Jul 31, 2020
6a7d62a
Refactor per review suggestion to reduce number of code paths.
PrismaPhonic Jul 31, 2020
80ef344
Record all errors and surface them in the case we could not get a win…
PrismaPhonic Jul 31, 2020
b231de2
Updated waitOnNMinusOneTablets helper function to take a maximum acce…
PrismaPhonic Jul 31, 2020
7e71b59
Simplify design drastically per review suggestion to synchronously wa…
PrismaPhonic Aug 1, 2020
5f7d2cc
Fix test issue now that SetMaster calls are just best effort and done…
PrismaPhonic Aug 5, 2020
3af3cf1
Small fixes per review suggestions.
PrismaPhonic Aug 18, 2020
1ccfab4
Switched all fmt.Errorf to vterrors.Errorf or Wrapf.
PrismaPhonic Aug 19, 2020
e166f94
Rename to ignoredTablets to match flag.
PrismaPhonic Aug 19, 2020
3a6fc89
Significant logic re-write to make all replicas wait for relay logs t…
PrismaPhonic Aug 19, 2020
96e4843
Make sure we run competition for most caught up.
PrismaPhonic Aug 19, 2020
fff559f
Fixed unit tests and refactored logic per pairing session.
PrismaPhonic Aug 19, 2020
519c048
Resolved merge conflicts with master.
PrismaPhonic Aug 19, 2020
d239ff5
Ensure we use Wrapf for ALL error returns.
PrismaPhonic Aug 20, 2020
f445f79
Get rid of unnecessary subcontext.
PrismaPhonic Aug 20, 2020
dcfe4f1
More error improvement.
PrismaPhonic Aug 20, 2020
04a1a60
Rename per review suggestion.
PrismaPhonic Aug 20, 2020
c3c24f5
Remove InitTablet calls. They are unnecessary now.
PrismaPhonic Aug 20, 2020
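
Commits 1ccfab4 ("Switched all fmt.Errorf to vterrors.Errorf or Wrapf") and d239ff5 ("Ensure we use Wrapf for ALL error returns") describe the error-handling conversion applied throughout the PR. The following is a minimal sketch of that pattern, not code from this PR: the function, error code, and messages are illustrative only.

package example

import (
	vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
	"vitess.io/vitess/go/vt/vterrors"
)

// stopReplication is a placeholder used only for this illustration.
func stopReplication(alias string) error { return nil }

func demoteMaster(alias string) error {
	if alias == "" {
		// vterrors.Errorf attaches a vtrpc error code to a brand-new error.
		return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "tablet alias must not be empty")
	}
	if err := stopReplication(alias); err != nil {
		// Previously this might have been fmt.Errorf("...: %v", alias, err);
		// vterrors.Wrapf preserves the underlying error and its code while
		// adding context to the message.
		return vterrors.Wrapf(err, "failed to stop replication on tablet %v", alias)
	}
	return nil
}
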
1 change: 1 addition & 0 deletions go.mod
@@ -96,5 +96,6 @@ require (
k8s.io/apiextensions-apiserver v0.17.3
k8s.io/apimachinery v0.17.3
k8s.io/client-go v0.17.3
k8s.io/utils v0.0.0-20191114184206-e782cd3c129f
sigs.k8s.io/yaml v1.1.0
)
6 changes: 3 additions & 3 deletions go/mysql/filepos_gtid.go
@@ -44,8 +44,8 @@ func parseFilePosGTID(s string) (GTID, error) {
}, nil
}

// parseFilePosGTIDSet is registered as a GTIDSet parser.
func parseFilePosGTIDSet(s string) (GTIDSet, error) {
// ParseFilePosGTIDSet is registered as a GTIDSet parser.
func ParseFilePosGTIDSet(s string) (GTIDSet, error) {
gtid, err := parseFilePosGTID(s)
if err != nil {
return nil, err
@@ -156,6 +156,6 @@ func (gtid filePosGTID) Last() string {

func init() {
gtidParsers[FilePosFlavorID] = parseFilePosGTID
gtidSetParsers[FilePosFlavorID] = parseFilePosGTIDSet
gtidSetParsers[FilePosFlavorID] = ParseFilePosGTIDSet
flavors[FilePosFlavorID] = newFilePosFlavor
}
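
The hunk above exports ParseFilePosGTIDSet so callers outside the mysql package can parse file/position coordinates directly. A minimal usage sketch, not part of this PR; the binlog file name and position are illustrative only.

package main

import (
	"fmt"

	"vitess.io/vitess/go/mysql"
)

func main() {
	// Parse a file/position GTID set of the form "<binlog file>:<position>".
	set, err := mysql.ParseFilePosGTIDSet("mysql-bin.000003:1634")
	if err != nil {
		fmt.Println("parse failed:", err)
		return
	}
	fmt.Println("parsed file/pos GTID set:", set.String())
}
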
283 changes: 277 additions & 6 deletions go/test/endtoend/reparent/reparent_test.go
@@ -25,14 +25,13 @@ import (
"testing"
"time"

"vitess.io/vitess/go/vt/log"

"vitess.io/vitess/go/mysql"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"vitess.io/vitess/go/json2"
"vitess.io/vitess/go/mysql"
"vitess.io/vitess/go/test/endtoend/cluster"
"vitess.io/vitess/go/vt/log"
querypb "vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
)
@@ -110,18 +109,34 @@ func TestReparentDownMaster(t *testing.T) {
err = clusterInstance.VtctlclientProcess.ExecuteCommand(
"-action_timeout", "1s",
"PlannedReparentShard",
"-wait_replicas_timeout", "5s",
"-keyspace_shard", keyspaceShard,
"-new_master", tablet62044.Alias)
require.Error(t, err)

// Run forced reparent operation, this should now proceed unimpeded.
err = clusterInstance.VtctlclientProcess.ExecuteCommand(
out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput(
"EmergencyReparentShard",
"-keyspace_shard", keyspaceShard,
"-new_master", tablet62044.Alias,
"-wait_replicas_timeout", "31s")
"-wait_replicas_timeout", "30s")
log.Infof("EmergencyReparentShard Output: %v", out)
require.Nil(t, err)
require.NoError(t, err)

// Check that old master tablet is left around for human intervention.
out, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("Validate")
require.Error(t, err)
require.Contains(t, out, "already has master")

// Now we'll manually remove it, simulating a human cleaning up a dead master.
err = clusterInstance.VtctlclientProcess.ExecuteCommand(
"DeleteTablet",
"-allow_master",
tablet62344.Alias)
require.NoError(t, err)

// Now validate topo is correct.
validateTopology(t, false)

checkMasterTablet(t, tablet62044)
@@ -138,6 +153,113 @@ func TestReparentDownMaster(t *testing.T) {
tablet62344.MysqlctlProcess.InitMysql = false
err = tablet62344.MysqlctlProcess.Start()
require.NoError(t, err)
err = clusterInstance.VtctlclientProcess.InitTablet(tablet62344, tablet62344.Cell, keyspaceName, hostname, shardName)
require.NoError(t, err)

// As there is already a master the new replica will come directly in SERVING state
tablet62344.VttabletProcess.ServingStatus = "SERVING"
// Start the tablet
err = tablet62344.VttabletProcess.Setup()
require.NoError(t, err)

err = checkInsertedValues(ctx, t, tablet62344, 2)
require.NoError(t, err)

// Kill tablets
killTablets(t)
}

func TestReparentNoChoiceDownMaster(t *testing.T) {
defer cluster.PanicHandler(t)
ctx := context.Background()

for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} {
// Create Database
err := tablet.VttabletProcess.CreateDB(keyspaceName)
require.NoError(t, err)

// Reset status, don't wait for the tablet status. We will check it later
tablet.VttabletProcess.ServingStatus = ""

// Start the tablet
err = tablet.VttabletProcess.Setup()
require.NoError(t, err)
}

for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} {
err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"})
require.NoError(t, err)
}

// Init Shard Master
err := clusterInstance.VtctlclientProcess.ExecuteCommand("InitShardMaster",
"-force", fmt.Sprintf("%s/%s", keyspaceName, shardName), tablet62344.Alias)
require.NoError(t, err)

validateTopology(t, true)

// create Tables
runSQL(ctx, t, sqlSchema, tablet62344)

// insert data into the old master, check the connected replica work
insertSQL1 := fmt.Sprintf(insertSQL, 2, 2)
runSQL(ctx, t, insertSQL1, tablet62344)
err = checkInsertedValues(ctx, t, tablet62044, 2)
require.NoError(t, err)
err = checkInsertedValues(ctx, t, tablet41983, 2)
require.NoError(t, err)
err = checkInsertedValues(ctx, t, tablet31981, 2)
require.NoError(t, err)

// Make the current master agent and database unavailable.
err = tablet62344.VttabletProcess.TearDown()
require.NoError(t, err)
err = tablet62344.MysqlctlProcess.Stop()
require.NoError(t, err)

// Run forced reparent operation, this should now proceed unimpeded.
err = clusterInstance.VtctlclientProcess.ExecuteCommand(
"EmergencyReparentShard",
"-keyspace_shard", keyspaceShard,
"-wait_replicas_timeout", "30s")
require.NoError(t, err)

// Check that old master tablet is left around for human intervention.
out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("Validate")
require.Error(t, err)
require.Contains(t, out, "already has master")

// Now we'll manually remove the old master, simulating a human cleaning up a dead master.
err = clusterInstance.VtctlclientProcess.ExecuteCommand(
"DeleteTablet",
"-allow_master",
tablet62344.Alias)
require.NoError(t, err)

// Now validate topo is correct.
validateTopology(t, false)

var newMasterTablet *cluster.Vttablet
for _, tablet := range []*cluster.Vttablet{tablet62044, tablet41983, tablet31981} {
if isHealthyMasterTablet(t, tablet) {
newMasterTablet = tablet
break
}
}
require.NotNil(t, newMasterTablet)
// Validate new master is not old master.
require.NotEqual(t, newMasterTablet.Alias, tablet62344.Alias)

// Check new master has latest transaction.
err = checkInsertedValues(ctx, t, newMasterTablet, 2)
require.NoError(t, err)

// bring back the old master as a replica, check that it catches up
tablet62344.MysqlctlProcess.InitMysql = false
err = tablet62344.MysqlctlProcess.Start()
require.NoError(t, err)
err = clusterInstance.VtctlclientProcess.InitTablet(tablet62344, tablet62344.Cell, keyspaceName, hostname, shardName)
require.NoError(t, err)

// As there is already a master the new replica will come directly in SERVING state
tablet62344.VttabletProcess.ServingStatus = "SERVING"
@@ -152,6 +274,132 @@ func TestReparentDownMaster(t *testing.T) {
killTablets(t)
}

func TestReparentIgnoreReplicas(t *testing.T) {
defer cluster.PanicHandler(t)
ctx := context.Background()

for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} {
// Create Database
err := tablet.VttabletProcess.CreateDB(keyspaceName)
require.Nil(t, err)

// Reset status, don't wait for the tablet status. We will check it later
tablet.VttabletProcess.ServingStatus = ""
// Init Tablet
err = clusterInstance.VtctlclientProcess.InitTablet(&tablet, tablet.Cell, keyspaceName, hostname, shardName)
require.Nil(t, err)

// Start the tablet
err = tablet.VttabletProcess.Setup()
require.Nil(t, err)
}

for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} {
err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"})
require.Nil(t, err)
}

// Init Shard Master.
err := clusterInstance.VtctlclientProcess.ExecuteCommand("InitShardMaster",
"-force", fmt.Sprintf("%s/%s", keyspaceName, shardName), tablet62344.Alias)
require.Nil(t, err)

validateTopology(t, true)

// Create Tables.
runSQL(ctx, t, sqlSchema, tablet62344)

// insert data into the old master, check the connected replica work
insertSQL1 := fmt.Sprintf(insertSQL, 2, 2)
runSQL(ctx, t, insertSQL1, tablet62344)
err = checkInsertedValues(ctx, t, tablet62044, 2)
require.Nil(t, err)
err = checkInsertedValues(ctx, t, tablet41983, 2)
require.Nil(t, err)
err = checkInsertedValues(ctx, t, tablet31981, 2)
require.Nil(t, err)

// Make the current master agent and database unavailable.
err = tablet62344.VttabletProcess.TearDown()
require.Nil(t, err)
err = tablet62344.MysqlctlProcess.Stop()
require.Nil(t, err)

// Take down a replica - this should cause the emergency reparent to fail.
err = tablet41983.VttabletProcess.TearDown()
require.Nil(t, err)
err = tablet41983.MysqlctlProcess.Stop()
require.Nil(t, err)

// We expect this one to fail because we have an unreachable replica
err = clusterInstance.VtctlclientProcess.ExecuteCommand(
"EmergencyReparentShard",
"-keyspace_shard", keyspaceShard,
"-wait_replicas_timeout", "30s")
require.NotNil(t, err)

// Now let's run it again, but set the command to ignore the unreachable replica.
err = clusterInstance.VtctlclientProcess.ExecuteCommand(
"EmergencyReparentShard",
"-keyspace_shard", keyspaceShard,
"-ignore_replicas", tablet41983.Alias,
"-wait_replicas_timeout", "30s")
require.Nil(t, err)

// We'll bring back the replica we took down.
tablet41983.MysqlctlProcess.InitMysql = false
err = tablet41983.MysqlctlProcess.Start()
require.Nil(t, err)
err = clusterInstance.VtctlclientProcess.InitTablet(tablet41983, tablet41983.Cell, keyspaceName, hostname, shardName)
require.Nil(t, err)

// Check that old master tablet is left around for human intervention.
err = clusterInstance.VtctlclientProcess.ExecuteCommand("Validate")
require.Error(t, err)

// Now we'll manually remove the old master, simulating a human cleaning up a dead master.
err = clusterInstance.VtctlclientProcess.ExecuteCommand(
"DeleteTablet",
"-allow_master",
tablet62344.Alias)
require.Nil(t, err)

// Now validate topo is correct.
validateTopology(t, false)

var newMasterTablet *cluster.Vttablet
for _, tablet := range []*cluster.Vttablet{tablet62044, tablet41983, tablet31981} {
if isHealthyMasterTablet(t, tablet) {
newMasterTablet = tablet
break
}
}
require.NotNil(t, newMasterTablet)

// Check new master has latest transaction.
err = checkInsertedValues(ctx, t, newMasterTablet, 2)
require.Nil(t, err)

// bring back the old master as a replica, check that it catches up
tablet62344.MysqlctlProcess.InitMysql = false
err = tablet62344.MysqlctlProcess.Start()
require.Nil(t, err)
err = clusterInstance.VtctlclientProcess.InitTablet(tablet62344, tablet62344.Cell, keyspaceName, hostname, shardName)
require.Nil(t, err)

// As there is already a master the new replica will come directly in SERVING state
tablet62344.VttabletProcess.ServingStatus = "SERVING"
// Start the tablet
err = tablet62344.VttabletProcess.Setup()
require.Nil(t, err)

err = checkInsertedValues(ctx, t, tablet62344, 2)
require.Nil(t, err)

// Kill tablets
killTablets(t)
}

func TestReparentCrossCell(t *testing.T) {

defer cluster.PanicHandler(t)
@@ -820,7 +1068,30 @@ func checkMasterTablet(t *testing.T, tablet *cluster.Vttablet) {
assert.True(t, streamHealthResponse.GetServing())
tabletType := streamHealthResponse.GetTarget().GetTabletType()
assert.Equal(t, topodatapb.TabletType_MASTER, tabletType)
}

// isHealthyMasterTablet will return if tablet is master AND healthy.
func isHealthyMasterTablet(t *testing.T, tablet *cluster.Vttablet) bool {
result, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", tablet.Alias)
require.Nil(t, err)
var tabletInfo topodatapb.Tablet
err = json2.Unmarshal([]byte(result), &tabletInfo)
require.Nil(t, err)
if tabletInfo.GetType() != topodatapb.TabletType_MASTER {
return false
}

// make sure the health stream is updated
result, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("VtTabletStreamHealth", "-count", "1", tablet.Alias)
require.Nil(t, err)
var streamHealthResponse querypb.StreamHealthResponse

err = json2.Unmarshal([]byte(result), &streamHealthResponse)
require.Nil(t, err)

assert.True(t, streamHealthResponse.GetServing())
tabletType := streamHealthResponse.GetTarget().GetTabletType()
return tabletType == topodatapb.TabletType_MASTER
}

func checkInsertedValues(ctx context.Context, t *testing.T, tablet *cluster.Vttablet, index int) error {
12 changes: 9 additions & 3 deletions go/vt/mysqlctl/fakemysqldaemon/fakemysqldaemon.go
@@ -65,6 +65,9 @@ type FakeMysqlDaemon struct {
// and ReplicationStatus
CurrentMasterPosition mysql.Position

// CurrentMasterFilePosition is used to determine the executed file based positioning of the master.
CurrentMasterFilePosition mysql.Position

// ReplicationStatusError is used by ReplicationStatus
ReplicationStatusError error

@@ -225,8 +228,10 @@ func (fmd *FakeMysqlDaemon) ReplicationStatus() (mysql.ReplicationStatus, error)
return mysql.ReplicationStatus{}, fmd.ReplicationStatusError
}
return mysql.ReplicationStatus{
Position: fmd.CurrentMasterPosition,
SecondsBehindMaster: fmd.SecondsBehindMaster,
Position: fmd.CurrentMasterPosition,
FilePosition: fmd.CurrentMasterFilePosition,
FileRelayLogPosition: fmd.CurrentMasterFilePosition,
SecondsBehindMaster: fmd.SecondsBehindMaster,
// implemented as AND to avoid changing all tests that were
// previously using Replicating = false
IOThreadRunning: fmd.Replicating && fmd.IOThreadRunning,
@@ -242,7 +247,8 @@ func (fmd *FakeMysqlDaemon) MasterStatus(ctx context.Context) (mysql.MasterStatu
return mysql.MasterStatus{}, fmd.MasterStatusError
}
return mysql.MasterStatus{
Position: fmd.CurrentMasterPosition,
Position: fmd.CurrentMasterPosition,
FilePosition: fmd.CurrentMasterFilePosition,
}, nil
}

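The new CurrentMasterFilePosition field lets the fake daemon report file-based coordinates alongside GTID positions. Below is a minimal sketch, not part of this PR, of how a unit test might seed it; the test name and position value are illustrative only.

package fakemysqldaemon_test

import (
	"testing"

	"vitess.io/vitess/go/mysql"
	"vitess.io/vitess/go/vt/mysqlctl/fakemysqldaemon"
)

func TestFakeDaemonReportsFilePosition(t *testing.T) {
	gtidSet, err := mysql.ParseFilePosGTIDSet("mysql-bin.000002:2000")
	if err != nil {
		t.Fatal(err)
	}
	pos := mysql.Position{GTIDSet: gtidSet}

	fmd := &fakemysqldaemon.FakeMysqlDaemon{
		CurrentMasterPosition:     pos,
		CurrentMasterFilePosition: pos,
		Replicating:               true,
		IOThreadRunning:           true,
	}

	status, err := fmd.ReplicationStatus()
	if err != nil {
		t.Fatal(err)
	}
	// Both the executed file position and the relay-log file position are
	// served from CurrentMasterFilePosition, per the diff above.
	if !status.FilePosition.Equal(status.FileRelayLogPosition) {
		t.Errorf("file position %v != relay log position %v",
			status.FilePosition, status.FileRelayLogPosition)
	}
}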