Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

in sync mode select only syncStandby as switchover candidate #2278

Merged
merged 4 commits into from
Apr 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
46 changes: 29 additions & 17 deletions pkg/cluster/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -469,39 +469,51 @@ func (c *Cluster) getSwitchoverCandidate(master *v1.Pod) (spec.NamespacedName, e
func() (bool, error) {
var err error
members, err = c.patroni.GetClusterMembers(master)

if err != nil {
return false, err
}

// look for SyncStandby candidates (which also implies pod is in running state)
for _, member := range members {
if PostgresRole(member.Role) == SyncStandby {
syncCandidates = append(syncCandidates, member)
}
}

// if synchronous mode is enabled and no SyncStandy was found
// return false for retry - cannot failover with no sync candidate
if c.Spec.Patroni.SynchronousMode && len(syncCandidates) == 0 {
c.logger.Warnf("no sync standby found - retrying fetching cluster members")
return false, nil
}

return true, nil
},
)
if err != nil {
return spec.NamespacedName{}, fmt.Errorf("failed to get Patroni cluster members: %s", err)
}

for _, member := range members {
if PostgresRole(member.Role) != Leader && PostgresRole(member.Role) != StandbyLeader && member.State == "running" {
candidates = append(candidates, member)
if PostgresRole(member.Role) == SyncStandby {
syncCandidates = append(syncCandidates, member)
}
}
}

// pick candidate with lowest lag
// if sync_standby replicas were found assume synchronous_mode is enabled and ignore other candidates list
if len(syncCandidates) > 0 {
sort.Slice(syncCandidates, func(i, j int) bool {
return syncCandidates[i].Lag < syncCandidates[j].Lag
})
return spec.NamespacedName{Namespace: master.Namespace, Name: syncCandidates[0].Name}, nil
}
if len(candidates) > 0 {
sort.Slice(candidates, func(i, j int) bool {
return candidates[i].Lag < candidates[j].Lag
})
return spec.NamespacedName{Namespace: master.Namespace, Name: candidates[0].Name}, nil
} else {
// in asynchronous mode find running replicas
for _, member := range members {
if PostgresRole(member.Role) != Leader && PostgresRole(member.Role) != StandbyLeader && member.State == "running" {
candidates = append(candidates, member)
}
}

if len(candidates) > 0 {
sort.Slice(candidates, func(i, j int) bool {
return candidates[i].Lag < candidates[j].Lag
})
return spec.NamespacedName{Namespace: master.Namespace, Name: candidates[0].Name}, nil
}
}

return spec.NamespacedName{}, fmt.Errorf("no switchover candidate found")
Expand Down
13 changes: 13 additions & 0 deletions pkg/cluster/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,30 +36,42 @@ func TestGetSwitchoverCandidate(t *testing.T) {
tests := []struct {
subtest string
clusterJson string
syncModeEnabled bool
expectedCandidate spec.NamespacedName
expectedError error
}{
{
subtest: "choose sync_standby over replica",
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "sync_standby", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 0}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 0}]}`,
syncModeEnabled: true,
expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-1"},
expectedError: nil,
},
{
subtest: "no running sync_standby available",
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 0}]}`,
syncModeEnabled: true,
expectedCandidate: spec.NamespacedName{},
expectedError: fmt.Errorf("failed to get Patroni cluster members: unexpected end of JSON input"),
},
{
subtest: "choose replica with lowest lag",
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 2}]}`,
syncModeEnabled: false,
expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-2"},
expectedError: nil,
},
{
subtest: "choose first replica when lag is equal evrywhere",
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 5}]}`,
syncModeEnabled: false,
expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-1"},
expectedError: nil,
},
{
subtest: "no running replica available",
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 2}, {"name": "acid-test-cluster-1", "role": "replica", "state": "starting", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 2}]}`,
syncModeEnabled: false,
expectedCandidate: spec.NamespacedName{},
expectedError: fmt.Errorf("no switchover candidate found"),
},
Expand All @@ -81,6 +93,7 @@ func TestGetSwitchoverCandidate(t *testing.T) {
cluster.patroni = p
mockMasterPod := newMockPod("192.168.100.1")
mockMasterPod.Namespace = namespace
cluster.Spec.Patroni.SynchronousMode = tt.syncModeEnabled

candidate, err := cluster.getSwitchoverCandidate(mockMasterPod)
if err != nil && err.Error() != tt.expectedError.Error() {
Expand Down
2 changes: 1 addition & 1 deletion pkg/cluster/sync.go
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,7 @@ func (c *Cluster) syncPatroniConfig(pods []v1.Pod, requiredPatroniConfig acidv1.
// get Postgres config, compare with manifest and update via Patroni PATCH endpoint if it differs
for i, pod := range pods {
podName := util.NameFromMeta(pods[i].ObjectMeta)
effectivePatroniConfig, effectivePgParameters, err = c.patroni.GetConfig(&pod)
effectivePatroniConfig, effectivePgParameters, err = c.getPatroniConfig(&pod)
if err != nil {
errors = append(errors, fmt.Sprintf("could not get Postgres config from pod %s: %v", podName, err))
continue
Expand Down