Skip to content

Commit 85e96a1

Browse files
jswxstwoninowang
andauthored
fix: correct finding the closest ancestor retry node. Fixes #14517 (#14576)
Signed-off-by: oninowang <oninowang@tencent.com> Co-authored-by: oninowang <oninowang@tencent.com>
1 parent 162e645 commit 85e96a1

File tree

4 files changed

+71
-67
lines changed

4 files changed

+71
-67
lines changed

pkg/apis/workflow/v1alpha1/workflow_types.go

Lines changed: 42 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1771,20 +1771,24 @@ var _ ArgumentsProvider = &Arguments{}
17711771

17721772
type Nodes map[string]NodeStatus
17731773

1774-
func (in Nodes) FindByDisplayName(name string) *NodeStatus {
1775-
return in.Find(NodeWithDisplayName(name))
1774+
func (n Nodes) FindByDisplayName(name string) *NodeStatus {
1775+
return n.Find(NodeWithDisplayName(name))
17761776
}
17771777

1778-
func (in Nodes) FindByName(name string) *NodeStatus {
1779-
return in.Find(NodeWithName(name))
1778+
func (n Nodes) FindByName(name string) *NodeStatus {
1779+
return n.Find(NodeWithName(name))
17801780
}
17811781

1782-
func (in Nodes) Any(f func(NodeStatus) bool) bool {
1783-
return in.Find(f) != nil
1782+
func (n Nodes) FindByChild(childID string) *NodeStatus {
1783+
return n.Find(NodeWithChild(childID))
17841784
}
17851785

1786-
func (in Nodes) Find(f func(NodeStatus) bool) *NodeStatus {
1787-
for _, i := range in {
1786+
func (n Nodes) Any(f func(NodeStatus) bool) bool {
1787+
return n.Find(f) != nil
1788+
}
1789+
1790+
func (n Nodes) Find(f func(NodeStatus) bool) *NodeStatus {
1791+
for _, i := range n {
17881792
if f(i) {
17891793
return &i
17901794
}
@@ -1794,57 +1798,57 @@ func (in Nodes) Find(f func(NodeStatus) bool) *NodeStatus {
17941798

17951799
// Get a NodeStatus from the hashmap of Nodes.
17961800
// Return a nil along with an error if non existent.
1797-
func (in Nodes) Get(key string) (*NodeStatus, error) {
1798-
val, ok := in[key]
1801+
func (n Nodes) Get(key string) (*NodeStatus, error) {
1802+
val, ok := n[key]
17991803
if !ok {
18001804
return nil, fmt.Errorf("key was not found for %s", key)
18011805
}
18021806
return &val, nil
18031807
}
18041808

18051809
// Check if the Nodes map has a key entry
1806-
func (in Nodes) Has(key string) bool {
1807-
_, err := in.Get(key)
1810+
func (n Nodes) Has(key string) bool {
1811+
_, err := n.Get(key)
18081812
return err == nil
18091813
}
18101814

18111815
// Get the Phase of a Node
1812-
func (in Nodes) GetPhase(key string) (*NodePhase, error) {
1813-
val, err := in.Get(key)
1816+
func (n Nodes) GetPhase(key string) (*NodePhase, error) {
1817+
val, err := n.Get(key)
18141818
if err != nil {
18151819
return nil, err
18161820
}
18171821
return &val.Phase, nil
18181822
}
18191823

18201824
// Set the status of a node by key
1821-
func (in Nodes) Set(key string, status NodeStatus) {
1825+
func (n Nodes) Set(key string, status NodeStatus) {
18221826
if status.Name == "" {
18231827
log.Warnf("Name was not set for key %s", key)
18241828
}
18251829
if status.ID == "" {
18261830
log.Warnf("ID was not set for key %s", key)
18271831
}
1828-
_, ok := in[key]
1832+
_, ok := n[key]
18291833
if ok {
18301834
log.Tracef("Changing NodeStatus for %s to %+v", key, status)
18311835
}
1832-
in[key] = status
1836+
n[key] = status
18331837
}
18341838

18351839
// Delete a node from the Nodes by key
1836-
func (in Nodes) Delete(key string) {
1837-
has := in.Has(key)
1840+
func (n Nodes) Delete(key string) {
1841+
has := n.Has(key)
18381842
if !has {
18391843
log.Warnf("Trying to delete non existent key %s", key)
18401844
return
18411845
}
1842-
delete(in, key)
1846+
delete(n, key)
18431847
}
18441848

18451849
// Get the name of a node by key
1846-
func (in Nodes) GetName(key string) (string, error) {
1847-
val, err := in.Get(key)
1850+
func (n Nodes) GetName(key string) (string, error) {
1851+
val, err := n.Get(key)
18481852
if err != nil {
18491853
return "", err
18501854
}
@@ -1858,6 +1862,12 @@ func NodeWithDisplayName(name string) func(n NodeStatus) bool {
18581862
return func(n NodeStatus) bool { return n.DisplayName == name }
18591863
}
18601864

1865+
func NodeWithChild(childID string) func(n NodeStatus) bool {
1866+
return func(n NodeStatus) bool {
1867+
return n.HasChild(childID)
1868+
}
1869+
}
1870+
18611871
func FailedPodNode(n NodeStatus) bool {
18621872
return n.Type == NodeTypePod && n.Phase == NodeFailed
18631873
}
@@ -1867,14 +1877,14 @@ func SucceededPodNode(n NodeStatus) bool {
18671877
}
18681878

18691879
// Children returns the children of the parent.
1870-
func (in Nodes) Children(parentNodeID string) Nodes {
1880+
func (n Nodes) Children(parentNodeID string) Nodes {
18711881
childNodes := make(Nodes)
1872-
parentNode, ok := in[parentNodeID]
1882+
parentNode, ok := n[parentNodeID]
18731883
if !ok {
18741884
return childNodes
18751885
}
18761886
for _, childID := range parentNode.Children {
1877-
if childNode, ok := in[childID]; ok {
1887+
if childNode, ok := n[childID]; ok {
18781888
childNodes[childID] = childNode
18791889
}
18801890
}
@@ -1883,8 +1893,8 @@ func (in Nodes) Children(parentNodeID string) Nodes {
18831893

18841894
// NestedChildrenStatus takes in a nodeID and returns all its children, this involves a tree search using DFS.
18851895
// This is needed to mark all children nodes as failed for example.
1886-
func (in Nodes) NestedChildrenStatus(parentNodeID string) ([]NodeStatus, error) {
1887-
parentNode, ok := in[parentNodeID]
1896+
func (n Nodes) NestedChildrenStatus(parentNodeID string) ([]NodeStatus, error) {
1897+
parentNode, ok := n[parentNodeID]
18881898
if !ok {
18891899
return nil, fmt.Errorf("could not find %s in nodes when searching for nested children", parentNodeID)
18901900
}
@@ -1896,7 +1906,7 @@ func (in Nodes) NestedChildrenStatus(parentNodeID string) ([]NodeStatus, error)
18961906
childNode := toexplore[0]
18971907
toexplore = toexplore[1:]
18981908
for _, nodeID := range childNode.Children {
1899-
toexplore = append(toexplore, in[nodeID])
1909+
toexplore = append(toexplore, n[nodeID])
19001910
}
19011911

19021912
if childNode.Name == parentNode.Name {
@@ -1909,9 +1919,9 @@ func (in Nodes) NestedChildrenStatus(parentNodeID string) ([]NodeStatus, error)
19091919
}
19101920

19111921
// Filter returns the subset of the nodes that match the predicate, e.g. only failed nodes
1912-
func (in Nodes) Filter(predicate func(NodeStatus) bool) Nodes {
1922+
func (n Nodes) Filter(predicate func(NodeStatus) bool) Nodes {
19131923
filteredNodes := make(Nodes)
1914-
for _, node := range in {
1924+
for _, node := range n {
19151925
if predicate(node) {
19161926
filteredNodes[node.ID] = node
19171927
}
@@ -1920,9 +1930,9 @@ func (in Nodes) Filter(predicate func(NodeStatus) bool) Nodes {
19201930
}
19211931

19221932
// Map maps the nodes to new values, e.g. `x.Hostname`
1923-
func (in Nodes) Map(f func(x NodeStatus) interface{}) map[string]interface{} {
1933+
func (n Nodes) Map(f func(x NodeStatus) interface{}) map[string]interface{} {
19241934
values := make(map[string]interface{})
1925-
for _, node := range in {
1935+
for _, node := range n {
19261936
values[node.ID] = f(node)
19271937
}
19281938
return values

workflow/controller/retry_tweak.go

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,21 +11,14 @@ import (
1111
// RetryTweak is a 2nd order function interface for tweaking the retry
1212
type RetryTweak = func(retryStrategy wfv1.RetryStrategy, nodes wfv1.Nodes, pod *apiv1.Pod)
1313

14-
// FindRetryNode locates the closes retry node ancestor to nodeID
14+
// FindRetryNode locates the closest retry node ancestor to nodeID
1515
func FindRetryNode(nodes wfv1.Nodes, nodeID string) *wfv1.NodeStatus {
16+
if parentNode := nodes.FindByChild(nodeID); parentNode != nil && parentNode.Type == wfv1.NodeTypeRetry {
17+
return parentNode
18+
}
1619
boundaryID := nodes[nodeID].BoundaryID
17-
boundaryNode := nodes[boundaryID]
18-
for _, node := range nodes {
19-
if node.Type != wfv1.NodeTypeRetry {
20-
continue
21-
}
22-
if boundaryID == "" && node.HasChild(nodeID) {
23-
return &node
24-
} else if boundaryNode.TemplateName != "" && node.TemplateName == boundaryNode.TemplateName {
25-
return &node
26-
} else if boundaryNode.TemplateRef != nil && node.TemplateRef != nil && node.TemplateRef.Name == boundaryNode.TemplateRef.Name && node.TemplateRef.Template == boundaryNode.TemplateRef.Template {
27-
return &node
28-
}
20+
if parentNode := nodes.FindByChild(boundaryID); parentNode != nil && parentNode.Type == wfv1.NodeTypeRetry {
21+
return parentNode
2922
}
3023
return nil
3124
}
@@ -36,10 +29,12 @@ func RetryOnDifferentHost(retryNodeName string) RetryTweak {
3629
if retryStrategy.Affinity == nil {
3730
return
3831
}
39-
hostNames := wfretry.GetFailHosts(nodes, retryNodeName)
40-
hostLabel := env.GetString("RETRY_HOST_NAME_LABEL_KEY", "kubernetes.io/hostname")
41-
if hostLabel != "" && len(hostNames) > 0 {
42-
pod.Spec.Affinity = wfretry.AddHostnamesToAffinity(hostLabel, hostNames, pod.Spec.Affinity)
32+
if retryStrategy.Affinity.NodeAntiAffinity != nil {
33+
hostNames := wfretry.GetFailHosts(nodes, retryNodeName)
34+
hostLabel := env.GetString("RETRY_HOST_NAME_LABEL_KEY", "kubernetes.io/hostname")
35+
if hostLabel != "" && len(hostNames) > 0 {
36+
pod.Spec.Affinity = wfretry.AddHostnamesToAffinity(hostLabel, hostNames, pod.Spec.Affinity)
37+
}
4338
}
4439
}
4540
}

workflow/controller/retry_tweak_test.go

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,66 +15,67 @@ func TestFindRetryNode(t *testing.T) {
1515
Type: wfv1.NodeTypeSteps,
1616
Phase: wfv1.NodeRunning,
1717
BoundaryID: "",
18-
Children: []string{"B1", "B2"},
18+
Children: []string{"B1", "B2", "B3"},
1919
TemplateName: "tmpl1",
2020
},
2121
"B1": wfv1.NodeStatus{
2222
ID: "B1",
2323
Type: wfv1.NodeTypeSkipped,
2424
Phase: wfv1.NodeSkipped,
25-
BoundaryID: "",
25+
BoundaryID: "A1",
2626
Children: []string{},
2727
TemplateName: "tmpl2",
2828
},
29-
// retry node
29+
// retry node containing steps
3030
"B2": wfv1.NodeStatus{
3131
ID: "B2",
3232
Type: wfv1.NodeTypeRetry,
3333
Phase: wfv1.NodeRunning,
34-
BoundaryID: "",
34+
BoundaryID: "A1",
3535
Children: []string{"C1"},
3636
TemplateName: "tmpl1",
3737
},
3838
"C1": wfv1.NodeStatus{
3939
ID: "C1",
4040
Type: wfv1.NodeTypeSteps,
4141
Phase: wfv1.NodeRunning,
42-
BoundaryID: "",
42+
BoundaryID: "A1",
4343
Children: []string{"D1", "D2"},
4444
TemplateName: "tmpl2",
4545
},
4646
"D1": wfv1.NodeStatus{
4747
ID: "D1",
4848
Type: wfv1.NodeTypeSkipped,
4949
Phase: wfv1.NodeSkipped,
50-
BoundaryID: "A1",
50+
BoundaryID: "C1",
5151
Children: []string{},
5252
TemplateName: "tmpl2",
5353
},
5454
"D2": wfv1.NodeStatus{
5555
ID: "D2",
5656
Type: wfv1.NodeTypePod,
5757
Phase: wfv1.NodeRunning,
58-
BoundaryID: "A1",
58+
BoundaryID: "C1",
5959
Children: []string{},
6060
TemplateName: "tmpl2",
6161
},
62-
"E1": wfv1.NodeStatus{
63-
ID: "E1",
62+
// retry node containing single step and templteRef
63+
"B3": wfv1.NodeStatus{
64+
ID: "B3",
6465
Type: wfv1.NodeTypeRetry,
6566
Phase: wfv1.NodeRunning,
6667
BoundaryID: "A1",
67-
Children: []string{},
68+
Children: []string{"C2"},
6869
TemplateRef: &wfv1.TemplateRef{
6970
Name: "tmpl1",
7071
Template: "tmpl3",
7172
},
7273
},
73-
"E2": wfv1.NodeStatus{
74-
ID: "E2",
74+
"C2": wfv1.NodeStatus{
75+
ID: "C2",
7576
Type: wfv1.NodeTypePod,
7677
Phase: wfv1.NodeRunning,
77-
BoundaryID: "E1",
78+
BoundaryID: "A1",
7879
Children: []string{},
7980
TemplateName: "tmpl2",
8081
},
@@ -88,7 +89,7 @@ func TestFindRetryNode(t *testing.T) {
8889
assert.Nil(t, a)
8990
})
9091
t.Run("Expect to find retry node has TemplateRef", func(t *testing.T) {
91-
node := allNodes["E1"]
92-
assert.Equal(t, FindRetryNode(allNodes, "E2"), &node)
92+
node := allNodes["B3"]
93+
assert.Equal(t, FindRetryNode(allNodes, "C2"), &node)
9394
})
9495
}

workflow/util/util.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1169,9 +1169,7 @@ func FormulateRetryWorkflow(ctx context.Context, wf *wfv1.Workflow, restartSucce
11691169
if node.FailedOrError() && isExecutionNodeType(node.Type) {
11701170
// Check its parent if current node is retry node
11711171
if node.NodeFlag != nil && node.NodeFlag.Retried {
1172-
node = *wf.Status.Nodes.Find(func(nodeStatus wfv1.NodeStatus) bool {
1173-
return nodeStatus.HasChild(node.ID)
1174-
})
1172+
node = *wf.Status.Nodes.FindByChild(nodeID)
11751173
}
11761174
if !isDescendantNodeSucceeded(wf, node, deleteNodesMap) {
11771175
failed[nodeID] = true

0 commit comments

Comments
 (0)