diff --git a/changelogs/unreleased/5319-lyndon b/changelogs/unreleased/5319-lyndon new file mode 100644 index 0000000000..5c87431c56 --- /dev/null +++ b/changelogs/unreleased/5319-lyndon @@ -0,0 +1 @@ +Fix issues 4874 and 4752: check that the daemonset pod is running on the node where the workload pod resides before running the PVB for the pod \ No newline at end of file diff --git a/pkg/cmd/server/server.go b/pkg/cmd/server/server.go index 06ad6c2a32..54cfe62034 100644 --- a/pkg/cmd/server/server.go +++ b/pkg/cmd/server/server.go @@ -533,11 +533,11 @@ var defaultRestorePriorities = []string{ } func (s *server) initRestic() error { - // warn if restic daemonset does not exist + // warn if node agent does not exist if err := nodeagent.IsRunning(s.ctx, s.kubeClient, s.namespace); err == nodeagent.DaemonsetNotFound { - s.logger.Warn("Velero restic daemonset not found; restic backups/restores will not work until it's created") + s.logger.Warn("Velero node agent not found; pod volume backups/restores will not work until it's created") } else if err != nil { - s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero restic daemonset") + s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero node agent") } // ensure the repo key secret is set up diff --git a/pkg/nodeagent/node_agent.go b/pkg/nodeagent/node_agent.go index f4fcd92500..fcfd10931f 100644 --- a/pkg/nodeagent/node_agent.go +++ b/pkg/nodeagent/node_agent.go @@ -23,13 +23,15 @@ import ( "github.com/pkg/errors" "k8s.io/client-go/kubernetes" + "github.com/vmware-tanzu/velero/pkg/util/kube" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" corev1client "k8s.io/client-go/kubernetes/typed/core/v1" ) const ( - // daemonSet is the name of the Velero restic daemonset. + // daemonSet is the name of the Velero node agent daemonset. 
daemonSet = "restic" ) @@ -48,7 +50,7 @@ func IsRunning(ctx context.Context, kubeClient kubernetes.Interface, namespace s } } -// IsRunningInNode checks if the node agent daemonset pod is running properly in a specified node. If not, return the error found +// IsRunningInNode checks if the node agent pod is running properly in a specified node. If not, return the error found func IsRunningInNode(ctx context.Context, namespace string, nodeName string, podClient corev1client.PodsGetter) error { if nodeName == "" { return errors.New("node name is empty") @@ -60,10 +62,14 @@ func IsRunningInNode(ctx context.Context, namespace string, nodeName string, pod } for _, pod := range pods.Items { + if kube.IsPodRunning(&pod) != nil { + continue + } + if pod.Spec.NodeName == nodeName { return nil } } - return errors.Errorf("daemonset pod not found in node %s", nodeName) + return errors.Errorf("daemonset pod not found in running state in node %s", nodeName) } diff --git a/pkg/podvolume/backupper.go b/pkg/podvolume/backupper.go index 476b3e4f17..3a5db6ceb6 100644 --- a/pkg/podvolume/backupper.go +++ b/pkg/podvolume/backupper.go @@ -35,6 +35,7 @@ import ( "github.com/vmware-tanzu/velero/pkg/nodeagent" "github.com/vmware-tanzu/velero/pkg/repository" "github.com/vmware-tanzu/velero/pkg/util/boolptr" + "github.com/vmware-tanzu/velero/pkg/util/kube" ) // Backupper can execute restic backups of volumes in a pod. @@ -125,7 +126,7 @@ func (b *backupper) BackupPodVolumes(backup *velerov1api.Backup, pod *corev1api. 
return nil, []error{err} } - err = IsPodQualified(pod) + err = kube.IsPodRunning(pod) if err != nil { return nil, []error{err} } diff --git a/pkg/podvolume/util.go b/pkg/podvolume/util.go index 3652e50719..a7ab78245a 100644 --- a/pkg/podvolume/util.go +++ b/pkg/podvolume/util.go @@ -22,7 +22,6 @@ import ( corev1api "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/pkg/errors" velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" "github.com/vmware-tanzu/velero/pkg/repository" "github.com/vmware-tanzu/velero/pkg/uploader" @@ -296,17 +295,3 @@ func GetPodVolumesUsingRestic(pod *corev1api.Pod, defaultVolumesToRestic bool) [ } return podVolumes } - -// IsPodQualified checks if the pod's status is qualified for a PVB/PVR to backup/restore its volumes. -// If no, return the error found -func IsPodQualified(pod *corev1api.Pod) error { - if pod.Spec.NodeName == "" { - return errors.Errorf("pod is not scheduled, name=%s, namespace=%s, status=%s", pod.Name, pod.Namespace, pod.Status.Phase) - } - - if pod.Status.Phase != corev1api.PodRunning { - return errors.Errorf("pod is not running, name=%s, namespace=%s, status=%s", pod.Name, pod.Namespace, pod.Status.Phase) - } - - return nil -} diff --git a/pkg/util/kube/pod.go b/pkg/util/kube/pod.go new file mode 100644 index 0000000000..4e589c4f04 --- /dev/null +++ b/pkg/util/kube/pod.go @@ -0,0 +1,39 @@ +/* +Copyright The Velero Contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package kube + +import ( + "github.com/pkg/errors" + corev1api "k8s.io/api/core/v1" +) + +// IsPodRunning reports whether the pod is scheduled to a node, in the Running phase, and not marked for deletion. +// It returns nil when all three checks pass; otherwise it returns an error describing the first check that failed. +func IsPodRunning(pod *corev1api.Pod) error { + if pod.Spec.NodeName == "" { + return errors.Errorf("pod is not scheduled, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase) + } + + if pod.Status.Phase != corev1api.PodRunning { + return errors.Errorf("pod is not running, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase) + } + + if pod.DeletionTimestamp != nil { + return errors.Errorf("pod is being terminated, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase) + } + + return nil +}