Merge pull request openshift#24786 from jottofar/ota-141-add-ci-alerts
Bug 1828427: Add CI test to check for critical alerts post upgrade success
openshift-merge-robot committed Apr 27, 2020
2 parents 83fb2e9 + 3a92334 commit 3be2bc2
Showing 6 changed files with 341 additions and 118 deletions.
103 changes: 103 additions & 0 deletions test/e2e/upgrade/alert/alert.go
@@ -0,0 +1,103 @@
package alert

import (
	"context"
	"fmt"
	"time"

	g "github.com/onsi/ginkgo"

	exutil "github.com/openshift/origin/test/extended/util"
	helper "github.com/openshift/origin/test/extended/util/prometheus"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/kubernetes/test/e2e/framework"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	"k8s.io/kubernetes/test/e2e/upgrades"
)

const (
	// Delay after upgrade is complete before checking for critical alerts
	alertCheckSleepMinutes = 5
	alertCheckSleep        = alertCheckSleepMinutes * time.Minute

	// Previous period in which to check for critical alerts
	alertPeriodCheckMinutes = 1
)

// UpgradeTest runs post-upgrade after alertCheckSleep delay and tests if any critical alerts are firing.
type UpgradeTest struct {
	url         string
	bearerToken string
	oc          *exutil.CLI
}

func (UpgradeTest) Name() string { return "check-for-critical-alerts" }
func (UpgradeTest) DisplayName() string {
	return "Check if critical alerts are firing after upgrade success"
}

// Setup creates parameters to query Prometheus
func (t *UpgradeTest) Setup(f *framework.Framework) {
	g.By("Setting up post-upgrade alert test")

	url, bearerToken, oc, ok := helper.ExpectPrometheus(f)
	if !ok {
		framework.Failf("Prometheus could not be located on this cluster, failing test %s", t.Name())
	}
	t.url = url
	t.bearerToken = bearerToken
	t.oc = oc
	framework.Logf("Post-upgrade alert test setup complete")
}

// Test checks if any critical alerts are firing.
func (t *UpgradeTest) Test(f *framework.Framework, done <-chan struct{}, upgrade upgrades.UpgradeType) {
	g.By("Checking for critical alerts")

	// Recover current test if it fails so test suite can complete
	defer g.GinkgoRecover()

	// Block until upgrade is done
	g.By("Waiting for upgrade to finish before checking for critical alerts")
	<-done

	ctx, cancel := context.WithCancel(context.Background())

	// Additional delay after upgrade completion
	g.By("Waiting before checking for critical alerts")
	time.Sleep(alertCheckSleep)
	cancel()

	if helper.TestUnsupportedAllowVersionSkew() {
		e2eskipper.Skipf("Test is disabled to allow cluster components to have different versions, and skewed versions trigger multiple other alerts")
	}
	t.oc.SetupProject()
	ns := t.oc.Namespace()
	execPod := exutil.CreateCentosExecPodOrFail(t.oc.AdminKubeClient(), ns, "execpod", nil)
	defer func() {
		t.oc.AdminKubeClient().CoreV1().Pods(ns).Delete(ctx, execPod.Name, *metav1.NewDeleteOptions(1))
	}()

	// Query to check if Prometheus has been up and running for entire post-upgrade
	// period by verifying Watchdog alert has been in firing state
	watchdogQuery := fmt.Sprintf(`count_over_time(ALERTS{alertstate="firing",alertname="Watchdog", severity="none"}[%dm])`, alertCheckSleepMinutes)
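	// With alertCheckSleepMinutes = 5, the query above expands to:
	//   count_over_time(ALERTS{alertstate="firing",alertname="Watchdog", severity="none"}[5m])
	// i.e. it returns data only if the Watchdog alert has been firing during the post-upgrade wait.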

	// Query to check for any critical severity alerts that have occurred within the last alertPeriodCheckMinutes.
	// TODO Remove KubeAPIErrorBudgetBurn from ignore list once Bug 1821661 is fixed.
	criticalAlertQuery := fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh|KubeAPIErrorBudgetBurn",alertstate="firing",severity="critical"}[%dm]) >= 1`, alertPeriodCheckMinutes)
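	// With alertPeriodCheckMinutes = 1, this expands to:
	//   count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh|KubeAPIErrorBudgetBurn",alertstate="firing",severity="critical"}[1m]) >= 1
	// i.e. any non-ignored critical alert seen firing in the last minute causes the test to fail.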

	tests := map[string]bool{
		watchdogQuery:      true,
		criticalAlertQuery: false,
	}

	helper.RunQueries(tests, t.oc, ns, execPod.Name, t.url, t.bearerToken)

	framework.Logf("No critical alerts firing post-upgrade")
}

// Teardown cleans up any remaining resources.
func (t *UpgradeTest) Teardown(f *framework.Framework) {
	// rely on the namespace deletion to clean up everything
}
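The tests map above pairs each PromQL query with the outcome it expects: true means the query must return at least one sample (the Watchdog check), false means it must return none (the critical-alert check). The helper package that provides RunQueries is among the changed files not shown in this excerpt, so the snippet below is only a minimal sketch of that contract; runQueriesSketch and queryFunc are illustrative names and assumptions, not the real helper API.

package prometheus // sketch only, not the actual test/extended/util/prometheus helper

import "fmt"

// runQueriesSketch evaluates each query with queryFunc (assumed to return the
// number of samples the query produced) and compares the result with the
// expected outcome: true means "must return data", false means "must return none".
func runQueriesSketch(tests map[string]bool, queryFunc func(query string) (int, error)) error {
	for query, expectData := range tests {
		n, err := queryFunc(query)
		if err != nil {
			return fmt.Errorf("query %q failed: %v", query, err)
		}
		if expectData && n == 0 {
			return fmt.Errorf("expected results from query %q, got none", query)
		}
		if !expectData && n > 0 {
			return fmt.Errorf("expected no results from query %q, got %d samples", query, n)
		}
	}
	return nil
}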
2 changes: 2 additions & 0 deletions test/e2e/upgrade/upgrade.go
@@ -26,6 +26,7 @@ import (

configv1 "github.com/openshift/api/config/v1"
configv1client "github.com/openshift/client-go/config/clientset/versioned"
"github.com/openshift/origin/test/e2e/upgrade/alert"
"github.com/openshift/origin/test/e2e/upgrade/service"
"github.com/openshift/origin/test/extended/util/disruption"
"github.com/openshift/origin/test/extended/util/disruption/controlplane"
@@ -36,6 +37,7 @@ func AllTests() []upgrades.Test {
return []upgrades.Test{
&controlplane.KubeAvailableTest{},
&controlplane.OpenShiftAvailableTest{},
&alert.UpgradeTest{},
&frontends.AvailableTest{},
&service.UpgradeTest{},
&upgrades.SecretUpgradeTest{},
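The new alert.UpgradeTest entry is driven through the same done-channel protocol as the other tests in this list: the suite calls Setup before the upgrade, runs Test concurrently while the upgrade proceeds, closes done once the upgrade finishes (which is what the <-done in alert.go waits on), and then calls Teardown. The runner itself is not part of this commit, so the sketch below only illustrates that contract; the simplified interface, its method signatures, and runUpgrade are assumptions, not the real disruption framework.

package harness // illustrative sketch of the done-channel contract, not the real test runner

import "sync"

// upgradeTest mirrors, in simplified form, the methods the alert test implements.
type upgradeTest interface {
	Setup()
	Test(done <-chan struct{})
	Teardown()
}

// runUpgrade runs every test concurrently, performs the upgrade, then closes
// done so tests that wait for upgrade completion (like the alert check) proceed.
func runUpgrade(tests []upgradeTest, doUpgrade func()) {
	done := make(chan struct{})
	var wg sync.WaitGroup
	for _, t := range tests {
		t.Setup()
		wg.Add(1)
		go func(t upgradeTest) {
			defer wg.Done()
			t.Test(done) // the alert test blocks here until done is closed
		}(t)
	}
	doUpgrade() // the actual cluster upgrade
	close(done) // signal every test that the upgrade has finished
	wg.Wait()
	for _, t := range tests {
		t.Teardown()
	}
}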
41 changes: 15 additions & 26 deletions test/extended/prometheus/prometheus.go
@@ -33,6 +33,7 @@ import (
"github.com/openshift/origin/test/extended/networking"
exutil "github.com/openshift/origin/test/extended/util"
"github.com/openshift/origin/test/extended/util/ibmcloud"
helper "github.com/openshift/origin/test/extended/util/prometheus"
)

var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
@@ -44,7 +45,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
)
g.BeforeEach(func() {
var ok bool
url, bearerToken, ok = locatePrometheus(oc)
url, bearerToken, ok = helper.LocatePrometheus(oc)
if !ok {
e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test")
}
@@ -65,7 +66,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
// Checking Watchdog alert state is done in "should have a Watchdog alert in firing state".
`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh",alertstate="firing",severity!="info"}[2h]) >= 1`: false,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
})

g.It("should have a Watchdog alert in firing state the entire cluster run", func() {
@@ -80,7 +81,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
// should have constantly firing a watchdog alert
`count_over_time(ALERTS{alertstate="firing",alertname="Watchdog", severity="none"}[1h])`: true,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)

e2e.Logf("Watchdog alert is firing")
})
@@ -102,7 +103,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
// rule contains the count of the all the series that are sent via telemetry.
`max_over_time(cluster:telemetry_selected_series:count[2h]) >= 500`: false,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)

e2e.Logf("Total number of series sent via telemetry is below the limit")
})
@@ -119,7 +120,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {

g.BeforeEach(func() {
var ok bool
url, bearerToken, ok = locatePrometheus(oc)
url, bearerToken, ok = helper.LocatePrometheus(oc)
if !ok {
e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test")
}
@@ -149,7 +150,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
// should have scraped some metrics from prometheus
`federate_samples{job="telemeter-client"} >= 10`: true,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)

e2e.Logf("Telemetry is enabled: %s", bearerToken)
})
@@ -193,7 +194,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
})).NotTo(o.HaveOccurred(), fmt.Sprintf("Did not find tsdb_samples_appended_total, tsdb_head_samples_appended_total, or prometheus_tsdb_head_samples_appended_total"))

g.By("verifying the oauth-proxy reports a 403 on the root URL")
err := expectURLStatusCodeExec(ns, execPod.Name, url, 403)
err := helper.ExpectURLStatusCodeExec(ns, execPod.Name, url, 403)
o.Expect(err).NotTo(o.HaveOccurred())

g.By("verifying a service account token is able to authenticate")
@@ -302,7 +303,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
// should have constantly firing a watchdog alert
`ALERTS{alertstate="firing",alertname="AlertmanagerReceiversNotConfigured"} == 1`: true,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)

e2e.Logf("AlertmanagerReceiversNotConfigured alert is firing")
})
@@ -329,7 +330,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
`sum(node_role_os_version_machine:cpu_capacity_cores:sum{label_kubernetes_io_arch!="",label_node_role_kubernetes_io_master!=""}) > 0`: true,
`sum(node_role_os_version_machine:cpu_capacity_sockets:sum{label_kubernetes_io_arch!="",label_node_hyperthread_enabled!="",label_node_role_kubernetes_io_master!=""}) > 0`: true,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
})
g.It("should have non-Pod host cAdvisor metrics", func() {
oc.SetupProject()
@@ -342,7 +343,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
tests := map[string]bool{
`container_cpu_usage_seconds_total{id!~"/kubepods.slice/.*"} >= 1`: true,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
})
g.It("shouldn't have failing rules evaluation", func() {
oc.SetupProject()
@@ -355,7 +356,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
tests := map[string]bool{
`prometheus_rule_evaluation_failures_total >= 1`: false,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
})
networking.InOpenShiftSDNContext(func() {
g.It("should be able to get the sdn ovs flows", func() {
@@ -370,7 +371,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
//something
`openshift_sdn_ovs_flows >= 1`: true,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
})
})
g.It("shouldn't report any alerts in firing state apart from Watchdog and AlertmanagerReceiversNotConfigured [Early]", func() {
@@ -388,7 +389,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
// Checking Watchdog alert state is done in "should have a Watchdog alert in firing state".
`ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|PrometheusRemoteWriteDesiredShards",alertstate="firing",severity!="info"} >= 1`: false,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
})
g.It("should provide ingress metrics", func() {
oc.SetupProject()
@@ -425,7 +426,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
`template_router_reload_seconds_count{job="router-internal-default"} >= 1`: true,
`haproxy_server_up{job="router-internal-default"} >= 1`: true,
}
runQueries(queries, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(queries, oc, ns, execPod.Name, url, bearerToken)
})
})
})
@@ -546,18 +547,6 @@ func findMetricLabels(f *dto.MetricFamily, labels map[string]string, match strin
return result
}

func expectURLStatusCodeExec(ns, execPodName, url string, statusCode int) error {
	cmd := fmt.Sprintf("curl -k -s -o /dev/null -w '%%{http_code}' %q", url)
	output, err := e2e.RunHostCmd(ns, execPodName, cmd)
	if err != nil {
		return fmt.Errorf("host command failed: %v\n%s", err, output)
	}
	if output != strconv.Itoa(statusCode) {
		return fmt.Errorf("last response from server was not %d: %s", statusCode, output)
	}
	return nil
}
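The deleted expectURLStatusCodeExec above is what the earlier hunk now calls as helper.ExpectURLStatusCodeExec; the new test/extended/util/prometheus helper is among the files not shown in this excerpt, so presumably the exported version mirrors the deleted code. A sketch under that assumption, relying on the same fmt, strconv, and e2e imports as the original:

// ExpectURLStatusCodeExec is assumed to be the exported equivalent of the
// deleted helper: curl the URL from the exec pod and compare the HTTP status.
func ExpectURLStatusCodeExec(ns, execPodName, url string, statusCode int) error {
	cmd := fmt.Sprintf("curl -k -s -o /dev/null -w '%%{http_code}' %q", url)
	output, err := e2e.RunHostCmd(ns, execPodName, cmd)
	if err != nil {
		return fmt.Errorf("host command failed: %v\n%s", err, output)
	}
	if output != strconv.Itoa(statusCode) {
		return fmt.Errorf("last response from server was not %d: %s", statusCode, output)
	}
	return nil
}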

func expectBearerTokenURLStatusCodeExec(ns, execPodName, url, bearer string, statusCode int) error {
	cmd := fmt.Sprintf("curl -k -s -H 'Authorization: Bearer %s' -o /dev/null -w '%%{http_code}' %q", bearer, url)
	output, err := e2e.RunHostCmd(ns, execPodName, cmd)
