Merge pull request openshift#24786 from jottofar/ota-141-add-ci-alerts
Bug 1828427: Add CI test to check for critical alerts post upgrade success
openshift-merge-robot committed Apr 27, 2020
2 parents 83fb2e9 + 3a92334 commit 3be2bc2
Showing 6 changed files with 341 additions and 118 deletions.
103 changes: 103 additions & 0 deletions test/e2e/upgrade/alert/alert.go
@@ -0,0 +1,103 @@
package alert

import (
	"context"
	"fmt"
	"time"

	g "github.com/onsi/ginkgo"

	exutil "github.com/openshift/origin/test/extended/util"
	helper "github.com/openshift/origin/test/extended/util/prometheus"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/kubernetes/test/e2e/framework"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	"k8s.io/kubernetes/test/e2e/upgrades"
)

const (
	// Delay after upgrade is complete before checking for critical alerts
	alertCheckSleepMinutes = 5
	alertCheckSleep        = alertCheckSleepMinutes * time.Minute

	// Previous period in which to check for critical alerts
	alertPeriodCheckMinutes = 1
)

// UpgradeTest runs post-upgrade after alertCheckSleep delay and tests if any critical alerts are firing.
type UpgradeTest struct {
	url         string
	bearerToken string
	oc          *exutil.CLI
}

func (UpgradeTest) Name() string { return "check-for-critical-alerts" }
func (UpgradeTest) DisplayName() string {
	return "Check if critical alerts are firing after upgrade success"
}

// Setup creates parameters to query Prometheus
func (t *UpgradeTest) Setup(f *framework.Framework) {
	g.By("Setting up post-upgrade alert test")

	url, bearerToken, oc, ok := helper.ExpectPrometheus(f)
	if !ok {
		framework.Failf("Prometheus could not be located on this cluster, failing test %s", t.Name())
	}
	t.url = url
	t.bearerToken = bearerToken
	t.oc = oc
	framework.Logf("Post-upgrade alert test setup complete")
}

// Test checks if any critical alerts are firing.
func (t *UpgradeTest) Test(f *framework.Framework, done <-chan struct{}, upgrade upgrades.UpgradeType) {
	g.By("Checking for critical alerts")

	// Recover current test if it fails so test suite can complete
	defer g.GinkgoRecover()

	// Block until upgrade is done
	g.By("Waiting for upgrade to finish before checking for critical alerts")
	<-done

	ctx, cancel := context.WithCancel(context.Background())

	// Additional delay after upgrade completion
	g.By("Waiting before checking for critical alerts")
	time.Sleep(alertCheckSleep)
	cancel()

	if helper.TestUnsupportedAllowVersionSkew() {
		e2eskipper.Skipf("Test is disabled to allow cluster components to have different versions, and skewed versions trigger multiple other alerts")
	}
	t.oc.SetupProject()
	ns := t.oc.Namespace()
	execPod := exutil.CreateCentosExecPodOrFail(t.oc.AdminKubeClient(), ns, "execpod", nil)
	defer func() {
		t.oc.AdminKubeClient().CoreV1().Pods(ns).Delete(ctx, execPod.Name, *metav1.NewDeleteOptions(1))
	}()

	// Query to check if Prometheus has been up and running for entire post-upgrade
	// period by verifying Watchdog alert has been in firing state
	watchdogQuery := fmt.Sprintf(`count_over_time(ALERTS{alertstate="firing",alertname="Watchdog", severity="none"}[%dm])`, alertCheckSleepMinutes)
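	// With alertCheckSleepMinutes = 5, the query above expands to:
	//   count_over_time(ALERTS{alertstate="firing",alertname="Watchdog", severity="none"}[5m])
	// i.e. it returns data only if the Watchdog alert has been firing during the post-upgrade wait.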

	// Query to check for any critical severity alerts that have occurred within the last alertPeriodCheckMinutes.
	// TODO Remove KubeAPIErrorBudgetBurn from ignore list once Bug 1821661 is fixed.
	criticalAlertQuery := fmt.Sprintf(`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh|KubeAPIErrorBudgetBurn",alertstate="firing",severity="critical"}[%dm]) >= 1`, alertPeriodCheckMinutes)
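	// With alertPeriodCheckMinutes = 1, this expands to:
	//   count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh|KubeAPIErrorBudgetBurn",alertstate="firing",severity="critical"}[1m]) >= 1
	// i.e. any non-ignored critical alert seen firing in the last minute causes the test to fail.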

	tests := map[string]bool{
		watchdogQuery:      true,
		criticalAlertQuery: false,
	}

	helper.RunQueries(tests, t.oc, ns, execPod.Name, t.url, t.bearerToken)

	framework.Logf("No critical alerts firing post-upgrade")
}

// Teardown cleans up any remaining resources.
func (t *UpgradeTest) Teardown(f *framework.Framework) {
	// rely on the namespace deletion to clean up everything
}
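The tests map above pairs each PromQL query with the outcome it expects: true means the query must return at least one sample (the Watchdog check), false means it must return none (the critical-alert check). The helper package that provides RunQueries is among the changed files not shown in this excerpt, so the snippet below is only a minimal sketch of that contract; runQueriesSketch and queryFunc are illustrative names and assumptions, not the real helper API.

package prometheus // sketch only, not the actual test/extended/util/prometheus helper

import "fmt"

// runQueriesSketch evaluates each query with queryFunc (assumed to return the
// number of samples the query produced) and compares the result with the
// expected outcome: true means "must return data", false means "must return none".
func runQueriesSketch(tests map[string]bool, queryFunc func(query string) (int, error)) error {
	for query, expectData := range tests {
		n, err := queryFunc(query)
		if err != nil {
			return fmt.Errorf("query %q failed: %v", query, err)
		}
		if expectData && n == 0 {
			return fmt.Errorf("expected results from query %q, got none", query)
		}
		if !expectData && n > 0 {
			return fmt.Errorf("expected no results from query %q, got %d samples", query, n)
		}
	}
	return nil
}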
2 changes: 2 additions & 0 deletions test/e2e/upgrade/upgrade.go
@@ -26,6 +26,7 @@ import (

configv1 "github.com/openshift/api/config/v1"
configv1client "github.com/openshift/client-go/config/clientset/versioned"
"github.com/openshift/origin/test/e2e/upgrade/alert"
"github.com/openshift/origin/test/e2e/upgrade/service"
"github.com/openshift/origin/test/extended/util/disruption"
"github.com/openshift/origin/test/extended/util/disruption/controlplane"
@@ -36,6 +37,7 @@ func AllTests() []upgrades.Test {
return []upgrades.Test{
&controlplane.KubeAvailableTest{},
&controlplane.OpenShiftAvailableTest{},
&alert.UpgradeTest{},
&frontends.AvailableTest{},
&service.UpgradeTest{},
&upgrades.SecretUpgradeTest{},
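The new alert.UpgradeTest entry is driven through the same done-channel protocol as the other tests in this list: the suite calls Setup before the upgrade, runs Test concurrently while the upgrade proceeds, closes done once the upgrade finishes (which is what the <-done in alert.go waits on), and then calls Teardown. The runner itself is not part of this commit, so the sketch below only illustrates that contract; the simplified interface, its method signatures, and runUpgrade are assumptions, not the real disruption framework.

package harness // illustrative sketch of the done-channel contract, not the real test runner

import "sync"

// upgradeTest mirrors, in simplified form, the methods the alert test implements.
type upgradeTest interface {
	Setup()
	Test(done <-chan struct{})
	Teardown()
}

// runUpgrade runs every test concurrently, performs the upgrade, then closes
// done so tests that wait for upgrade completion (like the alert check) proceed.
func runUpgrade(tests []upgradeTest, doUpgrade func()) {
	done := make(chan struct{})
	var wg sync.WaitGroup
	for _, t := range tests {
		t.Setup()
		wg.Add(1)
		go func(t upgradeTest) {
			defer wg.Done()
			t.Test(done) // the alert test blocks here until done is closed
		}(t)
	}
	doUpgrade() // the actual cluster upgrade
	close(done) // signal every test that the upgrade has finished
	wg.Wait()
	for _, t := range tests {
		t.Teardown()
	}
}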
41 changes: 15 additions & 26 deletions test/extended/prometheus/prometheus.go
@@ -33,6 +33,7 @@ import (
"github.com/openshift/origin/test/extended/networking"
exutil "github.com/openshift/origin/test/extended/util"
"github.com/openshift/origin/test/extended/util/ibmcloud"
helper "github.com/openshift/origin/test/extended/util/prometheus"
)

var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
@@ -44,7 +45,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
)
g.BeforeEach(func() {
var ok bool
url, bearerToken, ok = locatePrometheus(oc)
url, bearerToken, ok = helper.LocatePrometheus(oc)
if !ok {
e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test")
}
@@ -65,7 +66,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
// Checking Watchdog alert state is done in "should have a Watchdog alert in firing state".
`count_over_time(ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|KubeAPILatencyHigh",alertstate="firing",severity!="info"}[2h]) >= 1`: false,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
})

g.It("should have a Watchdog alert in firing state the entire cluster run", func() {
@@ -80,7 +81,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
// should have constantly firing a watchdog alert
`count_over_time(ALERTS{alertstate="firing",alertname="Watchdog", severity="none"}[1h])`: true,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)

e2e.Logf("Watchdog alert is firing")
})
@@ -102,7 +103,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
// rule contains the count of the all the series that are sent via telemetry.
`max_over_time(cluster:telemetry_selected_series:count[2h]) >= 500`: false,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)

e2e.Logf("Total number of series sent via telemetry is below the limit")
})
@@ -119,7 +120,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {

g.BeforeEach(func() {
var ok bool
url, bearerToken, ok = locatePrometheus(oc)
url, bearerToken, ok = helper.LocatePrometheus(oc)
if !ok {
e2e.Failf("Prometheus could not be located on this cluster, failing prometheus test")
}
@@ -149,7 +150,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
// should have scraped some metrics from prometheus
`federate_samples{job="telemeter-client"} >= 10`: true,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)

e2e.Logf("Telemetry is enabled: %s", bearerToken)
})
@@ -193,7 +194,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
})).NotTo(o.HaveOccurred(), fmt.Sprintf("Did not find tsdb_samples_appended_total, tsdb_head_samples_appended_total, or prometheus_tsdb_head_samples_appended_total"))

g.By("verifying the oauth-proxy reports a 403 on the root URL")
err := expectURLStatusCodeExec(ns, execPod.Name, url, 403)
err := helper.ExpectURLStatusCodeExec(ns, execPod.Name, url, 403)
o.Expect(err).NotTo(o.HaveOccurred())

g.By("verifying a service account token is able to authenticate")
@@ -302,7 +303,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
// should have constantly firing a watchdog alert
`ALERTS{alertstate="firing",alertname="AlertmanagerReceiversNotConfigured"} == 1`: true,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)

e2e.Logf("AlertmanagerReceiversNotConfigured alert is firing")
})
@@ -329,7 +330,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
`sum(node_role_os_version_machine:cpu_capacity_cores:sum{label_kubernetes_io_arch!="",label_node_role_kubernetes_io_master!=""}) > 0`: true,
`sum(node_role_os_version_machine:cpu_capacity_sockets:sum{label_kubernetes_io_arch!="",label_node_hyperthread_enabled!="",label_node_role_kubernetes_io_master!=""}) > 0`: true,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
})
g.It("should have non-Pod host cAdvisor metrics", func() {
oc.SetupProject()
@@ -342,7 +343,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
tests := map[string]bool{
`container_cpu_usage_seconds_total{id!~"/kubepods.slice/.*"} >= 1`: true,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
})
g.It("shouldn't have failing rules evaluation", func() {
oc.SetupProject()
@@ -355,7 +356,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
tests := map[string]bool{
`prometheus_rule_evaluation_failures_total >= 1`: false,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
})
networking.InOpenShiftSDNContext(func() {
g.It("should be able to get the sdn ovs flows", func() {
@@ -370,7 +371,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
//something
`openshift_sdn_ovs_flows >= 1`: true,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
})
})
g.It("shouldn't report any alerts in firing state apart from Watchdog and AlertmanagerReceiversNotConfigured [Early]", func() {
@@ -388,7 +389,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
// Checking Watchdog alert state is done in "should have a Watchdog alert in firing state".
`ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|PrometheusRemoteWriteDesiredShards",alertstate="firing",severity!="info"} >= 1`: false,
}
runQueries(tests, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(tests, oc, ns, execPod.Name, url, bearerToken)
})
g.It("should provide ingress metrics", func() {
oc.SetupProject()
@@ -425,7 +426,7 @@ var _ = g.Describe("[sig-instrumentation] Prometheus", func() {
`template_router_reload_seconds_count{job="router-internal-default"} >= 1`: true,
`haproxy_server_up{job="router-internal-default"} >= 1`: true,
}
runQueries(queries, oc, ns, execPod.Name, url, bearerToken)
helper.RunQueries(queries, oc, ns, execPod.Name, url, bearerToken)
})
})
})
@@ -546,18 +547,6 @@ func findMetricLabels(f *dto.MetricFamily, labels map[string]string, match strin
return result
}

func expectURLStatusCodeExec(ns, execPodName, url string, statusCode int) error {
	cmd := fmt.Sprintf("curl -k -s -o /dev/null -w '%%{http_code}' %q", url)
	output, err := e2e.RunHostCmd(ns, execPodName, cmd)
	if err != nil {
		return fmt.Errorf("host command failed: %v\n%s", err, output)
	}
	if output != strconv.Itoa(statusCode) {
		return fmt.Errorf("last response from server was not %d: %s", statusCode, output)
	}
	return nil
}
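The deleted expectURLStatusCodeExec above is what the earlier hunk now calls as helper.ExpectURLStatusCodeExec; the new test/extended/util/prometheus helper is among the files not shown in this excerpt, so presumably the exported version mirrors the deleted code. A sketch under that assumption, relying on the same fmt, strconv, and e2e imports as the original:

// ExpectURLStatusCodeExec is assumed to be the exported equivalent of the
// deleted helper: curl the URL from the exec pod and compare the HTTP status.
func ExpectURLStatusCodeExec(ns, execPodName, url string, statusCode int) error {
	cmd := fmt.Sprintf("curl -k -s -o /dev/null -w '%%{http_code}' %q", url)
	output, err := e2e.RunHostCmd(ns, execPodName, cmd)
	if err != nil {
		return fmt.Errorf("host command failed: %v\n%s", err, output)
	}
	if output != strconv.Itoa(statusCode) {
		return fmt.Errorf("last response from server was not %d: %s", statusCode, output)
	}
	return nil
}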

func expectBearerTokenURLStatusCodeExec(ns, execPodName, url, bearer string, statusCode int) error {
	cmd := fmt.Sprintf("curl -k -s -H 'Authorization: Bearer %s' -o /dev/null -w '%%{http_code}' %q", bearer, url)
	output, err := e2e.RunHostCmd(ns, execPodName, cmd)
