Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(application): add application metrics for failed #1963

Merged
merged 1 commit into from
May 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions cmd/tke-application-controller/controller-manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,13 @@ package main

import (
"math/rand"
"net/http"
"os"
"runtime"
"time"

"github.com/prometheus/client_golang/prometheus/promhttp"
"k8s.io/klog"
"tkestack.io/tke/cmd/tke-application-controller/app"
)

Expand All @@ -33,5 +36,13 @@ func main() {
runtime.GOMAXPROCS(runtime.NumCPU())
}

mux := http.NewServeMux()
mux.Handle("/metrics", promhttp.Handler())
go func() {
if err := http.ListenAndServe(":9464", mux); err != nil {
klog.Fatal(err)
}
}()

app.NewApp("tke-application-controller").Run()
}
5 changes: 5 additions & 0 deletions pkg/application/controller/app/action/install.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"tkestack.io/tke/pkg/application/util"
chartpath "tkestack.io/tke/pkg/application/util/chartpath/v1"
"tkestack.io/tke/pkg/util/log"
"tkestack.io/tke/pkg/util/metrics"
)

// Install installs a chart archive
Expand All @@ -57,6 +58,7 @@ func Install(ctx context.Context,
if updateStatusFunc != nil {
if app.Status.Phase == applicationv1.AppPhaseInstallFailed {
log.Error(fmt.Sprintf("install app failed, helm pull err: %s", err.Error()))
metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
// delayed retry, queue.AddRateLimited does not meet the demand
return app, nil
}
Expand All @@ -65,6 +67,7 @@ func Install(ctx context.Context,
newStatus.Reason = err.Error()
newStatus.LastTransitionTime = metav1.Now()
_, updateStatusErr := updateStatusFunc(ctx, app, &app.Status, newStatus)
metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
if updateStatusErr != nil {
return nil, updateStatusErr
}
Expand Down Expand Up @@ -101,13 +104,15 @@ func Install(ctx context.Context,
if err != nil {
if app.Status.Phase == applicationv1.AppPhaseInstallFailed {
log.Error(fmt.Sprintf("install app failed, helm install err: %s", err.Error()))
metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
// delayed retry, queue.AddRateLimited does not meet the demand
return app, nil
}
newStatus.Phase = applicationv1.AppPhaseInstallFailed
newStatus.Message = "install app failed"
newStatus.Reason = err.Error()
newStatus.LastTransitionTime = metav1.Now()
metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
} else {
newStatus.Phase = applicationv1.AppPhaseSucceeded
newStatus.Message = ""
Expand Down
2 changes: 2 additions & 0 deletions pkg/application/controller/app/action/rollback.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
helmaction "tkestack.io/tke/pkg/application/helm/action"
applicationprovider "tkestack.io/tke/pkg/application/provider/application"
"tkestack.io/tke/pkg/application/util"
"tkestack.io/tke/pkg/util/metrics"
)

// Rollback roll back to the previous release
Expand Down Expand Up @@ -62,6 +63,7 @@ func Rollback(ctx context.Context,
newStatus.Message = "rollback app failed"
newStatus.Reason = err.Error()
newStatus.LastTransitionTime = metav1.Now()
metrics.GaugeApplicationRollbackFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
} else {
newStatus.Phase = applicationv1.AppPhaseRolledBack
newStatus.Message = ""
Expand Down
5 changes: 5 additions & 0 deletions pkg/application/controller/app/action/upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"tkestack.io/tke/pkg/application/util"
chartpath "tkestack.io/tke/pkg/application/util/chartpath/v1"
"tkestack.io/tke/pkg/util/log"
"tkestack.io/tke/pkg/util/metrics"
)

// Upgrade upgrade a helm release
Expand All @@ -58,6 +59,7 @@ func Upgrade(ctx context.Context,
if updateStatusFunc != nil {
if app.Status.Phase == applicationv1.AppPhaseUpgradFailed {
log.Error(fmt.Sprintf("upgrade app failed, helm pull err: %s", err.Error()))
metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
// delayed retry, queue.AddRateLimited does not meet the demand
return app, nil
}
Expand All @@ -66,6 +68,7 @@ func Upgrade(ctx context.Context,
newStatus.Reason = err.Error()
newStatus.LastTransitionTime = metav1.Now()
updateStatusFunc(ctx, app, &app.Status, newStatus)
metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
}
return nil, err
}
Expand Down Expand Up @@ -102,13 +105,15 @@ func Upgrade(ctx context.Context,
if err != nil {
if app.Status.Phase == applicationv1.AppPhaseUpgradFailed {
log.Error(fmt.Sprintf("upgrade app failed, helm upgrade err: %s", err.Error()))
metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
// delayed retry, queue.AddRateLimited does not meet the demand
return app, nil
}
newStatus.Phase = applicationv1.AppPhaseUpgradFailed
newStatus.Message = "upgrade app failed"
newStatus.Reason = err.Error()
newStatus.LastTransitionTime = metav1.Now()
metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
} else {
newStatus.Phase = applicationv1.AppPhaseSucceeded
newStatus.Message = ""
Expand Down
4 changes: 4 additions & 0 deletions pkg/application/controller/app/app_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -373,26 +373,30 @@ func (c *Controller) syncAppFromRelease(ctx context.Context, cachedApp *cachedAp
if err != nil {
if app.Status.Phase == applicationv1.AppPhaseSyncFailed {
log.Error(fmt.Sprintf("sync app failed, helm list failed, err: %s", err.Error()))
metrics.GaugeApplicationSyncFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
// delayed retry, queue.AddRateLimited does not meet the demand
return app, nil
}
newStatus.Phase = applicationv1.AppPhaseSyncFailed
newStatus.Message = "sync app failed"
newStatus.Reason = err.Error()
newStatus.LastTransitionTime = metav1.Now()
metrics.GaugeApplicationSyncFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
return c.updateStatus(ctx, app, &app.Status, newStatus)
}
rel, found := helmutil.Filter(rels, app.Spec.TargetNamespace, app.Spec.Name)
if !found {
if app.Status.Phase == applicationv1.AppPhaseSyncFailed {
log.Error(fmt.Sprintf("sync app failed, release not found: %s/%s", app.Spec.TargetNamespace, app.Spec.Name))
metrics.GaugeApplicationSyncFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
// delayed retry, queue.AddRateLimited does not meet the demand
return app, nil
}
newStatus.Phase = applicationv1.AppPhaseSyncFailed
newStatus.Message = "sync app failed"
newStatus.Reason = fmt.Sprintf("release not found: %s/%s", app.Spec.TargetNamespace, app.Spec.Name)
newStatus.LastTransitionTime = metav1.Now()
metrics.GaugeApplicationSyncFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
return c.updateStatus(ctx, app, &app.Status, newStatus)
}
app.Spec.Chart.ChartVersion = rel.Chart.Metadata.Version
Expand Down
24 changes: 24 additions & 0 deletions pkg/util/metrics/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,27 @@ func RegisterMetricAndTrackRateLimiterUsage(ownerName string, rateLimiter flowco
// }, updatePeriod, rateLimiterMetrics[ownerName].stopCh)
return nil
}

// Gauge vectors reporting failed application lifecycle actions. Every vector
// carries the same two labels: "cluster" and "application".
//
// NOTE(review): the call sites visible in this change only ever call Set(1)
// on these gauges; no path that resets a series to 0 is visible here. Confirm
// whether a recovered application is expected to clear its series.
var (
	// GaugeApplicationInstallFailed is the gauge vector exported as
	// "applicationInstallFailed".
	//
	// NOTE(review): in the visible install.go hunks the install failure paths
	// set GaugeApplicationUpgradeFailed rather than this gauge — confirm
	// which one was intended.
	GaugeApplicationInstallFailed = newApplicationFailedGauge(
		"applicationInstallFailed",
		"application install failed count",
	)

	// GaugeApplicationUpgradeFailed is the gauge vector exported as
	// "applicationUpgradeFailed".
	GaugeApplicationUpgradeFailed = newApplicationFailedGauge(
		"applicationUpgradeFailed",
		"application upgrade failed count",
	)

	// GaugeApplicationRollbackFailed is the gauge vector exported as
	// "applicationRollbackFailed".
	GaugeApplicationRollbackFailed = newApplicationFailedGauge(
		"applicationRollbackFailed",
		"application rollback failed count",
	)

	// GaugeApplicationSyncFailed is the gauge vector exported as
	// "applicationSyncFailed".
	GaugeApplicationSyncFailed = newApplicationFailedGauge(
		"applicationSyncFailed",
		"application sync failed count",
	)
)

// newApplicationFailedGauge builds an unregistered gauge vector with the given
// metric name and help text, labelled by "cluster" and "application".
func newApplicationFailedGauge(name, help string) *prometheus.GaugeVec {
	opts := prometheus.GaugeOpts{
		Name: name,
		Help: help,
	}
	return prometheus.NewGaugeVec(opts, []string{"cluster", "application"})
}

// init registers the four application failure gauge vectors with the
// Prometheus default registry when the package is loaded.
func init() {
	collectors := []prometheus.Collector{
		GaugeApplicationInstallFailed,
		GaugeApplicationUpgradeFailed,
		GaugeApplicationRollbackFailed,
		GaugeApplicationSyncFailed,
	}
	// Registered one at a time, in declaration order.
	for _, c := range collectors {
		prometheus.MustRegister(c)
	}
}