Skip to content

Commit

Permalink
feat(application): add application metrics for failed (#1963)
Browse files Browse the repository at this point in the history
Co-authored-by: xdonggao <xdonggao@tencent.com>
  • Loading branch information
GaoXiaodong and xdonggao committed May 25, 2022
1 parent 08c5599 commit 97cae51
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 0 deletions.
11 changes: 11 additions & 0 deletions cmd/tke-application-controller/controller-manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,13 @@ package main

import (
"math/rand"
"net/http"
"os"
"runtime"
"time"

"github.com/prometheus/client_golang/prometheus/promhttp"
"k8s.io/klog"
"tkestack.io/tke/cmd/tke-application-controller/app"
)

Expand All @@ -33,5 +36,13 @@ func main() {
runtime.GOMAXPROCS(runtime.NumCPU())
}

mux := http.NewServeMux()
mux.Handle("/metrics", promhttp.Handler())
go func() {
if err := http.ListenAndServe(":9464", mux); err != nil {
klog.Fatal(err)
}
}()

app.NewApp("tke-application-controller").Run()
}
5 changes: 5 additions & 0 deletions pkg/application/controller/app/action/install.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"tkestack.io/tke/pkg/application/util"
chartpath "tkestack.io/tke/pkg/application/util/chartpath/v1"
"tkestack.io/tke/pkg/util/log"
"tkestack.io/tke/pkg/util/metrics"
)

// Install installs a chart archive
Expand All @@ -57,6 +58,7 @@ func Install(ctx context.Context,
if updateStatusFunc != nil {
if app.Status.Phase == applicationv1.AppPhaseInstallFailed {
log.Error(fmt.Sprintf("install app failed, helm pull err: %s", err.Error()))
metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
// delayed retry, queue.AddRateLimited does not meet the demand
return app, nil
}
Expand All @@ -65,6 +67,7 @@ func Install(ctx context.Context,
newStatus.Reason = err.Error()
newStatus.LastTransitionTime = metav1.Now()
_, updateStatusErr := updateStatusFunc(ctx, app, &app.Status, newStatus)
metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
if updateStatusErr != nil {
return nil, updateStatusErr
}
Expand Down Expand Up @@ -101,13 +104,15 @@ func Install(ctx context.Context,
if err != nil {
if app.Status.Phase == applicationv1.AppPhaseInstallFailed {
log.Error(fmt.Sprintf("install app failed, helm install err: %s", err.Error()))
metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
// delayed retry, queue.AddRateLimited does not meet the demand
return app, nil
}
newStatus.Phase = applicationv1.AppPhaseInstallFailed
newStatus.Message = "install app failed"
newStatus.Reason = err.Error()
newStatus.LastTransitionTime = metav1.Now()
metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
} else {
newStatus.Phase = applicationv1.AppPhaseSucceeded
newStatus.Message = ""
Expand Down
2 changes: 2 additions & 0 deletions pkg/application/controller/app/action/rollback.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
helmaction "tkestack.io/tke/pkg/application/helm/action"
applicationprovider "tkestack.io/tke/pkg/application/provider/application"
"tkestack.io/tke/pkg/application/util"
"tkestack.io/tke/pkg/util/metrics"
)

// Rollback roll back to the previous release
Expand Down Expand Up @@ -62,6 +63,7 @@ func Rollback(ctx context.Context,
newStatus.Message = "rollback app failed"
newStatus.Reason = err.Error()
newStatus.LastTransitionTime = metav1.Now()
metrics.GaugeApplicationRollbackFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
} else {
newStatus.Phase = applicationv1.AppPhaseRolledBack
newStatus.Message = ""
Expand Down
5 changes: 5 additions & 0 deletions pkg/application/controller/app/action/upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"tkestack.io/tke/pkg/application/util"
chartpath "tkestack.io/tke/pkg/application/util/chartpath/v1"
"tkestack.io/tke/pkg/util/log"
"tkestack.io/tke/pkg/util/metrics"
)

// Upgrade upgrade a helm release
Expand All @@ -58,6 +59,7 @@ func Upgrade(ctx context.Context,
if updateStatusFunc != nil {
if app.Status.Phase == applicationv1.AppPhaseUpgradFailed {
log.Error(fmt.Sprintf("upgrade app failed, helm pull err: %s", err.Error()))
metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
// delayed retry, queue.AddRateLimited does not meet the demand
return app, nil
}
Expand All @@ -66,6 +68,7 @@ func Upgrade(ctx context.Context,
newStatus.Reason = err.Error()
newStatus.LastTransitionTime = metav1.Now()
updateStatusFunc(ctx, app, &app.Status, newStatus)
metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
}
return nil, err
}
Expand Down Expand Up @@ -102,13 +105,15 @@ func Upgrade(ctx context.Context,
if err != nil {
if app.Status.Phase == applicationv1.AppPhaseUpgradFailed {
log.Error(fmt.Sprintf("upgrade app failed, helm upgrade err: %s", err.Error()))
metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
// delayed retry, queue.AddRateLimited does not meet the demand
return app, nil
}
newStatus.Phase = applicationv1.AppPhaseUpgradFailed
newStatus.Message = "upgrade app failed"
newStatus.Reason = err.Error()
newStatus.LastTransitionTime = metav1.Now()
metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
} else {
newStatus.Phase = applicationv1.AppPhaseSucceeded
newStatus.Message = ""
Expand Down
4 changes: 4 additions & 0 deletions pkg/application/controller/app/app_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -373,26 +373,30 @@ func (c *Controller) syncAppFromRelease(ctx context.Context, cachedApp *cachedAp
if err != nil {
if app.Status.Phase == applicationv1.AppPhaseSyncFailed {
log.Error(fmt.Sprintf("sync app failed, helm list failed, err: %s", err.Error()))
metrics.GaugeApplicationSyncFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
// delayed retry, queue.AddRateLimited does not meet the demand
return app, nil
}
newStatus.Phase = applicationv1.AppPhaseSyncFailed
newStatus.Message = "sync app failed"
newStatus.Reason = err.Error()
newStatus.LastTransitionTime = metav1.Now()
metrics.GaugeApplicationSyncFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
return c.updateStatus(ctx, app, &app.Status, newStatus)
}
rel, found := helmutil.Filter(rels, app.Spec.TargetNamespace, app.Spec.Name)
if !found {
if app.Status.Phase == applicationv1.AppPhaseSyncFailed {
log.Error(fmt.Sprintf("sync app failed, release not found: %s/%s", app.Spec.TargetNamespace, app.Spec.Name))
metrics.GaugeApplicationSyncFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
// delayed retry, queue.AddRateLimited does not meet the demand
return app, nil
}
newStatus.Phase = applicationv1.AppPhaseSyncFailed
newStatus.Message = "sync app failed"
newStatus.Reason = fmt.Sprintf("release not found: %s/%s", app.Spec.TargetNamespace, app.Spec.Name)
newStatus.LastTransitionTime = metav1.Now()
metrics.GaugeApplicationSyncFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1)
return c.updateStatus(ctx, app, &app.Status, newStatus)
}
app.Spec.Chart.ChartVersion = rel.Chart.Metadata.Version
Expand Down
24 changes: 24 additions & 0 deletions pkg/util/metrics/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,27 @@ func RegisterMetricAndTrackRateLimiterUsage(ownerName string, rateLimiter flowco
// }, updatePeriod, rateLimiterMetrics[ownerName].stopCh)
return nil
}

// newApplicationFailureGauge builds a gauge vector with the given metric name
// and help text, labelled by "cluster" and "application".
func newApplicationFailureGauge(name, help string) *prometheus.GaugeVec {
	opts := prometheus.GaugeOpts{
		Name: name,
		Help: help,
	}
	return prometheus.NewGaugeVec(opts, []string{"cluster", "application"})
}

// Gauges describing failed application lifecycle actions. Each one carries a
// "cluster" and an "application" label.
//
// NOTE(review): callers appear to set these to 1 on failure; whether anything
// resets them after a later success is not visible from here — confirm.
var (
	// GaugeApplicationInstallFailed is exported under the metric name
	// "applicationInstallFailed".
	//
	// NOTE(review): the Install action appears to set
	// GaugeApplicationUpgradeFailed rather than this gauge on failure —
	// confirm the call sites.
	GaugeApplicationInstallFailed = newApplicationFailureGauge(
		"applicationInstallFailed",
		"application install failed count",
	)

	// GaugeApplicationUpgradeFailed is exported under the metric name
	// "applicationUpgradeFailed".
	GaugeApplicationUpgradeFailed = newApplicationFailureGauge(
		"applicationUpgradeFailed",
		"application upgrade failed count",
	)

	// GaugeApplicationRollbackFailed is exported under the metric name
	// "applicationRollbackFailed".
	GaugeApplicationRollbackFailed = newApplicationFailureGauge(
		"applicationRollbackFailed",
		"application rollback failed count",
	)

	// GaugeApplicationSyncFailed is exported under the metric name
	// "applicationSyncFailed".
	GaugeApplicationSyncFailed = newApplicationFailureGauge(
		"applicationSyncFailed",
		"application sync failed count",
	)
)

// init registers the four application failure gauges with Prometheus's
// default registry. MustRegister panics if a registration fails.
func init() {
	collectors := []prometheus.Collector{
		GaugeApplicationInstallFailed,
		GaugeApplicationUpgradeFailed,
		GaugeApplicationRollbackFailed,
		GaugeApplicationSyncFailed,
	}
	// Register in the same order as the original variadic call.
	for _, collector := range collectors {
		prometheus.MustRegister(collector)
	}
}

0 comments on commit 97cae51

Please sign in to comment.