diff --git a/cmd/tke-application-controller/controller-manager.go b/cmd/tke-application-controller/controller-manager.go index a27646749..bfa664439 100644 --- a/cmd/tke-application-controller/controller-manager.go +++ b/cmd/tke-application-controller/controller-manager.go @@ -20,10 +20,13 @@ package main import ( "math/rand" + "net/http" "os" "runtime" "time" + "github.com/prometheus/client_golang/prometheus/promhttp" + "k8s.io/klog" "tkestack.io/tke/cmd/tke-application-controller/app" ) @@ -33,5 +36,13 @@ func main() { runtime.GOMAXPROCS(runtime.NumCPU()) } + mux := http.NewServeMux() + mux.Handle("/metrics", promhttp.Handler()) + go func() { + if err := http.ListenAndServe(":9464", mux); err != nil { + klog.Fatal(err) + } + }() + app.NewApp("tke-application-controller").Run() } diff --git a/pkg/application/controller/app/action/install.go b/pkg/application/controller/app/action/install.go index f8aefc0e0..0ab47283b 100644 --- a/pkg/application/controller/app/action/install.go +++ b/pkg/application/controller/app/action/install.go @@ -33,6 +33,7 @@ import ( "tkestack.io/tke/pkg/application/util" chartpath "tkestack.io/tke/pkg/application/util/chartpath/v1" "tkestack.io/tke/pkg/util/log" + "tkestack.io/tke/pkg/util/metrics" ) // Install installs a chart archive @@ -57,6 +58,7 @@ func Install(ctx context.Context, if updateStatusFunc != nil { if app.Status.Phase == applicationv1.AppPhaseInstallFailed { log.Error(fmt.Sprintf("install app failed, helm pull err: %s", err.Error())) + metrics.GaugeApplicationInstallFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1) // delayed retry, queue.AddRateLimited does not meet the demand return app, nil } @@ -65,6 +67,7 @@ func Install(ctx context.Context, newStatus.Reason = err.Error() newStatus.LastTransitionTime = metav1.Now() _, updateStatusErr := updateStatusFunc(ctx, app, &app.Status, newStatus) + metrics.GaugeApplicationInstallFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1) if updateStatusErr != 
nil { return nil, updateStatusErr } @@ -101,6 +104,7 @@ func Install(ctx context.Context, if err != nil { if app.Status.Phase == applicationv1.AppPhaseInstallFailed { log.Error(fmt.Sprintf("install app failed, helm install err: %s", err.Error())) + metrics.GaugeApplicationInstallFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1) // delayed retry, queue.AddRateLimited does not meet the demand return app, nil } @@ -108,6 +112,7 @@ func Install(ctx context.Context, newStatus.Message = "install app failed" newStatus.Reason = err.Error() newStatus.LastTransitionTime = metav1.Now() + metrics.GaugeApplicationInstallFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1) } else { newStatus.Phase = applicationv1.AppPhaseSucceeded newStatus.Message = "" diff --git a/pkg/application/controller/app/action/rollback.go b/pkg/application/controller/app/action/rollback.go index 7a6597a25..87c66c98f 100644 --- a/pkg/application/controller/app/action/rollback.go +++ b/pkg/application/controller/app/action/rollback.go @@ -29,6 +29,7 @@ import ( helmaction "tkestack.io/tke/pkg/application/helm/action" applicationprovider "tkestack.io/tke/pkg/application/provider/application" "tkestack.io/tke/pkg/application/util" + "tkestack.io/tke/pkg/util/metrics" ) // Rollback roll back to the previous release @@ -62,6 +63,7 @@ func Rollback(ctx context.Context, newStatus.Message = "rollback app failed" newStatus.Reason = err.Error() newStatus.LastTransitionTime = metav1.Now() + metrics.GaugeApplicationRollbackFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1) } else { newStatus.Phase = applicationv1.AppPhaseRolledBack newStatus.Message = "" diff --git a/pkg/application/controller/app/action/upgrade.go b/pkg/application/controller/app/action/upgrade.go index e364c730c..bd139e204 100644 --- a/pkg/application/controller/app/action/upgrade.go +++ b/pkg/application/controller/app/action/upgrade.go @@ -33,6 +33,7 @@ import ( "tkestack.io/tke/pkg/application/util" 
chartpath "tkestack.io/tke/pkg/application/util/chartpath/v1" "tkestack.io/tke/pkg/util/log" + "tkestack.io/tke/pkg/util/metrics" ) // Upgrade upgrade a helm release @@ -58,6 +59,7 @@ func Upgrade(ctx context.Context, if updateStatusFunc != nil { if app.Status.Phase == applicationv1.AppPhaseUpgradFailed { log.Error(fmt.Sprintf("upgrade app failed, helm pull err: %s", err.Error())) + metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1) // delayed retry, queue.AddRateLimited does not meet the demand return app, nil } @@ -66,6 +68,7 @@ func Upgrade(ctx context.Context, newStatus.Reason = err.Error() newStatus.LastTransitionTime = metav1.Now() updateStatusFunc(ctx, app, &app.Status, newStatus) + metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1) } return nil, err } @@ -102,6 +105,7 @@ func Upgrade(ctx context.Context, if err != nil { if app.Status.Phase == applicationv1.AppPhaseUpgradFailed { log.Error(fmt.Sprintf("upgrade app failed, helm upgrade err: %s", err.Error())) + metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1) // delayed retry, queue.AddRateLimited does not meet the demand return app, nil } @@ -109,6 +113,7 @@ func Upgrade(ctx context.Context, newStatus.Message = "upgrade app failed" newStatus.Reason = err.Error() newStatus.LastTransitionTime = metav1.Now() + metrics.GaugeApplicationUpgradeFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1) } else { newStatus.Phase = applicationv1.AppPhaseSucceeded newStatus.Message = "" diff --git a/pkg/application/controller/app/app_controller.go b/pkg/application/controller/app/app_controller.go index 696796784..41c960303 100644 --- a/pkg/application/controller/app/app_controller.go +++ b/pkg/application/controller/app/app_controller.go @@ -373,6 +373,7 @@ func (c *Controller) syncAppFromRelease(ctx context.Context, cachedApp *cachedAp if err != nil { if app.Status.Phase == 
applicationv1.AppPhaseSyncFailed { log.Error(fmt.Sprintf("sync app failed, helm list failed, err: %s", err.Error())) + metrics.GaugeApplicationSyncFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1) // delayed retry, queue.AddRateLimited does not meet the demand return app, nil } @@ -380,12 +381,14 @@ func (c *Controller) syncAppFromRelease(ctx context.Context, cachedApp *cachedAp newStatus.Message = "sync app failed" newStatus.Reason = err.Error() newStatus.LastTransitionTime = metav1.Now() + metrics.GaugeApplicationSyncFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1) return c.updateStatus(ctx, app, &app.Status, newStatus) } rel, found := helmutil.Filter(rels, app.Spec.TargetNamespace, app.Spec.Name) if !found { if app.Status.Phase == applicationv1.AppPhaseSyncFailed { log.Error(fmt.Sprintf("sync app failed, release not found: %s/%s", app.Spec.TargetNamespace, app.Spec.Name)) + metrics.GaugeApplicationSyncFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1) // delayed retry, queue.AddRateLimited does not meet the demand return app, nil } @@ -393,6 +396,7 @@ func (c *Controller) syncAppFromRelease(ctx context.Context, cachedApp *cachedAp newStatus.Message = "sync app failed" newStatus.Reason = fmt.Sprintf("release not found: %s/%s", app.Spec.TargetNamespace, app.Spec.Name) newStatus.LastTransitionTime = metav1.Now() + metrics.GaugeApplicationSyncFailed.WithLabelValues(app.Spec.TargetCluster, app.Name).Set(1) return c.updateStatus(ctx, app, &app.Status, newStatus) } app.Spec.Chart.ChartVersion = rel.Chart.Metadata.Version diff --git a/pkg/util/metrics/util.go b/pkg/util/metrics/util.go index f4295207f..f1dfff25a 100644 --- a/pkg/util/metrics/util.go +++ b/pkg/util/metrics/util.go @@ -79,3 +79,27 @@ func RegisterMetricAndTrackRateLimiterUsage(ownerName string, rateLimiter flowco // }, updatePeriod, rateLimiterMetrics[ownerName].stopCh) return nil } + +var ( + GaugeApplicationInstallFailed = 
prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "applicationInstallFailed", + Help: "application install failed count", + }, []string{"cluster", "application"}) + GaugeApplicationUpgradeFailed = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "applicationUpgradeFailed", + Help: "application upgrade failed count", + }, []string{"cluster", "application"}) + GaugeApplicationRollbackFailed = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "applicationRollbackFailed", + Help: "application rollback failed count", + }, []string{"cluster", "application"}) + GaugeApplicationSyncFailed = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "applicationSyncFailed", + Help: "application sync failed count", + }, []string{"cluster", "application"}) +) + +func init() { + // Register the application failure gauge vectors with Prometheus's default registry. + prometheus.MustRegister(GaugeApplicationInstallFailed, GaugeApplicationUpgradeFailed, GaugeApplicationRollbackFailed, GaugeApplicationSyncFailed) +}