From 2b4ea922c2529c24c61a51475bcb49f5cf973dde Mon Sep 17 00:00:00 2001
From: Guy Baron
Date: Fri, 9 Aug 2019 10:38:01 +0300
Subject: [PATCH] added metric report on saga timeout (#114)

1) added reporting saga timeouts to the glue component
2) fixed mysql timeoutmanager error when trying to clear a timeout
---
NOTE(review): blank context lines inside the glue.go, timeout.go and
saga_test.go hunks are placed by inference from the hunk line counts;
confirm them against the target tree before applying.
NOTE(review): relative to commit 2b4ea92 this patch adds doc comments in
saga_metrics.go, spells the metric name "timedout_sagas" (was
"timeedout_sagas") and includes the error in the TestSagaTimeout failure
message, so the post-image blob ids on the "index" lines no longer match.

 gbus/metrics/saga_metrics.go                  | 32 +++++++++++++++++++
 gbus/saga/glue.go                             |  4 +++
 gbus/tx/mysql/timeout.go                      |  2 +-
 ...andler_metrics_test.go => metrics_test.go} |  0
 tests/saga_test.go                            |  5 +++
 5 files changed, 42 insertions(+), 1 deletion(-)
 create mode 100644 gbus/metrics/saga_metrics.go
 rename tests/{handler_metrics_test.go => metrics_test.go} (100%)

diff --git a/gbus/metrics/saga_metrics.go b/gbus/metrics/saga_metrics.go
new file mode 100644
index 0000000..d3087c8
--- /dev/null
+++ b/gbus/metrics/saga_metrics.go
@@ -0,0 +1,32 @@
+package metrics
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promauto"
+	io_prometheus_client "github.com/prometheus/client_model/go"
+)
+
+//SagaTimeoutCounter counts the saga instances that have timed out
+var SagaTimeoutCounter = newSagaTimeoutCounter()
+
+//GetSagaTimeoutCounterValue returns the current value of SagaTimeoutCounter
+func GetSagaTimeoutCounterValue() (float64, error) {
+	m := &io_prometheus_client.Metric{}
+	err := SagaTimeoutCounter.Write(m)
+
+	if err != nil {
+		return 0, err
+	}
+
+	return m.GetCounter().GetValue(), nil
+}
+
+//newSagaTimeoutCounter builds the saga timeout counter via promauto
+func newSagaTimeoutCounter() prometheus.Counter {
+	return promauto.NewCounter(prometheus.CounterOpts{
+		Namespace: grabbitPrefix,
+		Subsystem: "saga",
+		Name:      "timedout_sagas",
+		Help:      "counting the number of timedout saga instances",
+	})
+}
diff --git a/gbus/saga/glue.go b/gbus/saga/glue.go
index 9e6cc48..d6e71e2 100644
--- a/gbus/saga/glue.go
+++ b/gbus/saga/glue.go
@@ -10,6 +10,7 @@ import (
 
 	"github.com/sirupsen/logrus"
 	"github.com/wework/grabbit/gbus"
+	"github.com/wework/grabbit/gbus/metrics"
 )
 
 func fqnsFromMessages(objs []gbus.Message) []string {
@@ -247,6 +248,7 @@ func (imsm *Glue) registerEvent(exchange, topic string, event gbus.Message) erro
 func (imsm *Glue) TimeoutSaga(tx *sql.Tx, sagaID string) error {
 
 	saga, err := imsm.sagaStore.GetSagaByID(tx, sagaID)
+
 	//we are assuming that if the TimeoutSaga has been called but no instance returned from the store the saga
 	//has been completed already and
 	if err == ErrInstanceNotFound {
@@ -260,6 +262,8 @@ func (imsm *Glue) TimeoutSaga(tx *sql.Tx, sagaID string) error {
 		imsm.Log().WithError(timeoutErr).WithField("sagaID", sagaID).Error("failed to timeout saga")
 		return timeoutErr
 	}
+
+	metrics.SagaTimeoutCounter.Inc()
 	return imsm.completeOrUpdateSaga(tx, saga)
 }
 
diff --git a/gbus/tx/mysql/timeout.go b/gbus/tx/mysql/timeout.go
index 56ab6da..ae8ca33 100644
--- a/gbus/tx/mysql/timeout.go
+++ b/gbus/tx/mysql/timeout.go
@@ -186,7 +186,7 @@ func (tm *TimeoutManager) RegisterTimeout(tx *sql.Tx, sagaID string, duration ti
 //ClearTimeout clears a timeout for a specific saga
 func (tm *TimeoutManager) ClearTimeout(tx *sql.Tx, sagaID string) error {
 
-	deleteSQL := `delete from ` + tm.timeoutsTableName + ` where saga_id_id = ?`
+	deleteSQL := `delete from ` + tm.timeoutsTableName + ` where saga_id = ?`
 	_, err := tx.Exec(deleteSQL, sagaID)
 	return err
 }
diff --git a/tests/handler_metrics_test.go b/tests/metrics_test.go
similarity index 100%
rename from tests/handler_metrics_test.go
rename to tests/metrics_test.go
diff --git a/tests/saga_test.go b/tests/saga_test.go
index c4064a6..d9b380d 100644
--- a/tests/saga_test.go
+++ b/tests/saga_test.go
@@ -9,6 +10,7 @@ import (
 	"time"
 
 	"github.com/wework/grabbit/gbus"
+	"github.com/wework/grabbit/gbus/metrics"
 )
 
 /*
@@ -225,6 +226,10 @@ func TestSagaTimeout(t *testing.T) {
 	}
 
 	<-proceed
+	timeoutCounter, e := metrics.GetSagaTimeoutCounterValue()
+	if timeoutCounter != 1 || e != nil {
+		t.Errorf("saga timeout counter expected to be 1 actual %v (error: %v)", timeoutCounter, e)
+	}
 }
 
 func TestSagaSelfMessaging(t *testing.T) {