diff --git a/common/metrics/defs.go b/common/metrics/defs.go index 92e51c990a1..87af9b59a1a 100644 --- a/common/metrics/defs.go +++ b/common/metrics/defs.go @@ -64,6 +64,10 @@ func NewDimensionlessHistogramDef(name string) metricDefinition { return metricDefinition{name: name, unit: Dimensionless} } +func NewTimeHistogramDef(name string) metricDefinition { + return metricDefinition{name: name, unit: Milliseconds} +} + func NewCounterDef(name string) metricDefinition { return metricDefinition{name: name} } diff --git a/common/metrics/metric_defs.go b/common/metrics/metric_defs.go index 761c847a7f6..430f1a69f0b 100644 --- a/common/metrics/metric_defs.go +++ b/common/metrics/metric_defs.go @@ -1379,90 +1379,96 @@ var ( // to standard dispatch. // Timeouts and failures are not counted in this metric. // This metric has a "reason" tag attached to it to understand why eager start was denied. - WorkflowEagerExecutionDeniedCounter = NewCounterDef("workflow_eager_execution_denied") - EmptyCompletionCommandsCounter = NewCounterDef("empty_completion_commands") - MultipleCompletionCommandsCounter = NewCounterDef("multiple_completion_commands") - FailedWorkflowTasksCounter = NewCounterDef("failed_workflow_tasks") - WorkflowTaskAttempt = NewDimensionlessHistogramDef("workflow_task_attempt") - StaleMutableStateCounter = NewCounterDef("stale_mutable_state") - AutoResetPointsLimitExceededCounter = NewCounterDef("auto_reset_points_exceed_limit") - AutoResetPointCorruptionCounter = NewCounterDef("auto_reset_point_corruption") - ConcurrencyUpdateFailureCounter = NewCounterDef("concurrency_update_failure") - ServiceErrShardOwnershipLostCounter = NewCounterDef("service_errors_shard_ownership_lost") - ServiceErrTaskAlreadyStartedCounter = NewCounterDef("service_errors_task_already_started") - HeartbeatTimeoutCounter = NewCounterDef("heartbeat_timeout") - ScheduleToStartTimeoutCounter = NewCounterDef("schedule_to_start_timeout") - StartToCloseTimeoutCounter = NewCounterDef("start_to_close_timeout") - ScheduleToCloseTimeoutCounter = NewCounterDef("schedule_to_close_timeout") - NewTimerNotifyCounter = NewCounterDef("new_timer_notifications") - AcquireShardsCounter = NewCounterDef("acquire_shards_count") - AcquireShardsLatency = NewTimerDef("acquire_shards_latency") - MembershipChangedCounter = NewCounterDef("membership_changed_count") - NumShardsGauge = NewGaugeDef("numshards_gauge") - GetEngineForShardErrorCounter = NewCounterDef("get_engine_for_shard_errors") - GetEngineForShardLatency = NewTimerDef("get_engine_for_shard_latency") - RemoveEngineForShardLatency = NewTimerDef("remove_engine_for_shard_latency") - CompleteWorkflowTaskWithStickyEnabledCounter = NewCounterDef("complete_workflow_task_sticky_enabled_count") - CompleteWorkflowTaskWithStickyDisabledCounter = NewCounterDef("complete_workflow_task_sticky_disabled_count") - WorkflowTaskHeartbeatTimeoutCounter = NewCounterDef("workflow_task_heartbeat_timeout_count") - EmptyReplicationEventsCounter = NewCounterDef("empty_replication_events") - DuplicateReplicationEventsCounter = NewCounterDef("duplicate_replication_events") - StaleReplicationEventsCounter = NewCounterDef("stale_replication_events") - ReplicationEventsSizeTimer = NewTimerDef("replication_events_size") - BufferReplicationTaskTimer = NewTimerDef("buffer_replication_tasks") - UnbufferReplicationTaskTimer = NewTimerDef("unbuffer_replication_tasks") - HistoryConflictsCounter = NewCounterDef("history_conflicts") - CompleteTaskFailedCounter = NewCounterDef("complete_task_fail_count") - AcquireLockFailedCounter = NewCounterDef("acquire_lock_failed") - WorkflowContextCleared = NewCounterDef("workflow_context_cleared") - MutableStateSize = NewBytesHistogramDef("mutable_state_size") - ExecutionInfoSize = NewBytesHistogramDef("execution_info_size") - ExecutionStateSize = NewBytesHistogramDef("execution_state_size") - ActivityInfoSize = NewBytesHistogramDef("activity_info_size") - TimerInfoSize = NewBytesHistogramDef("timer_info_size") - ChildInfoSize = NewBytesHistogramDef("child_info_size") - RequestCancelInfoSize = NewBytesHistogramDef("request_cancel_info_size") - SignalInfoSize = NewBytesHistogramDef("signal_info_size") - SignalRequestIDSize = NewBytesHistogramDef("signal_request_id_size") - BufferedEventsSize = NewBytesHistogramDef("buffered_events_size") - ActivityInfoCount = NewDimensionlessHistogramDef("activity_info_count") - TimerInfoCount = NewDimensionlessHistogramDef("timer_info_count") - ChildInfoCount = NewDimensionlessHistogramDef("child_info_count") - SignalInfoCount = NewDimensionlessHistogramDef("signal_info_count") - RequestCancelInfoCount = NewDimensionlessHistogramDef("request_cancel_info_count") - SignalRequestIDCount = NewDimensionlessHistogramDef("signal_request_id_count") - BufferedEventsCount = NewDimensionlessHistogramDef("buffered_events_count") - TaskCount = NewDimensionlessHistogramDef("task_count") - TotalActivityCount = NewDimensionlessHistogramDef("total_activity_count") - TotalUserTimerCount = NewDimensionlessHistogramDef("total_user_timer_count") - TotalChildExecutionCount = NewDimensionlessHistogramDef("total_child_execution_count") - TotalRequestCancelExternalCount = NewDimensionlessHistogramDef("total_request_cancel_external_count") - TotalSignalExternalCount = NewDimensionlessHistogramDef("total_signal_external_count") - TotalSignalCount = NewDimensionlessHistogramDef("total_signal_count") - WorkflowRetryBackoffTimerCount = NewCounterDef("workflow_retry_backoff_timer") - WorkflowCronBackoffTimerCount = NewCounterDef("workflow_cron_backoff_timer") - WorkflowDelayedStartBackoffTimerCount = NewCounterDef("workflow_delayed_start_backoff_timer") - WorkflowCleanupDeleteCount = NewCounterDef("workflow_cleanup_delete") - WorkflowCleanupArchiveCount = NewCounterDef("workflow_cleanup_archive") - WorkflowCleanupNopCount = NewCounterDef("workflow_cleanup_nop") - WorkflowCleanupDeleteHistoryInlineCount = NewCounterDef("workflow_cleanup_delete_history_inline") - WorkflowSuccessCount = NewCounterDef("workflow_success") - WorkflowCancelCount = NewCounterDef("workflow_cancel") - WorkflowFailedCount = NewCounterDef("workflow_failed") - WorkflowTimeoutCount = NewCounterDef("workflow_timeout") - WorkflowTerminateCount = NewCounterDef("workflow_terminate") - WorkflowContinuedAsNewCount = NewCounterDef("workflow_continued_as_new") - LastRetrievedMessageID = NewGaugeDef("last_retrieved_message_id") - LastProcessedMessageID = NewGaugeDef("last_processed_message_id") - ReplicationTasksSend = NewCounterDef("replication_tasks_send") - ReplicationTasksRecv = NewCounterDef("replication_tasks_recv") - ReplicationTasksRecvBacklog = NewDimensionlessHistogramDef("replication_tasks_recv_backlog") - ReplicationTasksApplied = NewCounterDef("replication_tasks_applied") - ReplicationTasksFailed = NewCounterDef("replication_tasks_failed") - ReplicationTasksLag = NewTimerDef("replication_tasks_lag") + WorkflowEagerExecutionDeniedCounter = NewCounterDef("workflow_eager_execution_denied") + EmptyCompletionCommandsCounter = NewCounterDef("empty_completion_commands") + MultipleCompletionCommandsCounter = NewCounterDef("multiple_completion_commands") + FailedWorkflowTasksCounter = NewCounterDef("failed_workflow_tasks") + WorkflowTaskAttempt = NewDimensionlessHistogramDef("workflow_task_attempt") + StaleMutableStateCounter = NewCounterDef("stale_mutable_state") + AutoResetPointsLimitExceededCounter = NewCounterDef("auto_reset_points_exceed_limit") + AutoResetPointCorruptionCounter = NewCounterDef("auto_reset_point_corruption") + ConcurrencyUpdateFailureCounter = NewCounterDef("concurrency_update_failure") + ServiceErrShardOwnershipLostCounter = NewCounterDef("service_errors_shard_ownership_lost") + ServiceErrTaskAlreadyStartedCounter = NewCounterDef("service_errors_task_already_started") + HeartbeatTimeoutCounter = NewCounterDef("heartbeat_timeout") + ScheduleToStartTimeoutCounter = NewCounterDef("schedule_to_start_timeout") + StartToCloseTimeoutCounter = NewCounterDef("start_to_close_timeout") + ScheduleToCloseTimeoutCounter = NewCounterDef("schedule_to_close_timeout") + NewTimerNotifyCounter = NewCounterDef("new_timer_notifications") + AcquireShardsCounter = NewCounterDef("acquire_shards_count") + AcquireShardsLatency = NewTimerDef("acquire_shards_latency") + MembershipChangedCounter = NewCounterDef("membership_changed_count") + NumShardsGauge = NewGaugeDef("numshards_gauge") + GetEngineForShardErrorCounter = NewCounterDef("get_engine_for_shard_errors") + GetEngineForShardLatency = NewTimerDef("get_engine_for_shard_latency") + RemoveEngineForShardLatency = NewTimerDef("remove_engine_for_shard_latency") + CompleteWorkflowTaskWithStickyEnabledCounter = NewCounterDef("complete_workflow_task_sticky_enabled_count") + CompleteWorkflowTaskWithStickyDisabledCounter = NewCounterDef("complete_workflow_task_sticky_disabled_count") + WorkflowTaskHeartbeatTimeoutCounter = NewCounterDef("workflow_task_heartbeat_timeout_count") + EmptyReplicationEventsCounter = NewCounterDef("empty_replication_events") + DuplicateReplicationEventsCounter = NewCounterDef("duplicate_replication_events") + StaleReplicationEventsCounter = NewCounterDef("stale_replication_events") + ReplicationEventsSizeTimer = NewTimerDef("replication_events_size") + BufferReplicationTaskTimer = NewTimerDef("buffer_replication_tasks") + UnbufferReplicationTaskTimer = NewTimerDef("unbuffer_replication_tasks") + HistoryConflictsCounter = NewCounterDef("history_conflicts") + CompleteTaskFailedCounter = NewCounterDef("complete_task_fail_count") + AcquireLockFailedCounter = NewCounterDef("acquire_lock_failed") + WorkflowContextCleared = NewCounterDef("workflow_context_cleared") + MutableStateSize = NewBytesHistogramDef("mutable_state_size") + ExecutionInfoSize = NewBytesHistogramDef("execution_info_size") + ExecutionStateSize = NewBytesHistogramDef("execution_state_size") + ActivityInfoSize = NewBytesHistogramDef("activity_info_size") + TimerInfoSize = NewBytesHistogramDef("timer_info_size") + ChildInfoSize = NewBytesHistogramDef("child_info_size") + RequestCancelInfoSize = NewBytesHistogramDef("request_cancel_info_size") + SignalInfoSize = NewBytesHistogramDef("signal_info_size") + SignalRequestIDSize = NewBytesHistogramDef("signal_request_id_size") + BufferedEventsSize = NewBytesHistogramDef("buffered_events_size") + ActivityInfoCount = NewDimensionlessHistogramDef("activity_info_count") + TimerInfoCount = NewDimensionlessHistogramDef("timer_info_count") + ChildInfoCount = NewDimensionlessHistogramDef("child_info_count") + SignalInfoCount = NewDimensionlessHistogramDef("signal_info_count") + RequestCancelInfoCount = NewDimensionlessHistogramDef("request_cancel_info_count") + SignalRequestIDCount = NewDimensionlessHistogramDef("signal_request_id_count") + BufferedEventsCount = NewDimensionlessHistogramDef("buffered_events_count") + TaskCount = NewDimensionlessHistogramDef("task_count") + TotalActivityCount = NewDimensionlessHistogramDef("total_activity_count") + TotalUserTimerCount = NewDimensionlessHistogramDef("total_user_timer_count") + TotalChildExecutionCount = NewDimensionlessHistogramDef("total_child_execution_count") + TotalRequestCancelExternalCount = NewDimensionlessHistogramDef("total_request_cancel_external_count") + TotalSignalExternalCount = NewDimensionlessHistogramDef("total_signal_external_count") + TotalSignalCount = NewDimensionlessHistogramDef("total_signal_count") + WorkflowRetryBackoffTimerCount = NewCounterDef("workflow_retry_backoff_timer") + WorkflowCronBackoffTimerCount = NewCounterDef("workflow_cron_backoff_timer") + WorkflowDelayedStartBackoffTimerCount = NewCounterDef("workflow_delayed_start_backoff_timer") + WorkflowCleanupDeleteCount = NewCounterDef("workflow_cleanup_delete") + WorkflowCleanupArchiveCount = NewCounterDef("workflow_cleanup_archive") + WorkflowCleanupNopCount = NewCounterDef("workflow_cleanup_nop") + WorkflowCleanupDeleteHistoryInlineCount = NewCounterDef("workflow_cleanup_delete_history_inline") + WorkflowSuccessCount = NewCounterDef("workflow_success") + WorkflowCancelCount = NewCounterDef("workflow_cancel") + WorkflowFailedCount = NewCounterDef("workflow_failed") + WorkflowTimeoutCount = NewCounterDef("workflow_timeout") + WorkflowTerminateCount = NewCounterDef("workflow_terminate") + WorkflowContinuedAsNewCount = NewCounterDef("workflow_continued_as_new") + LastRetrievedMessageID = NewGaugeDef("last_retrieved_message_id") + LastProcessedMessageID = NewGaugeDef("last_processed_message_id") + ReplicationTasksSend = NewCounterDef("replication_tasks_send") + ReplicationTasksRecv = NewCounterDef("replication_tasks_recv") + ReplicationTasksRecvBacklog = NewDimensionlessHistogramDef("replication_tasks_recv_backlog") + ReplicationTasksApplied = NewCounterDef("replication_tasks_applied") + ReplicationTasksFailed = NewCounterDef("replication_tasks_failed") + // ReplicationTasksLag is a heuristic for how far behind the remote DC is for a given cluster. It measures the + // difference between task IDs so its unit should be "tasks". + // It currently has units of "ms", which is incorrect. See https://github.com/temporalio/temporal/issues/4483. + ReplicationTasksLag = NewTimeHistogramDef("replication_tasks_lag") + // ReplicationTasksFetched records the number of tasks fetched by the poller. + // It has the same unit issue as ReplicationTasksLag. + ReplicationTasksFetched = NewTimeHistogramDef("replication_tasks_fetched") + // ReplicationTasksReturned is the same as ReplicationTasksFetched. + ReplicationTasksReturned = NewTimeHistogramDef("replication_tasks_returned") ReplicationLatency = NewTimerDef("replication_latency") - ReplicationTasksFetched = NewTimerDef("replication_tasks_fetched") - ReplicationTasksReturned = NewTimerDef("replication_tasks_returned") ReplicationTasksAppliedLatency = NewTimerDef("replication_tasks_applied_latency") ReplicationDLQFailed = NewCounterDef("replication_dlq_enqueue_failed") ReplicationDLQMaxLevelGauge = NewGaugeDef("replication_dlq_max_level") @@ -1585,7 +1591,7 @@ var ( ArchiverDeleteSuccessCount = NewCounterDef("archiver_delete_success") ArchiverHandleVisibilityFailedAllRetiresCount = NewCounterDef("archiver_handle_visibility_failed_all_retries") ArchiverHandleVisibilitySuccessCount = NewCounterDef("archiver_handle_visibility_success") - ArchiverBacklogSizeGauge = NewCounterDef("archiver_backlog_size") + ArchiverBacklogSizeGauge = NewGaugeDef("archiver_backlog_size") ArchiverPumpTimeoutCount = NewCounterDef("archiver_pump_timeout") ArchiverPumpSignalThresholdCount = NewCounterDef("archiver_pump_signal_threshold") ArchiverPumpTimeoutWithoutSignalsCount = NewCounterDef("archiver_pump_timeout_without_signals")