Skip to content

Commit

Permalink
15234 FIX Periodic service discovery: Vanished clustered services can…
Browse files Browse the repository at this point in the history
… now be removed automatically

Change-Id: I2373b90c33b80ac9f4da10838fe1ff48ceee9388
  • Loading branch information
mo-ki committed Feb 7, 2023
1 parent 3cfb6c9 commit dc43ba2
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 6 deletions.
22 changes: 22 additions & 0 deletions .werks/15234
@@ -0,0 +1,22 @@
Title: Periodic service discovery: Vanished clustered services can now be removed automatically
Class: fix
Compatible: compat
Component: checks
Date: 1675348187
Edition: cre
Knowledge: doc
Level: 1
State: unknown
Version: 2.2.0i1

Up to this Werk, vanished clustered services could never be removed by the periodic service discovery.
We have now added a configuration option that allows users to achieve just that, but beware:

By default we keep a record of vanished services on the node if they are assigned to a cluster.
When a clustered service switches from one node to another, it might not be seen on either node for one check cycle.
Keeping clustered services indefinitely keeps us from losing them in this case.
However, this means that truly vanished clustered services will never be removed from the cluster.
If you choose to include clustered services in the removal operation, vanished services will be removed from clusters, at the risk of losing services due to the described race condition.

If you have specific needs, you can always adapt the services manually using the service discovery page.

17 changes: 12 additions & 5 deletions cmk/base/agent_based/discovery/autodiscovery.py
Expand Up @@ -107,6 +107,7 @@ def automation_discovery(
check_plugins: Mapping[CheckPluginName, CheckPlugin],
find_service_description: Callable[[HostName, CheckPluginName, Item], ServiceName],
mode: DiscoveryMode,
keep_clustered_vanished_services: bool,
service_filters: _ServiceFilters | None,
on_error: OnError,
) -> DiscoveryResult:
Expand Down Expand Up @@ -171,6 +172,7 @@ def automation_discovery(
result,
find_service_description,
mode,
keep_clustered_vanished_services,
)
config_cache.set_autochecks(host_name, list(final_services.values()))

Expand Down Expand Up @@ -242,9 +244,10 @@ def _get_post_discovery_autocheck_services( # pylint: disable=too-many-branches
result: DiscoveryResult,
find_service_description: Callable[[HostName, CheckPluginName, Item], ServiceName],
mode: DiscoveryMode,
keep_clustered_vanished_services: bool,
) -> Mapping[ServiceID, AutocheckServiceWithNodes]:
"""
The output contains a selction of services in the states "new", "old", "ignored", "vanished"
The output contains a selection of services in the states "new", "old", "ignored", "vanished"
(depending on the value of `mode`) and "clustered_".
Services with the states "custom", "active" and "manual" are currently not checked.
Expand Down Expand Up @@ -292,10 +295,11 @@ def _get_post_discovery_autocheck_services( # pylint: disable=too-many-branches
result.self_kept += 1

else:
# Silently keep clustered services
post_discovery_services.update(
(s.service.id(), s) for s in discovered_services_with_nodes
)
if check_source != "clustered_vanished" or keep_clustered_vanished_services:
# Silently keep clustered services
post_discovery_services.update(
(s.service.id(), s) for s in discovered_services_with_nodes
)
if check_source == "clustered_new":
result.clustered_new += len(discovered_services_with_nodes)
elif check_source == "clustered_old":
Expand Down Expand Up @@ -463,6 +467,9 @@ def _discover_marked_host(
check_plugins=check_plugins,
find_service_description=find_service_description,
mode=DiscoveryMode(params.rediscovery.get("mode")),
keep_clustered_vanished_services=params.rediscovery.get(
"keep_clustered_vanished_services", True
),
service_filters=_ServiceFilters.from_settings(params.rediscovery),
on_error=on_error,
)
Expand Down
1 change: 1 addition & 0 deletions cmk/base/automations/check_mk.py
Expand Up @@ -210,6 +210,7 @@ def execute(self, args: list[str]) -> DiscoveryResult:
check_plugins=CheckPluginMapper(),
find_service_description=config.service_description,
mode=mode,
keep_clustered_vanished_services=True,
service_filters=None,
on_error=on_error,
)
Expand Down
21 changes: 20 additions & 1 deletion cmk/gui/plugins/wato/check_mk_configuration.py
Expand Up @@ -4104,6 +4104,25 @@ def _valuespec_automatic_rediscover_parameters() -> Dictionary:
default_value=0,
),
),
(
"keep_clustered_vanished_services",
DropdownChoice(
title=_("Vanished clustered services"),
help=_(
"By default we keep a record of vanished services on the node if they are assigned to a cluster."
" When a clustered service switches from one node to another, it might not be seen on either node for one check cycle."
" Keeping clustered services indefinitely keeps us from loosing them in this case."
" However this means that truly vanished clustered servces will never be removed from the cluster."
" If you choose to include clustered service in the removal operation, vanished services will be removed from clusters,"
" at the risk of loosing services due to the described race condition."
),
choices=[
(True, _("Always keep vanished clustered services")),
(False, _("Include vanished clustered services during removal")),
],
default_value=True,
),
),
(
"group_time",
Age(
Expand Down Expand Up @@ -4196,7 +4215,7 @@ def _valuespec_automatic_rediscover_parameters() -> Dictionary:
),
),
],
optional_keys=["service_filters"],
optional_keys=["service_filters", "keep_clustered_vanished_services"],
)


Expand Down
Expand Up @@ -388,6 +388,7 @@ def test__get_post_discovery_services(
result,
find_service_description=lambda *args: f"Test Description {args[-1]}",
mode=mode,
keep_clustered_vanished_services=True,
).values()
]

Expand Down

0 comments on commit dc43ba2

Please sign in to comment.