From d1b2054ac7ce9d0fa11d28c22733afedb3311b96 Mon Sep 17 00:00:00 2001 From: Alexander Zalyalov Date: Wed, 15 Oct 2025 09:32:45 +0000 Subject: [PATCH 1/3] state storage in hc --- ydb/core/base/statestorage.cpp | 4 +- ydb/core/base/statestorage.h | 2 + ydb/core/health_check/health_check.cpp | 79 ++++++++++++++++++++++ ydb/core/health_check/health_check_ut.cpp | 63 +++++++++++++++++ ydb/public/api/protos/ydb_monitoring.proto | 7 ++ 5 files changed, 153 insertions(+), 2 deletions(-) diff --git a/ydb/core/base/statestorage.cpp b/ydb/core/base/statestorage.cpp index 8cb100f3c5fa..ac1859979478 100644 --- a/ydb/core/base/statestorage.cpp +++ b/ydb/core/base/statestorage.cpp @@ -356,13 +356,13 @@ TIntrusivePtr BuildStateStorageInfoImpl(const char* namePrefi memset(name + offset, 0, TActorId::MaxServiceIDLength - offset); for (size_t i = 0; i < config.RingGroupsSize(); i++) { auto& ringGroup = config.GetRingGroups(i); - info->RingGroups.push_back({GetRingGroupState(ringGroup), ringGroup.GetWriteOnly(), ringGroup.GetNToSelect(), {}}); + info->RingGroups.push_back({GetRingGroupState(ringGroup), ringGroup.GetWriteOnly(), ringGroup.GetNToSelect(), TBridgePileId::FromProto(&ringGroup, &NKikimrConfig::TDomainsConfig::TStateStorage::TRing::GetBridgePileId), {}}); CopyStateStorageRingInfo(ringGroup, info->RingGroups.back(), name, offset, ringGroup.GetRingGroupActorIdOffset()); memset(name + offset, 0, TActorId::MaxServiceIDLength - offset); } if (config.HasRing()) { auto& ring = config.GetRing(); - info->RingGroups.push_back({ERingGroupState::PRIMARY, false, ring.GetNToSelect(), {}}); + info->RingGroups.push_back({ERingGroupState::PRIMARY, false, ring.GetNToSelect(), {}, {}}); CopyStateStorageRingInfo(ring, info->RingGroups.back(), name, offset, ring.GetRingGroupActorIdOffset()); } return info; diff --git a/ydb/core/base/statestorage.h b/ydb/core/base/statestorage.h index 31eebf02bcae..057a4c159667 100644 --- a/ydb/core/base/statestorage.h +++ b/ydb/core/base/statestorage.h @@ -1,4 +1,5 @@ #pragma once +#include "bridge.h" #include "defs.h" #include "events.h" #include @@ -514,6 +515,7 @@ struct TStateStorageInfo : public TThrRefBase { ERingGroupState State; bool WriteOnly = false; ui32 NToSelect = 0; + TBridgePileId BridgePileId; TVector Rings; TString ToString() const; diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index 537bf555bd63..d69f4457b798 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -155,6 +156,9 @@ class TSelfCheckRequest : public TActorBootstrapped { QuotaUsage, BridgeGroupState, PileComputeState, + StateStorage, + StateStorageRing, + StateStorageNode, }; enum ETimeoutTag { @@ -675,6 +679,7 @@ class TSelfCheckRequest : public TActorBootstrapped { std::optional> PDisks; std::optional> NodeWardenStorageConfig; std::optional> DatabaseBoardInfo; + std::optional> StateStorageInfo; THashSet UnknownStaticGroups; const NKikimrConfig::THealthCheckConfig& HealthCheckConfig; @@ -837,6 +842,12 @@ class TSelfCheckRequest : public TActorBootstrapped { NodeWardenStorageConfig = RequestStorageConfig(); } + if (!IsSpecificDatabaseFilter()) { + StateStorageInfo = TRequestResponse(Span.CreateChild(TComponentTracingLevels::TTablet::Detailed, "TEvStateStorage::TEvListStateStorageResult")); + Send(MakeStateStorageProxyID(), new TEvStateStorage::TEvListStateStorage(), 0/*flags*/, 0/*cookie*/, Span.GetTraceId()); + ++Requests; + } + NodesInfo = TRequestResponse(Span.CreateChild(TComponentTracingLevels::TTablet::Detailed, "TEvInterconnect::TEvListNodes")); Send(GetNameserviceActorId(), new TEvInterconnect::TEvListNodes(), 0/*flags*/, 0/*cookie*/, Span.GetTraceId()); @@ -921,6 +932,19 @@ class TSelfCheckRequest : public TActorBootstrapped { } } + void Handle(TEvStateStorage::TEvListStateStorageResult::TPtr& ev) { + if (StateStorageInfo->Set(std::move(ev))) { + for (const auto& group : StateStorageInfo->Get()->Info->RingGroups) { + for (const auto& ring : group.Rings) { + for (const auto& replica : ring.Replicas) { + RequestGenericNode(replica.NodeId()); + } + } + } + RequestDone("TEvListStateStorageResult"); + } + } + STATEFN(StateWait) { switch (ev->GetTypeRewrite()) { hFunc(TEvents::TEvUndelivered, Handle); @@ -946,6 +970,7 @@ class TSelfCheckRequest : public TActorBootstrapped { hFunc(TEvStateStorage::TEvBoardInfo, Handle); hFunc(TEvents::TEvWakeup, HandleTimeout); hFunc(TEvNodeWardenStorageConfig, Handle); + hFunc(TEvStateStorage::TEvListStateStorageResult, Handle); } } @@ -3463,6 +3488,59 @@ class TSelfCheckRequest : public TActorBootstrapped { } } + void FillStateStorage(TOverallStateContext& context) { + if (!StateStorageInfo || !StateStorageInfo->IsOk()) { + return; + } + TSelfCheckResult ssContext; + ssContext.Type = "STATE_STORAGE"; + const auto info = StateStorageInfo->Get()->Info; + for (const auto& ringGroup : info->RingGroups) { + if (ringGroup.State != ERingGroupState::PRIMARY && ringGroup.State != ERingGroupState::SYNCHRONIZED) { + continue; + } + TSelfCheckResult* currentContext = &ssContext; + TSelfCheckContext pileContext(&ssContext, "PILE_STATE_STORAGE"); + if ((bool)ringGroup.BridgePileId) { + const auto& pileName = NodeWardenStorageConfig->Get()->BridgeInfo->GetPile(ringGroup.BridgePileId)->Name; + pileContext.Location.mutable_compute()->mutable_state_storage()->mutable_pile()->set_name(pileName); + currentContext = &pileContext; + } + ui32 disabledRings = 0; + ui32 badRings = 0; + for (size_t ringIdx = 0; ringIdx < ringGroup.Rings.size(); ++ringIdx) { + const auto& ring = ringGroup.Rings[ringIdx]; + TSelfCheckContext ringContext(currentContext, "STATE_STORAGE_RING"); + ringContext.Location.mutable_compute()->mutable_state_storage()->set_ring(ringIdx); + if (ring.IsDisabled) { + ++disabledRings; + continue; + } + for (const auto& replica : ring.Replicas) { + const auto node = replica.NodeId(); + if (!NodeSystemState[node].IsOk()) { + TSelfCheckContext nodeContext(&ringContext, "STATE_STORAGE_NODE"); + nodeContext.Location.mutable_compute()->mutable_state_storage()->mutable_node()->set_id(node); + nodeContext.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "State storage node is not available", ETags::StateStorageNode); + } + } + ringContext.ReportWithMaxChildStatus("Ring has unavailable nodes", ETags::StateStorageRing, {ETags::StateStorageNode}); + if (ringContext.GetOverallStatus() == Ydb::Monitoring::StatusFlag::RED) { + ++badRings; + } + } + if (disabledRings + badRings > (ringGroup.NToSelect - 1) / 2) { + currentContext->ReportStatus(Ydb::Monitoring::StatusFlag::RED, "State storage is not functional", ETags::StateStorage); + } else if (badRings > 1) { + currentContext->ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Some state storage rings have unavailable replicas", ETags::StateStorage); + } else if (badRings > 0) { + currentContext->ReportStatus(Ydb::Monitoring::StatusFlag::BLUE, "State storage has unavailable replicas", ETags::StateStorage); + } + } + context.UpdateMaxStatus(ssContext.GetOverallStatus()); + context.AddIssues(ssContext.IssueRecords); + } + void FillResult(TOverallStateContext context) { if (IsSpecificDatabaseFilter()) { FillDatabaseResult(context, FilterDatabase, DatabaseState[FilterDatabase]); @@ -3470,6 +3548,7 @@ class TSelfCheckRequest : public TActorBootstrapped { for (auto& [path, state] : DatabaseState) { FillDatabaseResult(context, path, state); } + FillStateStorage(context); } if (DatabaseState.empty()) { Ydb::Monitoring::DatabaseStatus& databaseStatus(*context.Result->add_database_status()); diff --git a/ydb/core/health_check/health_check_ut.cpp b/ydb/core/health_check/health_check_ut.cpp index ffaef109ad19..ce1b500f29aa 100644 --- a/ydb/core/health_check/health_check_ut.cpp +++ b/ydb/core/health_check/health_check_ut.cpp @@ -2860,9 +2860,72 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { runtime.DispatchEvents({}, TDuration::MilliSeconds(500)); block.Stop().Unblock(); auto result = runtime.GrabEdgeEvent(handle)->Result; + Cerr << result.ShortDebugString() << Endl; UNIT_ASSERT_VALUES_EQUAL(result.self_check_result(), Ydb::Monitoring::SelfCheck::GOOD); } + void TestStateStorage(ui32 deadNodes, std::optional expectedStatus) { + TPortManager tp; + ui16 port = tp.GetPort(2134); + ui16 grpcPort = tp.GetPort(2135); + auto settings = TServerSettings(port) + .SetNodeCount(9) + .SetUseRealThreads(false) + .SetDomainName("Root"); + TServer server(settings); + server.EnableGRpc(grpcPort); + TClient client(settings); + TTestActorRuntime& runtime = *server.GetRuntime(); + + TActorId sender = runtime.AllocateEdgeActor(); + TAutoPtr handle; + + TIntrusivePtr info = new TStateStorageInfo(); + info->RingGroups.push_back({ERingGroupState::PRIMARY, false, 9, {}, {}}); + info->RingGroups.back().Rings.resize(9); + info->RingGroups.back().NToSelect = 9; + for (ui32 i = 0; i < 9; ++i) { + info->RingGroups.back().Rings[i].Replicas.emplace_back(runtime.GetNodeId(i), "FAKE"); + } + + auto ssObserver = runtime.AddObserver([&](auto&& ev) { ev->Get()->Info = info; }); + + auto disconnectObserver = runtime.AddObserver([&](auto&& ev) { + auto actor = ev->Recipient; + auto nodeId = ev->Sender.NodeId(); + auto nodeIdx = nodeId - runtime.GetNodeId(0); + if (nodeIdx < deadNodes) { + runtime.Send(new IEventHandle(actor, actor, new TEvInterconnect::TEvNodeDisconnected(nodeId)), actor.NodeId() - runtime.GetNodeId(0)); + ev.Reset(); + } + }); + + runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0)); + auto result = runtime.GrabEdgeEvent(handle)->Result; + Cerr << result.ShortDebugString() << Endl; + if (expectedStatus) { + CheckHcResultHasIssuesWithStatus(result, "STATE_STORAGE", *expectedStatus, 1); + } else { + UNIT_ASSERT_VALUES_EQUAL(result.self_check_result(), Ydb::Monitoring::SelfCheck::GOOD); + } + } + + Y_UNIT_TEST(TestStateStorageOk) { + TestStateStorage(0, std::nullopt); + } + + Y_UNIT_TEST(TestStateStorageBlue) { + TestStateStorage(1, Ydb::Monitoring::StatusFlag::BLUE); + } + + Y_UNIT_TEST(TestStateStorageYellow) { + TestStateStorage(3, Ydb::Monitoring::StatusFlag::YELLOW); + } + + Y_UNIT_TEST(TestStateStorageRed) { + TestStateStorage(6, Ydb::Monitoring::StatusFlag::RED); + } + Y_UNIT_TEST(CLusterNotBootstrapped) { TPortManager tp; ui16 port = tp.GetPort(2134); diff --git a/ydb/public/api/protos/ydb_monitoring.proto b/ydb/public/api/protos/ydb_monitoring.proto index 3448ab88f4c6..2aac0324bdc5 100644 --- a/ydb/public/api/protos/ydb_monitoring.proto +++ b/ydb/public/api/protos/ydb_monitoring.proto @@ -182,12 +182,19 @@ message LocationBridgePile { string name = 1; } +message LocationStateStorage { + uint32 ring = 1; + LocationNode node = 2; + LocationBridgePile pile = 3; +} + message LocationCompute { LocationNode node = 1; LocationComputePool pool = 2; LocationComputeTablet tablet = 3; LocationComputeSchema schema = 4; LocationBridgePile pile = 5; + LocationStateStorage state_storage = 6; } message LocationDatabase { From 58de228c00098f0a7a6192d7db6f66f41acfcfaf Mon Sep 17 00:00:00 2001 From: Alexander Zalyalov Date: Mon, 20 Oct 2025 08:53:41 +0000 Subject: [PATCH 2/3] + scheme board + board + correct merging --- ydb/core/base/statestorage.h | 4 + ydb/core/base/statestorage_impl.h | 11 +++ ydb/core/base/statestorage_proxy.cpp | 5 ++ ydb/core/health_check/health_check.cpp | 104 ++++++++++++++++++---- ydb/core/health_check/health_check_ut.cpp | 8 +- 5 files changed, 112 insertions(+), 20 deletions(-) diff --git a/ydb/core/base/statestorage.h b/ydb/core/base/statestorage.h index 057a4c159667..981f6c1dbe41 100644 --- a/ydb/core/base/statestorage.h +++ b/ydb/core/base/statestorage.h @@ -31,6 +31,7 @@ struct TEvStateStorage { EvPublishActorGone, EvRingGroupPassAway, EvConfigVersionInfo, + EvListBoard, // replies (local, from proxy) EvInfo = EvLookup + 512, @@ -40,6 +41,7 @@ struct TEvStateStorage { EvDeleteResult, EvListSchemeBoardResult, EvListStateStorageResult, + EvListBoardResult, // replicas interface EvReplicaLookup = EvLock + 2 * 512, @@ -384,6 +386,8 @@ struct TEvStateStorage { struct TEvPublishActorGone; struct TEvUpdateGroupConfig; struct TEvRingGroupPassAway; + struct TEvListBoard; + struct TEvListBoardResult; struct TEvReplicaShutdown : public TEventPB { }; diff --git a/ydb/core/base/statestorage_impl.h b/ydb/core/base/statestorage_impl.h index 41de77c33325..d9e166affefd 100644 --- a/ydb/core/base/statestorage_impl.h +++ b/ydb/core/base/statestorage_impl.h @@ -218,6 +218,17 @@ struct TEvStateStorage::TEvListStateStorageResult : public TEventLocal { +}; + +struct TEvStateStorage::TEvListBoardResult : public TEventLocal { + TIntrusiveConstPtr Info; + + TEvListBoardResult(const TIntrusiveConstPtr &info) + : Info(info) + {} +}; + struct TEvStateStorage::TEvPublishActorGone : public TEventLocal { TActorId Replica; diff --git a/ydb/core/base/statestorage_proxy.cpp b/ydb/core/base/statestorage_proxy.cpp index d211e1d23d65..8229164d2ecb 100644 --- a/ydb/core/base/statestorage_proxy.cpp +++ b/ydb/core/base/statestorage_proxy.cpp @@ -1070,6 +1070,10 @@ class TStateStorageProxy : public TActor { Send(ev->Sender, new TEvStateStorage::TEvListStateStorageResult(Info), 0, ev->Cookie); } + void Handle(TEvStateStorage::TEvListBoard::TPtr &ev) { + Send(ev->Sender, new TEvStateStorage::TEvListBoardResult(Info), 0, ev->Cookie); + } + void Handle(TEvStateStorage::TEvUpdateGroupConfig::TPtr &ev) { auto *msg = ev->Get(); Info = msg->GroupConfig; @@ -1142,6 +1146,7 @@ class TStateStorageProxy : public TActor { hFunc(TEvStateStorage::TEvListStateStorage, Handle); hFunc(TEvStateStorage::TEvUpdateGroupConfig, Handle); hFunc(TEvStateStorage::TEvRingGroupPassAway, Handle); + hFunc(TEvStateStorage::TEvListBoard, Handle); fFunc(TEvents::TSystem::Unsubscribe, HandleUnsubscribe); default: if (Info->RingGroups.size() > 1) diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index d69f4457b798..ec2afe01cef5 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -320,7 +320,7 @@ class TSelfCheckRequest : public TActorBootstrapped { if (issueLog.status() != Ydb::Monitoring::StatusFlag::UNSPECIFIED) { id << Ydb::Monitoring::StatusFlag_Status_Name(issueLog.status()) << '-'; } - id << crc16(issueLog.message()); + id << crc16(TStringBuilder() << issueLog.message() << issueLog.type()); if (location.database().name()) { id << '-' << crc32(location.database().name()); } @@ -367,6 +367,15 @@ class TSelfCheckRequest : public TActorBootstrapped { if (location.compute().schema().path()) { id << '-' << crc32(location.compute().schema().path()); } + if (location.compute().state_storage().pile().name()) { + id << '-' << location.compute().state_storage().pile().name(); + } + if (location.compute().state_storage().ring()) { + id << '-' << location.compute().state_storage().ring(); + } + if (location.compute().state_storage().node().id()) { + id << '-' << location.compute().state_storage().node().id(); + } return id.Str(); } @@ -396,10 +405,10 @@ class TSelfCheckRequest : public TActorBootstrapped { if (Location.ByteSizeLong() > 0) { issueLog.mutable_location()->CopyFrom(Location); } - issueLog.set_id(GetIssueId(issueLog)); if (Type) { issueLog.set_type(Type); } + issueLog.set_id(GetIssueId(issueLog)); issueLog.set_level(Level); if (!reason.empty()) { for (const TString& r : reason) { @@ -680,6 +689,8 @@ class TSelfCheckRequest : public TActorBootstrapped { std::optional> NodeWardenStorageConfig; std::optional> DatabaseBoardInfo; std::optional> StateStorageInfo; + std::optional> SchemeBoardInfo; + std::optional> BoardInfo; THashSet UnknownStaticGroups; const NKikimrConfig::THealthCheckConfig& HealthCheckConfig; @@ -845,7 +856,11 @@ class TSelfCheckRequest : public TActorBootstrapped { if (!IsSpecificDatabaseFilter()) { StateStorageInfo = TRequestResponse(Span.CreateChild(TComponentTracingLevels::TTablet::Detailed, "TEvStateStorage::TEvListStateStorageResult")); Send(MakeStateStorageProxyID(), new TEvStateStorage::TEvListStateStorage(), 0/*flags*/, 0/*cookie*/, Span.GetTraceId()); - ++Requests; + SchemeBoardInfo = TRequestResponse(Span.CreateChild(TComponentTracingLevels::TTablet::Detailed, "TEvStateStorage::TEvListSchemeBoardResult")); + Send(MakeStateStorageProxyID(), new TEvStateStorage::TEvListSchemeBoard(false), 0/*flags*/, 0/*cookie*/, Span.GetTraceId()); + BoardInfo = TRequestResponse(Span.CreateChild(TComponentTracingLevels::TTablet::Detailed, "TEvStateStorage::TEvListBoardResult")); + Send(MakeStateStorageProxyID(), new TEvStateStorage::TEvListBoard(), 0/*flags*/, 0/*cookie*/, Span.GetTraceId()); + Requests += 3; } @@ -945,6 +960,32 @@ class TSelfCheckRequest : public TActorBootstrapped { } } + void Handle(TEvStateStorage::TEvListSchemeBoardResult::TPtr& ev) { + if (SchemeBoardInfo->Set(std::move(ev))) { + for (const auto& group : SchemeBoardInfo->Get()->Info->RingGroups) { + for (const auto& ring : group.Rings) { + for (const auto& replica : ring.Replicas) { + RequestGenericNode(replica.NodeId()); + } + } + } + RequestDone("TEvListSсhemeBoardResult"); + } + } + + void Handle(TEvStateStorage::TEvListBoardResult::TPtr& ev) { + if (BoardInfo->Set(std::move(ev))) { + for (const auto& group : BoardInfo->Get()->Info->RingGroups) { + for (const auto& ring : group.Rings) { + for (const auto& replica : ring.Replicas) { + RequestGenericNode(replica.NodeId()); + } + } + } + RequestDone("TEvListBoardResult"); + } + } + STATEFN(StateWait) { switch (ev->GetTypeRewrite()) { hFunc(TEvents::TEvUndelivered, Handle); @@ -971,6 +1012,8 @@ class TSelfCheckRequest : public TActorBootstrapped { hFunc(TEvents::TEvWakeup, HandleTimeout); hFunc(TEvNodeWardenStorageConfig, Handle); hFunc(TEvStateStorage::TEvListStateStorageResult, Handle); + hFunc(TEvStateStorage::TEvListSchemeBoardResult, Handle); + hFunc(TEvStateStorage::TEvListBoardResult, Handle); } } @@ -3024,6 +3067,18 @@ class TSelfCheckRequest : public TActorBootstrapped { message = std::regex_replace(message.c_str(), std::regex("^PDisk "), "PDisks "); break; } + case ETags::StateStorageRing: { + message = std::regex_replace(message.c_str(), std::regex("^Ring has "), "Rings have "); + message = std::regex_replace(message.c_str(), std::regex("^Ring is "), "Rings are "); + message = std::regex_replace(message.c_str(), std::regex("^Ring "), "Rings "); + break; + } + case ETags::StateStorageNode: { + message = std::regex_replace(message.c_str(), std::regex("^Ring has "), "Rings have "); + message = std::regex_replace(message.c_str(), std::regex("^Ring is "), "Rings are "); + message = std::regex_replace(message.c_str(), std::regex("^Ring "), "Rings "); + break; + } default: break; } @@ -3079,6 +3134,10 @@ class TSelfCheckRequest : public TActorBootstrapped { isSimilar = it->IssueLog.location().storage().pool().group().pile().name() == similar.begin()->IssueLog.location().storage().pool().group().pile().name(); } + if (isSimilar && similar.begin()->IssueLog.location().compute().state_storage().has_pile()) { + isSimilar = it->IssueLog.location().compute().state_storage().pile().name() + == similar.begin()->IssueLog.location().compute().state_storage().pile().name(); + } if (isSimilar) { auto move = it++; similar.splice(similar.end(), records, move); @@ -3320,6 +3379,8 @@ class TSelfCheckRequest : public TActorBootstrapped { MergeLevelRecords(mergeContext, ETags::VDiskState, ETags::BridgeGroupState); MergeLevelRecords(mergeContext, ETags::VDiskState, ETags::GroupState); MergeLevelRecords(mergeContext, ETags::PDiskState, ETags::VDiskState); + MergeLevelRecords(mergeContext, ETags::StateStorageRing); + MergeLevelRecords(mergeContext, ETags::StateStorageNode, ETags::StateStorageRing); } mergeContext.FillRecords(records); } @@ -3488,20 +3549,16 @@ class TSelfCheckRequest : public TActorBootstrapped { } } - void FillStateStorage(TOverallStateContext& context) { - if (!StateStorageInfo || !StateStorageInfo->IsOk()) { - return; - } + void FillStateStorage(TOverallStateContext& context, TString type, TIntrusiveConstPtr info) { TSelfCheckResult ssContext; - ssContext.Type = "STATE_STORAGE"; - const auto info = StateStorageInfo->Get()->Info; + ssContext.Type = type; for (const auto& ringGroup : info->RingGroups) { if (ringGroup.State != ERingGroupState::PRIMARY && ringGroup.State != ERingGroupState::SYNCHRONIZED) { continue; } TSelfCheckResult* currentContext = &ssContext; - TSelfCheckContext pileContext(&ssContext, "PILE_STATE_STORAGE"); - if ((bool)ringGroup.BridgePileId) { + TSelfCheckContext pileContext(&ssContext, TStringBuilder() << "PILE_" << type); + if ((bool)ringGroup.BridgePileId && NodeWardenStorageConfig->IsOk()) { const auto& pileName = NodeWardenStorageConfig->Get()->BridgeInfo->GetPile(ringGroup.BridgePileId)->Name; pileContext.Location.mutable_compute()->mutable_state_storage()->mutable_pile()->set_name(pileName); currentContext = &pileContext; @@ -3510,8 +3567,8 @@ class TSelfCheckRequest : public TActorBootstrapped { ui32 badRings = 0; for (size_t ringIdx = 0; ringIdx < ringGroup.Rings.size(); ++ringIdx) { const auto& ring = ringGroup.Rings[ringIdx]; - TSelfCheckContext ringContext(currentContext, "STATE_STORAGE_RING"); - ringContext.Location.mutable_compute()->mutable_state_storage()->set_ring(ringIdx); + TSelfCheckContext ringContext(currentContext, TStringBuilder() << type << "_RING"); + ringContext.Location.mutable_compute()->mutable_state_storage()->set_ring(ringIdx + 1); if (ring.IsDisabled) { ++disabledRings; continue; @@ -3519,9 +3576,9 @@ class TSelfCheckRequest : public TActorBootstrapped { for (const auto& replica : ring.Replicas) { const auto node = replica.NodeId(); if (!NodeSystemState[node].IsOk()) { - TSelfCheckContext nodeContext(&ringContext, "STATE_STORAGE_NODE"); + TSelfCheckContext nodeContext(&ringContext, TStringBuilder() << type << "_NODE"); nodeContext.Location.mutable_compute()->mutable_state_storage()->mutable_node()->set_id(node); - nodeContext.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "State storage node is not available", ETags::StateStorageNode); + nodeContext.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Node is not available", ETags::StateStorageNode); } } ringContext.ReportWithMaxChildStatus("Ring has unavailable nodes", ETags::StateStorageRing, {ETags::StateStorageNode}); @@ -3530,13 +3587,14 @@ class TSelfCheckRequest : public TActorBootstrapped { } } if (disabledRings + badRings > (ringGroup.NToSelect - 1) / 2) { - currentContext->ReportStatus(Ydb::Monitoring::StatusFlag::RED, "State storage is not functional", ETags::StateStorage); + currentContext->ReportStatus(Ydb::Monitoring::StatusFlag::RED, "There is not enough functional rings", ETags::StateStorage); } else if (badRings > 1) { - currentContext->ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Some state storage rings have unavailable replicas", ETags::StateStorage); + currentContext->ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Multiple rings have unavailable replicas", ETags::StateStorage); } else if (badRings > 0) { - currentContext->ReportStatus(Ydb::Monitoring::StatusFlag::BLUE, "State storage has unavailable replicas", ETags::StateStorage); + currentContext->ReportStatus(Ydb::Monitoring::StatusFlag::BLUE, "One ring has unavailable replicas", ETags::StateStorage); } } + MergeRecords(ssContext.IssueRecords); context.UpdateMaxStatus(ssContext.GetOverallStatus()); context.AddIssues(ssContext.IssueRecords); } @@ -3548,7 +3606,15 @@ class TSelfCheckRequest : public TActorBootstrapped { for (auto& [path, state] : DatabaseState) { FillDatabaseResult(context, path, state); } - FillStateStorage(context); + if (StateStorageInfo && StateStorageInfo->IsOk()) { + FillStateStorage(context, "STATE_STORAGE", StateStorageInfo->Get()->Info); + } + if (SchemeBoardInfo && SchemeBoardInfo->IsOk()) { + FillStateStorage(context, "SCHEME_BOARD", SchemeBoardInfo->Get()->Info); + } + if (BoardInfo && BoardInfo->IsOk()) { + FillStateStorage(context, "BOARD", BoardInfo->Get()->Info); + } } if (DatabaseState.empty()) { Ydb::Monitoring::DatabaseStatus& databaseStatus(*context.Result->add_database_status()); diff --git a/ydb/core/health_check/health_check_ut.cpp b/ydb/core/health_check/health_check_ut.cpp index ce1b500f29aa..207194708e6c 100644 --- a/ydb/core/health_check/health_check_ut.cpp +++ b/ydb/core/health_check/health_check_ut.cpp @@ -2889,6 +2889,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { } auto ssObserver = runtime.AddObserver([&](auto&& ev) { ev->Get()->Info = info; }); + auto sbObserver = runtime.AddObserver([&](auto&& ev) { ev->Get()->Info = info; }); + auto bObserver = runtime.AddObserver([&](auto&& ev) { ev->Get()->Info = info; }); auto disconnectObserver = runtime.AddObserver([&](auto&& ev) { auto actor = ev->Recipient; @@ -2900,11 +2902,15 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { } }); - runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0)); + auto *request = new NHealthCheck::TEvSelfCheckRequest(); + request->Request.set_merge_records(true); + runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, request, 0)); auto result = runtime.GrabEdgeEvent(handle)->Result; Cerr << result.ShortDebugString() << Endl; if (expectedStatus) { CheckHcResultHasIssuesWithStatus(result, "STATE_STORAGE", *expectedStatus, 1); + CheckHcResultHasIssuesWithStatus(result, "SCHEME_BOARD", *expectedStatus, 1); + CheckHcResultHasIssuesWithStatus(result, "BOARD", *expectedStatus, 1); } else { UNIT_ASSERT_VALUES_EQUAL(result.self_check_result(), Ydb::Monitoring::SelfCheck::GOOD); } From f82a5a271975137a45986a02a1752409039bd9bf Mon Sep 17 00:00:00 2001 From: Alexander Zalyalov Date: Mon, 20 Oct 2025 16:11:49 +0000 Subject: [PATCH 3/3] review --- ydb/core/base/statestorage_proxy.cpp | 2 +- ydb/core/health_check/health_check.cpp | 36 ++++++++++---------------- 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/ydb/core/base/statestorage_proxy.cpp b/ydb/core/base/statestorage_proxy.cpp index 8229164d2ecb..0a6b01f540fe 100644 --- a/ydb/core/base/statestorage_proxy.cpp +++ b/ydb/core/base/statestorage_proxy.cpp @@ -1071,7 +1071,7 @@ class TStateStorageProxy : public TActor { } void Handle(TEvStateStorage::TEvListBoard::TPtr &ev) { - Send(ev->Sender, new TEvStateStorage::TEvListBoardResult(Info), 0, ev->Cookie); + Send(ev->Sender, new TEvStateStorage::TEvListBoardResult(BoardInfo), 0, ev->Cookie); } void Handle(TEvStateStorage::TEvUpdateGroupConfig::TPtr &ev) { diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index ec2afe01cef5..7abdf08f2285 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -947,41 +947,33 @@ class TSelfCheckRequest : public TActorBootstrapped { } } - void Handle(TEvStateStorage::TEvListStateStorageResult::TPtr& ev) { - if (StateStorageInfo->Set(std::move(ev))) { - for (const auto& group : StateStorageInfo->Get()->Info->RingGroups) { - for (const auto& ring : group.Rings) { - for (const auto& replica : ring.Replicas) { - RequestGenericNode(replica.NodeId()); - } + void RequestNodes(TIntrusiveConstPtr info) { + for (const auto& group : info->RingGroups) { + for (const auto& ring : group.Rings) { + for (const auto& replica : ring.Replicas) { + RequestGenericNode(replica.NodeId()); } } + } + } + + void Handle(TEvStateStorage::TEvListStateStorageResult::TPtr& ev) { + if (StateStorageInfo->Set(std::move(ev))) { + RequestNodes(StateStorageInfo->Get()->Info); RequestDone("TEvListStateStorageResult"); } } void Handle(TEvStateStorage::TEvListSchemeBoardResult::TPtr& ev) { if (SchemeBoardInfo->Set(std::move(ev))) { - for (const auto& group : SchemeBoardInfo->Get()->Info->RingGroups) { - for (const auto& ring : group.Rings) { - for (const auto& replica : ring.Replicas) { - RequestGenericNode(replica.NodeId()); - } - } - } + RequestNodes(SchemeBoardInfo->Get()->Info); RequestDone("TEvListSсhemeBoardResult"); } } void Handle(TEvStateStorage::TEvListBoardResult::TPtr& ev) { if (BoardInfo->Set(std::move(ev))) { - for (const auto& group : BoardInfo->Get()->Info->RingGroups) { - for (const auto& ring : group.Rings) { - for (const auto& replica : ring.Replicas) { - RequestGenericNode(replica.NodeId()); - } - } - } + RequestNodes(BoardInfo->Get()->Info); RequestDone("TEvListBoardResult"); } } @@ -3558,7 +3550,7 @@ class TSelfCheckRequest : public TActorBootstrapped { } TSelfCheckResult* currentContext = &ssContext; TSelfCheckContext pileContext(&ssContext, TStringBuilder() << "PILE_" << type); - if ((bool)ringGroup.BridgePileId && NodeWardenStorageConfig->IsOk()) { + if ((bool)ringGroup.BridgePileId && NodeWardenStorageConfig && NodeWardenStorageConfig->IsOk()) { const auto& pileName = NodeWardenStorageConfig->Get()->BridgeInfo->GetPile(ringGroup.BridgePileId)->Name; pileContext.Location.mutable_compute()->mutable_state_storage()->mutable_pile()->set_name(pileName); currentContext = &pileContext;