From 9c5caf89553787b9c5692b49171a683afff68b94 Mon Sep 17 00:00:00 2001 From: st-shchetinin Date: Thu, 26 Jun 2025 17:16:59 +0300 Subject: [PATCH 1/6] done --- ydb/core/cms/cms.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ydb/core/cms/cms.cpp b/ydb/core/cms/cms.cpp index ea729ac4d397..412596cdf7d2 100644 --- a/ydb/core/cms/cms.cpp +++ b/ydb/core/cms/cms.cpp @@ -715,7 +715,9 @@ bool TCms::TryToLockStateStorageReplica(const TAction& action, } Y_ABORT_UNLESS(ClusterInfo->StateStorageInfo->RingGroups.size() > 0); - const ui32 nToSelect = ClusterInfo->StateStorageInfo->RingGroups[0].NToSelect; + // If the cluster is not in Bridge Mode, then pileId = 0 + const ui32 pileId = ClusterInfo->NodeIdToPileId[node.NodeId]; + const ui32 nToSelect = ClusterInfo->StateStorageInfo->RingGroups[pileId].NToSelect; const ui32 currentRing = ClusterInfo->GetRingId(node.NodeId); ui8 currentRingState = TStateStorageRingInfo::Unknown; ui32 restartRings = 0; @@ -724,6 +726,12 @@ bool TCms::TryToLockStateStorageReplica(const TAction& action, auto now = AppData(ctx)->TimeProvider->Now(); TDuration duration = TDuration::MicroSeconds(action.GetDuration()) + opts.PermissionDuration; for (auto ringInfo : ClusterInfo->StateStorageRings) { + Y_ABORT_UNLESS(ringInfo->Replicas.size() > 0); + // If the cluster is not in Bridge Mode, then it is always false + if (ClusterInfo->NodeIdToPileId[ringInfo->Replicas[0]->NodeId] != pileId) { + continue; + } + auto state = ringInfo->CountState(now, State->Config.DefaultRetryTime, duration); LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::CMS, "Ring: " << ringInfo->RingId << "; State: " << TStateStorageRingInfo::RingStateToString(state)); From 872b9a97db2d57dcd0a935b4f7cb2da8c53c1b34 Mon Sep 17 00:00:00 2001 From: st-shchetinin Date: Fri, 27 Jun 2025 12:22:35 +0300 Subject: [PATCH 2/6] fix --- ydb/core/cms/cluster_info.cpp | 37 ++++++++++++++++++++------------- ydb/core/cms/cluster_info.h | 3 ++- ydb/core/cms/cms.cpp | 10 ++------- ydb/core/cms/info_collector.cpp | 1 + 4 files changed, 28 insertions(+), 23 deletions(-) diff --git a/ydb/core/cms/cluster_info.cpp b/ydb/core/cms/cluster_info.cpp index ee69f20175fb..824647523dda 100644 --- a/ydb/core/cms/cluster_info.cpp +++ b/ydb/core/cms/cluster_info.cpp @@ -916,22 +916,31 @@ void TClusterInfo::MigrateOldInfo(TClusterInfoPtr old) void TClusterInfo::ApplyStateStorageInfo(TIntrusiveConstPtr info) { StateStorageInfoReceived = true; Y_ABORT_UNLESS(info->RingGroups.size() > 0); - auto& groupInfo = info->RingGroups[0]; - for (ui32 ringId = 0; ringId < groupInfo.Rings.size(); ++ringId) { - auto &ring = groupInfo.Rings[ringId]; - TStateStorageRingInfoPtr ringInfo = MakeIntrusive(); - ringInfo->RingId = ringId; - if (ring.IsDisabled) - ringInfo->SetDisabled(); - - for(auto replica : ring.Replicas) { - CheckNodeExistenceWithVerify(replica.NodeId()); - ringInfo->AddNode(Nodes[replica.NodeId()]); - StateStorageReplicas.insert(replica.NodeId()); - StateStorageNodeToRingId[replica.NodeId()] = ringId; + const ui64 rGroupSize = IsBridgeMode ? 1 : info->RingGroups.size(); + StateStorageRings.resize(rGroupSize); + + for (ui64 rGroupId = 0; rGroupId < rGroupSize; ++rGroupId) { + // if not in bridge mode, then we don't need to add rings from extra groups + if (!IsBridgeMode && rGroupId > 0) { + break; } + auto& groupInfo = info->RingGroups[rGroupId]; + for (ui32 ringId = 0; ringId < groupInfo.Rings.size(); ++ringId) { + auto &ring = groupInfo.Rings[ringId]; + TStateStorageRingInfoPtr ringInfo = MakeIntrusive(); + ringInfo->RingId = ringId; + if (ring.IsDisabled) + ringInfo->SetDisabled(); + + for(auto replica : ring.Replicas) { + CheckNodeExistenceWithVerify(replica.NodeId()); + ringInfo->AddNode(Nodes[replica.NodeId()]); + StateStorageReplicas.insert(replica.NodeId()); + StateStorageNodeToRingId[replica.NodeId()] = ringId; + } - StateStorageRings.push_back(ringInfo); + StateStorageRings[rGroupId].push_back(ringInfo); + } } } diff --git a/ydb/core/cms/cluster_info.h b/ydb/core/cms/cluster_info.h index 74c069a42835..e5a53166c6c6 100644 --- a/ydb/core/cms/cluster_info.h +++ b/ydb/core/cms/cluster_info.h @@ -1043,10 +1043,11 @@ class TClusterInfo : public TThrRefBase { THashMap> SysNodesCheckers; TIntrusiveConstPtr StateStorageInfo; - TVector StateStorageRings; + TVector> StateStorageRings; std::vector Piles; THashMap NodeIdToPileId; + bool IsBridgeMode = false; }; inline bool ActionRequiresHost(NKikimrCms::TAction::EType type) { diff --git a/ydb/core/cms/cms.cpp b/ydb/core/cms/cms.cpp index 412596cdf7d2..7d4ebaf6ad6f 100644 --- a/ydb/core/cms/cms.cpp +++ b/ydb/core/cms/cms.cpp @@ -716,7 +716,7 @@ bool TCms::TryToLockStateStorageReplica(const TAction& action, Y_ABORT_UNLESS(ClusterInfo->StateStorageInfo->RingGroups.size() > 0); // If the cluster is not in Bridge Mode, then pileId = 0 - const ui32 pileId = ClusterInfo->NodeIdToPileId[node.NodeId]; + const ui32 pileId = node.PileId; const ui32 nToSelect = ClusterInfo->StateStorageInfo->RingGroups[pileId].NToSelect; const ui32 currentRing = ClusterInfo->GetRingId(node.NodeId); ui8 currentRingState = TStateStorageRingInfo::Unknown; @@ -725,13 +725,7 @@ bool TCms::TryToLockStateStorageReplica(const TAction& action, ui32 disabledRings = 0; auto now = AppData(ctx)->TimeProvider->Now(); TDuration duration = TDuration::MicroSeconds(action.GetDuration()) + opts.PermissionDuration; - for (auto ringInfo : ClusterInfo->StateStorageRings) { - Y_ABORT_UNLESS(ringInfo->Replicas.size() > 0); - // If the cluster is not in Bridge Mode, then it is always false - if (ClusterInfo->NodeIdToPileId[ringInfo->Replicas[0]->NodeId] != pileId) { - continue; - } - + for (auto ringInfo : ClusterInfo->StateStorageRings[node.PileId]) { auto state = ringInfo->CountState(now, State->Config.DefaultRetryTime, duration); LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::CMS, "Ring: " << ringInfo->RingId << "; State: " << TStateStorageRingInfo::RingStateToString(state)); diff --git a/ydb/core/cms/info_collector.cpp b/ydb/core/cms/info_collector.cpp index ec5bdb5b5eb1..1ffde78eaa47 100644 --- a/ydb/core/cms/info_collector.cpp +++ b/ydb/core/cms/info_collector.cpp @@ -227,6 +227,7 @@ void TInfoCollector::Handle(TEvInterconnect::TEvNodesInfo::TPtr& ev) { RequestBridgeInfo(); const auto& pileMap = ev->Get()->PileMap; + Info->IsBridgeMode = !!pileMap; Info->NodeIdToPileId = FlipPileMap(pileMap); for (const auto& node : ev->Get()->Nodes) { Info->AddNode(node, &TlsActivationContext->AsActorContext()); From be86bc2e061f11e8554949e7bcdc4e9b9e692267 Mon Sep 17 00:00:00 2001 From: st-shchetinin Date: Fri, 27 Jun 2025 14:38:18 +0300 Subject: [PATCH 3/6] fix rGroupSize --- ydb/core/cms/cluster_info.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/ydb/core/cms/cluster_info.cpp b/ydb/core/cms/cluster_info.cpp index 824647523dda..f82ad598994c 100644 --- a/ydb/core/cms/cluster_info.cpp +++ b/ydb/core/cms/cluster_info.cpp @@ -916,14 +916,10 @@ void TClusterInfo::MigrateOldInfo(TClusterInfoPtr old) void TClusterInfo::ApplyStateStorageInfo(TIntrusiveConstPtr info) { StateStorageInfoReceived = true; Y_ABORT_UNLESS(info->RingGroups.size() > 0); - const ui64 rGroupSize = IsBridgeMode ? 1 : info->RingGroups.size(); + const ui64 rGroupSize = IsBridgeMode ? info->RingGroups.size() : 1; StateStorageRings.resize(rGroupSize); for (ui64 rGroupId = 0; rGroupId < rGroupSize; ++rGroupId) { - // if not in bridge mode, then we don't need to add rings from extra groups - if (!IsBridgeMode && rGroupId > 0) { - break; - } auto& groupInfo = info->RingGroups[rGroupId]; for (ui32 ringId = 0; ringId < groupInfo.Rings.size(); ++ringId) { auto &ring = groupInfo.Rings[ringId]; From 1494f7c19e7e6b298c2a4127eed98e1e9b8b78b9 Mon Sep 17 00:00:00 2001 From: st-shchetinin Date: Fri, 27 Jun 2025 15:27:14 +0300 Subject: [PATCH 4/6] check IsBridgeMode for pileId --- ydb/core/cms/cms.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ydb/core/cms/cms.cpp b/ydb/core/cms/cms.cpp index 7d4ebaf6ad6f..66a9e7a432a0 100644 --- a/ydb/core/cms/cms.cpp +++ b/ydb/core/cms/cms.cpp @@ -715,8 +715,7 @@ bool TCms::TryToLockStateStorageReplica(const TAction& action, } Y_ABORT_UNLESS(ClusterInfo->StateStorageInfo->RingGroups.size() > 0); - // If the cluster is not in Bridge Mode, then pileId = 0 - const ui32 pileId = node.PileId; + const ui32 pileId = ClusterInfo->IsBridgeMode ? node.PileId : 0; const ui32 nToSelect = ClusterInfo->StateStorageInfo->RingGroups[pileId].NToSelect; const ui32 currentRing = ClusterInfo->GetRingId(node.NodeId); ui8 currentRingState = TStateStorageRingInfo::Unknown; @@ -725,7 +724,7 @@ bool TCms::TryToLockStateStorageReplica(const TAction& action, ui32 disabledRings = 0; auto now = AppData(ctx)->TimeProvider->Now(); TDuration duration = TDuration::MicroSeconds(action.GetDuration()) + opts.PermissionDuration; - for (auto ringInfo : ClusterInfo->StateStorageRings[node.PileId]) { + for (auto ringInfo : ClusterInfo->StateStorageRings[pileId]) { auto state = ringInfo->CountState(now, State->Config.DefaultRetryTime, duration); LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::CMS, "Ring: " << ringInfo->RingId << "; State: " << TStateStorageRingInfo::RingStateToString(state)); From 43982c67f7b1324fc74102935d81ac19405cef3e Mon Sep 17 00:00:00 2001 From: st-shchetinin Date: Fri, 27 Jun 2025 15:27:56 +0300 Subject: [PATCH 5/6] copilot fix --- ydb/core/cms/info_collector.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ydb/core/cms/info_collector.cpp b/ydb/core/cms/info_collector.cpp index 1ffde78eaa47..4074191e26e2 100644 --- a/ydb/core/cms/info_collector.cpp +++ b/ydb/core/cms/info_collector.cpp @@ -227,7 +227,7 @@ void TInfoCollector::Handle(TEvInterconnect::TEvNodesInfo::TPtr& ev) { RequestBridgeInfo(); const auto& pileMap = ev->Get()->PileMap; - Info->IsBridgeMode = !!pileMap; + Info->IsBridgeMode = static_cast(pileMap); Info->NodeIdToPileId = FlipPileMap(pileMap); for (const auto& node : ev->Get()->Nodes) { Info->AddNode(node, &TlsActivationContext->AsActorContext()); From 52e1ebeab1bfaf851197192a719591a75e210bf7 Mon Sep 17 00:00:00 2001 From: st-shchetinin Date: Fri, 27 Jun 2025 15:34:09 +0300 Subject: [PATCH 6/6] pileId -> ringGroupId --- ydb/core/cms/cms.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ydb/core/cms/cms.cpp b/ydb/core/cms/cms.cpp index 66a9e7a432a0..153bc5627ab1 100644 --- a/ydb/core/cms/cms.cpp +++ b/ydb/core/cms/cms.cpp @@ -715,8 +715,8 @@ bool TCms::TryToLockStateStorageReplica(const TAction& action, } Y_ABORT_UNLESS(ClusterInfo->StateStorageInfo->RingGroups.size() > 0); - const ui32 pileId = ClusterInfo->IsBridgeMode ? node.PileId : 0; - const ui32 nToSelect = ClusterInfo->StateStorageInfo->RingGroups[pileId].NToSelect; + const ui32 ringGroupId = ClusterInfo->IsBridgeMode ? node.PileId : 0; + const ui32 nToSelect = ClusterInfo->StateStorageInfo->RingGroups[ringGroupId].NToSelect; const ui32 currentRing = ClusterInfo->GetRingId(node.NodeId); ui8 currentRingState = TStateStorageRingInfo::Unknown; ui32 restartRings = 0; @@ -724,7 +724,7 @@ bool TCms::TryToLockStateStorageReplica(const TAction& action, ui32 disabledRings = 0; auto now = AppData(ctx)->TimeProvider->Now(); TDuration duration = TDuration::MicroSeconds(action.GetDuration()) + opts.PermissionDuration; - for (auto ringInfo : ClusterInfo->StateStorageRings[pileId]) { + for (auto ringInfo : ClusterInfo->StateStorageRings[ringGroupId]) { auto state = ringInfo->CountState(now, State->Config.DefaultRetryTime, duration); LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::CMS, "Ring: " << ringInfo->RingId << "; State: " << TStateStorageRingInfo::RingStateToString(state));