Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

limit the number of connect attempts for cached tablet in volume proxy #799

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cloud/blockstore/config/storage.proto
Original file line number Diff line number Diff line change
Expand Up @@ -948,4 +948,8 @@ message TStorageServiceConfig
// Timeout for attempts to acquire the shadow disk when writes to the source
// disk are not blocked (in ms).
optional uint32 MaxAcquireShadowDiskTotalTimeoutWhenNonBlocked = 358;

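// Maximum number of failed attempts to connect to a cached tablet (the counter
// is reset on a successful connect) before the volume proxy stops using the
// cached tablet id and falls back to describe volume.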
optional uint32 VolumeProxyCacheRetryLimit = 359;
yegorskii marked this conversation as resolved.
Show resolved Hide resolved
}
1 change: 1 addition & 0 deletions cloud/blockstore/libs/storage/core/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,7 @@ TDuration MSeconds(ui32 value)
xxx(MaxAcquireShadowDiskRetryDelayWhenNonBlocked, TDuration, Seconds(10) )\
xxx(MaxAcquireShadowDiskTotalTimeoutWhenBlocked, TDuration, Seconds(5) )\
xxx(MaxAcquireShadowDiskTotalTimeoutWhenNonBlocked, TDuration, Seconds(600) )\
xxx(VolumeProxyCacheRetryLimit, ui32, 8 )\
yegorskii marked this conversation as resolved.
Show resolved Hide resolved

// BLOCKSTORE_STORAGE_CONFIG_RW

Expand Down
2 changes: 2 additions & 0 deletions cloud/blockstore/libs/storage/core/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,8 @@ class TStorageConfig
TDuration GetMaxAcquireShadowDiskRetryDelayWhenNonBlocked() const;
TDuration GetMaxAcquireShadowDiskTotalTimeoutWhenBlocked() const;
TDuration GetMaxAcquireShadowDiskTotalTimeoutWhenNonBlocked() const;

ui32 GetVolumeProxyCacheRetryLimit() const;
};

ui64 GetAllocationUnit(
Expand Down
36 changes: 29 additions & 7 deletions cloud/blockstore/libs/storage/volume_proxy/volume_proxy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ class TVolumeProxyActor final
, RefCount(refCount)
{}

ui32 ConnectFailures = 0;
ui64 TabletId = 0;
int RefCount = 0;
};
Expand Down Expand Up @@ -488,11 +489,23 @@ void TVolumeProxyActor::HandleConnect(
msg->TabletId,
FormatError(error).data());

if (auto it = BaseDiskIdToTabletId.find(conn->DiskId);
it != BaseDiskIdToTabletId.end())
{
it->second.ConnectFailures++;
}

CancelActiveRequests(ctx, *conn);
DestroyConnection(ctx, *conn, error);
return;
}

if (auto it = BaseDiskIdToTabletId.find(conn->DiskId);
it != BaseDiskIdToTabletId.end())
{
it->second.ConnectFailures = 0;
}

if (conn->State == FAILED) {
// Tablet recovered
conn->State = STARTED;
Expand Down Expand Up @@ -552,6 +565,12 @@ void TVolumeProxyActor::HandleDescribeResponse(
*conn,
volumeDescr.GetVolumeTabletId(),
msg->Path);

if (auto it = BaseDiskIdToTabletId.find(conn->DiskId);
it != BaseDiskIdToTabletId.end())
{
it->second.ConnectFailures = 0;
}
}

template <typename TMethod>
Expand Down Expand Up @@ -585,13 +604,16 @@ void TVolumeProxyActor::HandleRequest(
{
auto itr = BaseDiskIdToTabletId.find(diskId);
if (itr != BaseDiskIdToTabletId.end()) {
PostponeRequest(ctx, conn, IEventHandlePtr(ev.Release()));
StartConnection(
ctx,
conn,
itr->second.TabletId,
"PartitionConfig");
break;
if (itr->second.ConnectFailures < Config->GetVolumeProxyCacheRetryLimit())
{
PostponeRequest(ctx, conn, IEventHandlePtr(ev.Release()));
StartConnection(
ctx,
conn,
itr->second.TabletId,
"PartitionConfig");
break;
}
}

conn.State = RESOLVING;
Expand Down
161 changes: 161 additions & 0 deletions cloud/blockstore/libs/storage/volume_proxy/volume_proxy_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,167 @@ Y_UNIT_TEST_SUITE(TVolumeProxyTest)
RebootTablet(runtime, volumeTabletId, service1.GetSender(), nodeIdx1);
UNIT_ASSERT_VALUES_EQUAL(2, disconnections);
}

// Checks that the volume proxy stops reusing a cached tablet id once the number
// of failed connects reaches VolumeProxyCacheRetryLimit: the next request is
// expected to fail with a SchemeShard "path does not exist" status instead of
// E_REJECTED.
Y_UNIT_TEST(ShouldRunDescribeForCachedTabletsIfNumberOfConnFailExceedsThreshold)
{
    // Single source of truth for both the config value and the number of
    // requests that are expected to be rejected below.
    constexpr ui32 retryLimit = 3;

    NProto::TStorageServiceConfig config;
    config.SetVolumeProxyCacheRetryLimit(retryLimit);

    TTestEnv env;
    ui32 nodeIdx = SetupTestEnv(env, std::move(config));

    auto& runtime = env.GetRuntime();
    TServiceClient service(runtime, nodeIdx);

    service.CreateVolume();
    service.WaitForVolume();

    // Zero-initialized because the value is only assigned inside the event
    // filter; without the initializer a missed describe response would make
    // the read below an uninitialized read.
    ui64 volumeTabletId = 0;
    runtime.SetEventFilter([&] (auto& runtime, auto& event) {
            Y_UNUSED(runtime);
            switch (event->GetTypeRewrite()) {
                case TEvSSProxy::EvDescribeVolumeResponse: {
                    // Remember the tablet id reported by describe volume.
                    auto* msg = event->template Get<TEvSSProxy::TEvDescribeVolumeResponse>();
                    const auto& volumeDescription =
                        msg->PathDescription.GetBlockStoreVolumeDescription();
                    volumeTabletId = volumeDescription.GetVolumeTabletId();
                    break;
                }
            }
            // Never drop the event, only observe it.
            return false;
        }
    );
    service.DescribeVolume();
    // NOTE(review): assumes a real volume tablet id is never 0 - confirm.
    UNIT_ASSERT(volumeTabletId != 0);

    // Register the disk id -> tablet id mapping in the volume proxy so that
    // subsequent requests go through the cached tablet id.
    service.SendRequest(
        MakeVolumeProxyServiceId(),
        std::make_unique<TEvVolume::TEvMapBaseDiskIdToTabletId>(
            DefaultDiskId,
            volumeTabletId));

    service.DestroyVolume();

    // Wait until the pipe to the destroyed volume is torn down.
    TDispatchOptions options;
    options.FinalEvents.emplace_back(TEvTabletPipe::EvClientDestroyed);
    runtime.DispatchEvents(options);

    // While the failure counter is below the limit every request is expected
    // to be rejected.
    for (ui32 attempt = 0; attempt < retryLimit; ++attempt) {
        service.SendStatVolumeRequest();
        auto response = service.RecvStatVolumeResponse();
        UNIT_ASSERT_VALUES_EQUAL(E_REJECTED, response->GetStatus());
    }

    // The limit is reached: the request must now report that the path no
    // longer exists in SchemeShard.
    {
        service.SendStatVolumeRequest();
        auto response = service.RecvStatVolumeResponse();
        auto code = response->GetStatus();
        UNIT_ASSERT_VALUES_EQUAL(
            FACILITY_SCHEMESHARD,
            FACILITY_FROM_CODE(code));
        UNIT_ASSERT_VALUES_EQUAL(
            NKikimrScheme::StatusPathDoesNotExist,
            static_cast<NKikimrScheme::EStatus>(STATUS_FROM_CODE(code)));
    }
}

// Checks that a number of failed connects to a cached tablet that stays below
// VolumeProxyCacheRetryLimit does not trigger describe volume, and that
// requests succeed again once connects stop failing.
Y_UNIT_TEST(ShouldResetFailCounterIfDisconnectedCachedVolumeIsOnlineAgain)
{
    constexpr ui32 retryLimit = 3;
    // Must stay below retryLimit so that the proxy keeps using the cached
    // tablet id for the whole failing phase.
    constexpr ui32 failedAttempts = 2;

    NProto::TStorageServiceConfig config;
    config.SetVolumeProxyCacheRetryLimit(retryLimit);

    TTestEnv env;
    ui32 nodeIdx = SetupTestEnv(env, std::move(config));

    auto& runtime = env.GetRuntime();
    TServiceClient service(runtime, nodeIdx);

    service.CreateVolume();
    service.WaitForVolume();

    // Zero-initialized because the value is only assigned inside the event
    // filter; without the initializer a missed describe response would make
    // the reads below uninitialized reads.
    ui64 volumeTabletId = 0;
    runtime.SetEventFilter([&] (auto& runtime, auto& event) {
            Y_UNUSED(runtime);
            switch (event->GetTypeRewrite()) {
                case TEvSSProxy::EvDescribeVolumeResponse: {
                    // Remember the tablet id reported by describe volume.
                    auto* msg = event->template Get<TEvSSProxy::TEvDescribeVolumeResponse>();
                    const auto& volumeDescription =
                        msg->PathDescription.GetBlockStoreVolumeDescription();
                    volumeTabletId = volumeDescription.GetVolumeTabletId();
                    break;
                }
            }
            // Never drop the event, only observe it.
            return false;
        }
    );
    service.DescribeVolume();
    // NOTE(review): assumes a real volume tablet id is never 0 - confirm.
    UNIT_ASSERT(volumeTabletId != 0);

    // Register the disk id -> tablet id mapping in the volume proxy so that
    // subsequent requests go through the cached tablet id.
    service.SendRequest(
        MakeVolumeProxyServiceId(),
        std::make_unique<TEvVolume::TEvMapBaseDiskIdToTabletId>(
            DefaultDiskId,
            volumeTabletId));

    service.StatVolume();

    RebootTablet(runtime, volumeTabletId, service.GetSender(), nodeIdx);

    TActorId proxy;
    bool failConnects = true;
    runtime.SetEventFilter([&] (auto& runtime, auto& event) {
            Y_UNUSED(runtime);
            switch (event->GetTypeRewrite()) {
                case TEvTabletPipe::EvClientConnected: {
                    auto* msg = event->template Get<TEvTabletPipe::TEvClientConnected>();
                    if (msg->TabletId == volumeTabletId) {
                        // The recipient of the connect notification is
                        // remembered as the proxy actor.
                        proxy = event->Recipient;
                        if (failConnects) {
                            // Simulate a failed connect by rewriting the
                            // status of the in-flight event in place.
                            auto& code =
                                const_cast<NKikimrProto::EReplyStatus&>(msg->Status);
                            code = NKikimrProto::ERROR;
                        }
                    }
                    break;
                }
                case TEvSSProxy::EvDescribeVolumeResponse: {
                    // While connects are failing (and the limit has not been
                    // reached) the proxy must not receive a describe response.
                    if (failConnects && event->Recipient == proxy) {
                        UNIT_ASSERT(false);
                    }
                    break;
                }
            }
            return false;
        }
    );

    // Every request issued while connects are failing is expected to be
    // rejected.
    for (ui32 attempt = 0; attempt < failedAttempts; ++attempt) {
        service.SendStatVolumeRequest();
        auto response = service.RecvStatVolumeResponse();
        UNIT_ASSERT_VALUES_EQUAL(E_REJECTED, response->GetStatus());
    }

    // Let connects succeed again; both requests below must complete
    // successfully.
    failConnects = false;

    service.StatVolume();
    service.StatVolume();
}
}

} // namespace NCloud::NBlockStore::NStorage