Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

limit the number of connect attempts for cached tablet in volume proxy #799

Merged
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cloud/blockstore/config/storage.proto
Original file line number Diff line number Diff line change
Expand Up @@ -948,4 +948,8 @@ message TStorageServiceConfig
// Timeout for attempts to acquire the shadow disk when writes to the source
// disk are not blocked (in ms).
optional uint32 MaxAcquireShadowDiskTotalTimeoutWhenNonBlocked = 358;

// Duration (in ms) of attempts to connect to the cached tablet before
// falling back to describe volume.
yegorskii marked this conversation as resolved.
Show resolved Hide resolved
optional uint32 VolumeProxyCacheRetryDuration = 359;
}
2 changes: 2 additions & 0 deletions cloud/blockstore/libs/storage/core/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,8 @@ TDuration MSeconds(ui32 value)
\
xxx(UnconfirmedBlobCountHardLimit, ui32, 1000 )\
\
xxx(VolumeProxyCacheRetryDuration, TDuration, Seconds(15) )\
\
xxx(MaxShadowDiskFillBandwidth, ui32, 500 )\
xxx(MinAcquireShadowDiskRetryDelayWhenBlocked, TDuration, MSeconds(250) )\
xxx(MaxAcquireShadowDiskRetryDelayWhenBlocked, TDuration, Seconds(1) )\
Expand Down
2 changes: 2 additions & 0 deletions cloud/blockstore/libs/storage/core/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,8 @@ class TStorageConfig
TDuration GetMaxAcquireShadowDiskRetryDelayWhenNonBlocked() const;
TDuration GetMaxAcquireShadowDiskTotalTimeoutWhenBlocked() const;
TDuration GetMaxAcquireShadowDiskTotalTimeoutWhenNonBlocked() const;

TDuration GetVolumeProxyCacheRetryDuration() const;
};

ui64 GetAllocationUnit(
Expand Down
37 changes: 30 additions & 7 deletions cloud/blockstore/libs/storage/volume_proxy/volume_proxy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ class TVolumeProxyActor final
, RefCount(refCount)
{}

TInstant DisconnectTs;
ui64 TabletId = 0;
int RefCount = 0;
};
Expand Down Expand Up @@ -488,11 +489,23 @@ void TVolumeProxyActor::HandleConnect(
msg->TabletId,
FormatError(error).data());

if (auto it = BaseDiskIdToTabletId.find(conn->DiskId);
it != BaseDiskIdToTabletId.end() && !it->second.DisconnectTs)
{
it->second.DisconnectTs = ctx.Now();
}

CancelActiveRequests(ctx, *conn);
DestroyConnection(ctx, *conn, error);
return;
}

if (auto it = BaseDiskIdToTabletId.find(conn->DiskId);
it != BaseDiskIdToTabletId.end())
{
it->second.DisconnectTs = {};
}

if (conn->State == FAILED) {
// Tablet recovered
conn->State = STARTED;
Expand Down Expand Up @@ -552,6 +565,12 @@ void TVolumeProxyActor::HandleDescribeResponse(
*conn,
volumeDescr.GetVolumeTabletId(),
msg->Path);

if (auto it = BaseDiskIdToTabletId.find(conn->DiskId);
it != BaseDiskIdToTabletId.end())
{
it->second.DisconnectTs = {};
}
}

template <typename TMethod>
Expand Down Expand Up @@ -585,13 +604,17 @@ void TVolumeProxyActor::HandleRequest(
{
auto itr = BaseDiskIdToTabletId.find(diskId);
if (itr != BaseDiskIdToTabletId.end()) {
PostponeRequest(ctx, conn, IEventHandlePtr(ev.Release()));
StartConnection(
ctx,
conn,
itr->second.TabletId,
"PartitionConfig");
break;
auto deadline =
itr->second.DisconnectTs + Config->GetVolumeProxyCacheRetryDuration();
if (!itr->second.DisconnectTs || deadline > ctx.Now()) {
PostponeRequest(ctx, conn, IEventHandlePtr(ev.Release()));
StartConnection(
ctx,
conn,
itr->second.TabletId,
"PartitionConfig");
break;
}
}

conn.State = RESOLVING;
Expand Down
147 changes: 147 additions & 0 deletions cloud/blockstore/libs/storage/volume_proxy/volume_proxy_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,153 @@ Y_UNIT_TEST_SUITE(TVolumeProxyTest)
RebootTablet(runtime, volumeTabletId, service1.GetSender(), nodeIdx1);
UNIT_ASSERT_VALUES_EQUAL(2, disconnections);
}

// Checks the fallback from a cached tablet id to describe volume:
//  1. a base-disk -> tablet mapping is sent to the volume proxy;
//  2. the volume is destroyed, so connects to the cached tablet fail;
//  3. before VolumeProxyCacheRetryDuration elapses a request is answered
//     with E_REJECTED;
//  4. after the duration elapses the request is answered with the
//     schemeshard "path does not exist" status instead.
Y_UNIT_TEST(ShouldRunDescribeForCachedTabletsIfNumberOfConnFailExceedsThreshold)
{
    // Retry window in milliseconds; the same value is used below to advance
    // the runtime clock.
    constexpr ui32 timeout = 3'000;

    NProto::TStorageServiceConfig config;
    config.SetVolumeProxyCacheRetryDuration(timeout);

    TTestEnv env;
    ui32 nodeIdx = SetupTestEnv(env, std::move(config));

    auto& runtime = env.GetRuntime();
    TServiceClient service(runtime, nodeIdx);

    service.CreateVolume();
    service.WaitForVolume();

    // Zero-initialized so that a missed describe response is detected by the
    // assertion below instead of reading an indeterminate value.
    ui64 volumeTabletId = 0;
    runtime.SetEventFilter([&] (auto& runtime, auto& event) {
            Y_UNUSED(runtime);
            switch (event->GetTypeRewrite()) {
                case TEvSSProxy::EvDescribeVolumeResponse: {
                    // Remember the tablet id reported by the describe response.
                    auto* msg = event->template Get<TEvSSProxy::TEvDescribeVolumeResponse>();
                    const auto& volumeDescription =
                        msg->PathDescription.GetBlockStoreVolumeDescription();
                    volumeTabletId = volumeDescription.GetVolumeTabletId();
                    break;
                }
            }
            // Never drop the event, only observe it.
            return false;
        }
    );
    service.DescribeVolume();
    UNIT_ASSERT(volumeTabletId != 0);

    // Hand the disk id -> tablet id mapping to the volume proxy.
    service.SendRequest(
        MakeVolumeProxyServiceId(),
        std::make_unique<TEvVolume::TEvMapBaseDiskIdToTabletId>(
            DefaultDiskId,
            volumeTabletId));

    service.DestroyVolume();

    // Wait until a tablet pipe client is reported as destroyed.
    TDispatchOptions options;
    options.FinalEvents.emplace_back(TEvTabletPipe::EvClientDestroyed);
    runtime.DispatchEvents(options);

    {
        // Still inside the retry window: the request is rejected.
        service.SendStatVolumeRequest();
        auto response = service.RecvStatVolumeResponse();
        UNIT_ASSERT_VALUES_EQUAL(E_REJECTED, response->GetStatus());
    }

    runtime.AdvanceCurrentTime(TDuration::MilliSeconds(timeout));

    {
        // Retry window is over: the error now comes from schemeshard.
        service.SendStatVolumeRequest();
        auto response = service.RecvStatVolumeResponse();
        auto code = response->GetStatus();
        UNIT_ASSERT_VALUES_EQUAL(
            FACILITY_SCHEMESHARD,
            FACILITY_FROM_CODE(code));
        UNIT_ASSERT_VALUES_EQUAL(
            NKikimrScheme::StatusPathDoesNotExist,
            static_cast<NKikimrScheme::EStatus>(STATUS_FROM_CODE(code)));
    }
}

// Checks that a temporary connect failure to a cached tablet does not make
// the volume proxy fall back to describe volume:
//  1. a base-disk -> tablet mapping is sent to the volume proxy and the
//     tablet is rebooted;
//  2. connect notifications for that tablet are rewritten to ERROR, so the
//     next request is answered with E_REJECTED;
//  3. while connects are being failed, a describe response addressed to the
//     proxy fails the test;
//  4. once connects are allowed again, requests succeed.
Y_UNIT_TEST(ShouldResetFailCounterIfDisconnectedCachedVolumeIsOnlineAgain)
{
    NProto::TStorageServiceConfig config;
    config.SetVolumeProxyCacheRetryDuration(3'000);

    TTestEnv env;
    ui32 nodeIdx = SetupTestEnv(env, std::move(config));

    auto& runtime = env.GetRuntime();
    TServiceClient service(runtime, nodeIdx);

    service.CreateVolume();
    service.WaitForVolume();

    // Zero-initialized so that a missed describe response is detected by the
    // assertion below instead of reading an indeterminate value.
    ui64 volumeTabletId = 0;
    runtime.SetEventFilter([&] (auto& runtime, auto& event) {
            Y_UNUSED(runtime);
            switch (event->GetTypeRewrite()) {
                case TEvSSProxy::EvDescribeVolumeResponse: {
                    // Remember the tablet id reported by the describe response.
                    auto* msg = event->template Get<TEvSSProxy::TEvDescribeVolumeResponse>();
                    const auto& volumeDescription =
                        msg->PathDescription.GetBlockStoreVolumeDescription();
                    volumeTabletId = volumeDescription.GetVolumeTabletId();
                    break;
                }
            }
            // Never drop the event, only observe it.
            return false;
        }
    );
    service.DescribeVolume();
    UNIT_ASSERT(volumeTabletId != 0);

    // Hand the disk id -> tablet id mapping to the volume proxy.
    service.SendRequest(
        MakeVolumeProxyServiceId(),
        std::make_unique<TEvVolume::TEvMapBaseDiskIdToTabletId>(
            DefaultDiskId,
            volumeTabletId));

    service.StatVolume();

    RebootTablet(runtime, volumeTabletId, service.GetSender(), nodeIdx);

    // Recipient of the connect notification for the volume tablet; filled in
    // by the filter below.
    TActorId proxy;
    bool failConnects = true;
    runtime.SetEventFilter([&] (auto& runtime, auto& event) {
            Y_UNUSED(runtime);
            switch (event->GetTypeRewrite()) {
                case TEvTabletPipe::EvClientConnected: {
                    auto* msg = event->template Get<TEvTabletPipe::TEvClientConnected>();
                    if (msg->TabletId == volumeTabletId) {
                        proxy = event->Recipient;
                        if (failConnects) {
                            // Status is const in the event, so it is patched
                            // in place to simulate a failed connect.
                            auto& code =
                                const_cast<NKikimrProto::EReplyStatus&>(msg->Status);
                            code = NKikimrProto::ERROR;
                        }
                    }
                    break;
                }
                case TEvSSProxy::EvDescribeVolumeResponse: {
                    // No describe response may reach the proxy while connects
                    // are being failed.
                    if (failConnects && event->Recipient == proxy) {
                        UNIT_ASSERT(false);
                    }
                    break;
                }
            }
            // Never drop the event.
            return false;
        }
    );

    {
        // The connect is forced to fail: the request is rejected.
        service.SendStatVolumeRequest();
        auto response = service.RecvStatVolumeResponse();
        UNIT_ASSERT_VALUES_EQUAL(E_REJECTED, response->GetStatus());
    }

    failConnects = false;

    // Connects are no longer failed, so both requests must succeed.
    service.StatVolume();
    service.StatVolume();
}
}

} // namespace NCloud::NBlockStore::NStorage
Loading