Commit

[BACKPORT 2.18][#22618,#22608] docdb: Fix DeleteTabletListAndSendRequests deadlock and deleting
xrepl tablets
Summary:
Original commit: aead171 / D35458

Updating retained_by_* maps in DeleteTabletListAndSendRequests before committing the tablet locks.
This ensures that if there are multiple calls to DeleteTabletListAndSendRequests (e.g. due to a few
CleanupSplitTablets calls), xrepl tablets are hidden only on the first call and ignored by later
calls.
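
Roughly, the reordering can be pictured with the sketch below (simplified stand-in types, not the
actual CatalogManager code): the retained_by_* maps are filled in while the per-tablet write locks
are still held, and the locks are committed only afterwards.

// Sketch only: types and members are simplified stand-ins for the real ones.
#include <map>
#include <mutex>
#include <string>
#include <utility>
#include <vector>

struct TabletLock {
  bool hidden = false;
  void Commit() {}  // releases the per-tablet write lock in the real code
};

std::mutex mutex_;                                 // catalog-wide map mutex
std::map<std::string, int> retained_by_xcluster_;  // tablet_id -> retention info

void HideTablets(std::vector<std::pair<std::string, TabletLock>>& tablets) {
  // 1. Record hidden tablets in the retained_by_* maps while the tablet write
  //    locks are still held, so a later caller that takes the same locks
  //    already sees the tablets as retained and skips them.
  {
    std::lock_guard<std::mutex> lock(mutex_);
    for (auto& [id, tablet_lock] : tablets) {
      if (tablet_lock.hidden) {
        retained_by_xcluster_.emplace(id, 0);
      }
    }
  }
  // 2. Only then commit (release) the tablet locks.
  for (auto& entry : tablets) {
    entry.second.Commit();
  }
}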

Also fixing the deadlock in DeleteTabletListAndSendRequests that can arise between mutex_ and the
tablet locks.
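
The deadlock itself is a plain lock-ordering inversion between the catalog-wide mutex_ and the
per-tablet write locks; an illustration (not the real code paths) is below. In the new version of
DeleteTabletListAndSendRequests, the tablet write locks are acquired before mutex_ is taken, and
mutex_ is held only in short, self-contained scopes that never block on a tablet lock.

// Illustration of the inversion only; the real code uses CatalogManager::mutex_
// and per-tablet copy-on-write locks rather than two std::mutex instances.
#include <mutex>

std::mutex mutex_;        // stand-in for the catalog-wide mutex
std::mutex tablet_mutex;  // stand-in for one tablet's write lock

void PathA() {
  std::lock_guard<std::mutex> l1(mutex_);        // takes mutex_ first...
  std::lock_guard<std::mutex> l2(tablet_mutex);  // ...then the tablet lock
}

void PathB() {
  std::lock_guard<std::mutex> l1(tablet_mutex);  // takes the tablet lock first...
  std::lock_guard<std::mutex> l2(mutex_);        // ...then mutex_: deadlocks against PathA
}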
Jira: DB-11526, DB-11517

Test Plan: ybd --cxx-test xcluster-tablet-split-itest --gtest_filter "CdcTabletSplitThreeMastersITest.TestRaceAfterHidingAndRetainingTablet"

Reviewers: xCluster, hsunder, skumar, vkushwaha

Reviewed By: hsunder

Tags: #jenkins-ready

Differential Revision: https://phorge.dev.yugabyte.com/D35541
hulien22 committed Jun 10, 2024
1 parent 305555b commit d072e37
Showing 2 changed files with 172 additions and 91 deletions.
82 changes: 78 additions & 4 deletions src/yb/integration-tests/xcluster-tablet-split-itest.cc
@@ -43,6 +43,7 @@
#include "yb/tserver/tablet_server.h"
#include "yb/util/backoff_waiter.h"
#include "yb/util/logging.h"
#include "yb/util/sync_point.h"
#include "yb/util/thread.h"
#include "yb/util/tostring.h"
#include "yb/util/tsan_util.h"
@@ -60,6 +61,7 @@ DECLARE_bool(enable_tablet_split_of_xcluster_bootstrapping_tables);
DECLARE_int32(cdc_state_checkpoint_update_interval_ms);
DECLARE_bool(enable_collect_cdc_metrics);
DECLARE_int32(update_metrics_interval_ms);
DECLARE_int32(cleanup_split_tablets_interval_sec);

DECLARE_bool(enable_automatic_tablet_splitting);
DECLARE_int64(tablet_split_low_phase_shard_count_per_node);
@@ -221,10 +223,8 @@ class CdcTabletSplitITest : public XClusterTabletSplitITestBase<TabletSplitITest
Result<std::unique_ptr<MiniCluster>> CreateNewUniverseAndTable(
const string& cluster_id, client::TableHandle* table) {
// First create the new cluster.
MiniClusterOptions opts;
opts.num_tablet_servers = 3;
opts.cluster_id = cluster_id;
std::unique_ptr<MiniCluster> cluster = std::make_unique<MiniCluster>(opts);
std::unique_ptr<MiniCluster> cluster =
std::make_unique<MiniCluster>(GetClusterOptions(cluster_id));
RETURN_NOT_OK(cluster->Start());
RETURN_NOT_OK(cluster->WaitForTabletServerCount(3));
auto cluster_client = VERIFY_RESULT(cluster->CreateClient());
@@ -238,6 +238,13 @@ class CdcTabletSplitITest : public XClusterTabletSplitITestBase<TabletSplitITest
return cluster;
}

MiniClusterOptions GetClusterOptions(const string& cluster_id) {
MiniClusterOptions opts;
opts.num_tablet_servers = 3;
opts.cluster_id = cluster_id;
return opts;
}

Status GetChangesWithRetries(
cdc::CDCServiceProxy* cdc_proxy, const cdc::GetChangesRequestPB& change_req,
cdc::GetChangesResponsePB* change_resp) {
@@ -330,6 +337,73 @@ TEST_F(CdcTabletSplitITest, GetChangesOnSplitParentTablet) {
ASSERT_TRUE(status.IsNotFound() || status.IsTabletSplit());
}

class CdcTabletSplitThreeMastersITest : public CdcTabletSplitITest {
protected:
MiniClusterOptions GetClusterOptions(const string& cluster_id) {
MiniClusterOptions opts;
opts.num_tablet_servers = 3;
opts.num_masters = 3;
opts.cluster_id = cluster_id;
return opts;
}
};

#ifndef NDEBUG
// Tests multiple CleanupSplitTablets trying to delete xRepl tablets at the same time. Ensures that
// the parent tablets are only hidden and not deleted even with multiple calls.
TEST_F(CdcTabletSplitThreeMastersITest, TestRaceAfterHidingAndRetainingTablet) {
const auto AddToMapsDelay = 10s;

ANNOTATE_UNPROTECTED_WRITE(FLAGS_cdc_state_checkpoint_update_interval_ms) = 0;
ANNOTATE_UNPROTECTED_WRITE(FLAGS_cleanup_split_tablets_interval_sec) = 1;

// Block any tablets from getting deleted until after the master stepdown.
yb::SyncPoint::GetInstance()->LoadDependency(
{{"CdcTabletSplitThreeMastersITest::TestRaceAfterHidingAndRetainingTablet::StepdownMaster",
"CatalogManager::DeleteTabletListAndSendRequests::Start"}});

// Add a delay before adding the retained_by_* maps.
yb::SyncPoint::GetInstance()->SetCallBack(
"CatalogManager::DeleteTabletListAndSendRequests::AddToMaps",
[AddToMapsDelay](void* _) { SleepFor(AddToMapsDelay); });
yb::SyncPoint::GetInstance()->EnableProcessing();

// Create a cdc stream for this tablet.
auto cdc_proxy = std::make_unique<cdc::CDCServiceProxy>(
&client_->proxy_cache(),
HostPort::FromBoundEndpoint(cluster_->mini_tablet_servers().front()->bound_rpc_addr()));
CDCStreamId stream_id;
cdc::CreateCDCStream(cdc_proxy, table_->id(), &stream_id);
// Ensure that the cdc_state table is ready before inserting rows and splitting.
ASSERT_OK(WaitForCdcStateTableToBeReady());

// Write some rows to the tablet.
const auto split_hash_code = ASSERT_RESULT(WriteRowsAndGetMiddleHashCode(kDefaultNumRows));
const auto source_tablet_id = ASSERT_RESULT(SplitTabletAndValidate(
split_hash_code, kDefaultNumRows, /* parent_tablet_protected_from_deletion */ true));

// Since we have the DeleteTabletListAndSendRequests::Start syncpoint, the parent tablet will not
// be hidden yet. The tserver should be repeatedly sending DeleteNotServingTablet requests.

// Perform a master failover by restarting the master.
ASSERT_OK(tools::RunAdminToolCommand(cluster_->GetMasterAddresses(), "master_leader_stepdown"));
TEST_SYNC_POINT(
"CdcTabletSplitThreeMastersITest::TestRaceAfterHidingAndRetainingTablet::StepdownMaster");

// Give some time for other delete threads to come in.
SleepFor(AddToMapsDelay * 2);

// Should still have 3 tablets in total.
ASSERT_EQ(ListTabletIdsForTable(cluster_.get(), table_->id()).size(), 3);
// But parent tablet should be hidden.
ASSERT_EQ(ListActiveTabletIdsForTable(cluster_.get(), table_->id()).size(), 2);

// Disable processing to allow for smooth shutdown.
yb::SyncPoint::GetInstance()->DisableProcessing();
ANNOTATE_UNPROTECTED_WRITE(FLAGS_cleanup_split_tablets_interval_sec) = 60;
}
#endif // NDEBUG

// For testing xCluster setups. Since most test utility functions expect there to be only one
// cluster, they implicitly use cluster_ / client_ / table_ everywhere. For this test, we default
// those to point to the producer cluster, but allow calls to SwitchToProducer/Consumer, to swap
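The new test drives the race through yb::SyncPoint: LoadDependency forces the
DeleteTabletListAndSendRequests::Start point to wait until the test has triggered the master
stepdown, and SetCallBack injects a delay before the retained_by_* maps are updated. For readers
unfamiliar with the pattern, a rough standalone illustration of such a predecessor/successor
dependency (using std::condition_variable instead of the real SyncPoint machinery; all names here
are illustrative) is:

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

// Toy stand-in for a sync-point dependency: the "successor" point blocks until
// the "predecessor" point has been reached, mirroring
// LoadDependency({{"...::StepdownMaster", "...::Start"}}) in the test.
std::mutex m;
std::condition_variable cv;
bool stepdown_reached = false;

void StepdownMasterPoint() {  // predecessor, reached by the test thread
  std::lock_guard<std::mutex> lock(m);
  stepdown_reached = true;
  cv.notify_all();
}

void DeleteTabletsStartPoint() {  // successor, reached by the delete path
  std::unique_lock<std::mutex> lock(m);
  cv.wait(lock, [] { return stepdown_reached; });  // blocks until stepdown happens
}

int main() {
  std::thread deleter([] {
    DeleteTabletsStartPoint();
    std::cout << "delete path proceeds only after stepdown\n";
  });
  std::thread test_thread([] { StepdownMasterPoint(); });
  test_thread.join();
  deleter.join();
  return 0;
}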
181 changes: 94 additions & 87 deletions src/yb/master/catalog_manager.cc
@@ -10823,102 +10823,92 @@ Status CatalogManager::DeleteTabletListAndSendRequests(
HideOnly hide_only;
};

std::vector<TabletInfoPtr> marked_as_hidden;
{
std::vector<TabletData> tablets_data;
tablets_data.reserve(tablets.size());
std::vector<TabletInfo*> tablet_infos;
tablet_infos.reserve(tablets_data.size());

// Grab tablets and tablet write locks. The list should already be in tablet_id sorted order.
{
SharedLock read_lock(mutex_);
for (const auto& tablet : tablets) {
tablets_data.push_back(TabletData {
.tablet = tablet,
.lock = tablet->LockForWrite(),
// Hide tablet if it is retained by snapshot schedule, or is part of a cdc stream.
.hide_only = HideOnly(!retained_by_snapshot_schedules.empty()),
});

if (!tablets_data.back().hide_only &&
(IsTableXClusterProducer(*tablet->table()) || IsTablePartOfCDCSDK(*tablet->table()))) {
// For xCluster and CDCSDK , the only time we try to delete a tablet that is part of an
// active stream is during tablet splitting, where we need to keep the parent tablet
// around until we have replicated its SPLIT_OP record.

// There are a few cases to handle here:
// 1. This tablet is not yet hidden, and so this is the first time we are hiding it. Hide
// the tablet and also add it to retained_by_xcluster_ or retained_by_cdcsdk_ so that
// it stays hidden.
// 2. The tablet is hidden and part of retained_by_xcluster_ or retained_by_cdcsdk_. Keep
// this tablet hidden.
// 3. This tablet is hidden but not part of retained_by_xcluster_ or retained_by_cdcsdk_.
// This means that the bg task DoProcessCDCParentTabletDeletion processed it and found
// that all streams for this tablet have replicated the split record. We can now delete
// it as long as it isn't being kept by a snapshot schedule.
if (!tablets_data.back().lock->ListedAsHidden() ||
retained_by_xcluster_.contains(tablet->id()) ||
retained_by_cdcsdk_.contains(tablet->id())) {
tablets_data.back().hide_only = HideOnly(true);
}
}

tablet_infos.emplace_back(tablet.get());
}
}
TEST_SYNC_POINT("CatalogManager::DeleteTabletListAndSendRequests::Start");

// Use the same hybrid time for all hidden tablets.
HybridTime hide_hybrid_time = master_->clock()->Now();
std::vector<TabletInfoPtr> marked_as_hidden;
std::vector<TabletData> tablets_data;
tablets_data.reserve(tablets.size());
std::vector<TabletInfo*> tablet_infos;
tablet_infos.reserve(tablets_data.size());

// Mark the tablets as deleted.
// Grab tablets and tablet write locks. The list should already be in tablet_id sorted order.
for (const auto& tablet : tablets) {
tablets_data.push_back(TabletData{
.tablet = tablet,
.lock = tablet->LockForWrite(),
// Hide tablet if it is retained by snapshot schedule.
.hide_only = HideOnly(!retained_by_snapshot_schedules.empty()),
});
tablet_infos.emplace_back(tablet.get());
}
{
// Also hide tablet if it is part of a cdc stream.

// For xCluster and CDCSDK , the only time we try to delete a tablet that is part of an
// active stream is during tablet splitting, where we need to keep the parent tablet
// around until we have replicated its SPLIT_OP record.

// There are a few cases to handle here:
// 1. This tablet is not yet hidden, and so this is the first time we are hiding it. Hide
// the tablet and also add it to retained_by_xcluster_ or retained_by_cdcsdk_ so that
// it stays hidden.
// 2. The tablet is hidden and part of retained_by_xcluster_ or retained_by_cdcsdk_. Keep
// this tablet hidden.
// 3. This tablet is hidden but not part of retained_by_xcluster_ or retained_by_cdcsdk_.
// This means that the bg task DoProcessCDCParentTabletDeletion processed it and found
// that all streams for this tablet have replicated the split record. We can now delete
// it as long as it isn't being kept by a snapshot schedule.
SharedLock read_lock(mutex_);
for (auto& tablet_data : tablets_data) {
auto& tablet = tablet_data.tablet;
auto& tablet_lock = tablet_data.lock;

bool was_hidden = tablet_lock->ListedAsHidden();
// Inactive tablet now, so remove it from partitions_.
// After all the tablets have been deleted from the tservers, we remove it from tablets_.
tablet->table()->RemoveTablet(tablet->id(), DeactivateOnly::kTrue);

if (tablet_data.hide_only) {
LOG(INFO) << "Hiding tablet " << tablet->tablet_id();
tablet_lock.mutable_data()->pb.set_hide_hybrid_time(hide_hybrid_time.ToUint64());
*tablet_lock.mutable_data()->pb.mutable_retained_by_snapshot_schedules() =
retained_by_snapshot_schedules;
} else {
LOG(INFO) << "Deleting tablet " << tablet->tablet_id();
if (!transaction_status_tablets) {
tablet_lock.mutable_data()->set_state(SysTabletsEntryPB::DELETED, deletion_msg);
}
continue;
}
if (tablet_lock->ListedAsHidden() && !was_hidden) {
marked_as_hidden.push_back(tablet);
if (IsTableXClusterProducer(*tablet_data.tablet->table()) ||
IsTablePartOfCDCSDK(*tablet_data.tablet->table())) {
if (!tablet_data.lock->ListedAsHidden() ||
retained_by_xcluster_.contains(tablet_data.tablet->id()) ||
retained_by_cdcsdk_.contains(tablet_data.tablet->id())) {
tablet_data.hide_only = HideOnly(true);
}
}
}
}

// Update all the tablet states in raft in bulk.
RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), tablet_infos));
// Use the same hybrid time for all hidden tablets.
HybridTime hide_hybrid_time = master_->clock()->Now();

// Commit the change.
for (auto& tablet_data : tablets_data) {
auto& tablet = tablet_data.tablet;
auto& tablet_lock = tablet_data.lock;
// Mark the tablets as deleted.
for (auto& tablet_data : tablets_data) {
auto& tablet = tablet_data.tablet;
auto& tablet_lock = tablet_data.lock;

tablet_lock.Commit();
LOG(INFO) << (tablet_data.hide_only ? "Hid" : "Deleted") << " tablet " << tablet->tablet_id();
bool was_hidden = tablet_lock->ListedAsHidden();
// Inactive tablet now, so remove it from partitions_.
// After all the tablets have been deleted from the tservers, we remove it from tablets_.
tablet->table()->RemoveTablet(tablet->id(), DeactivateOnly::kTrue);

if (transaction_status_tablets) {
auto leader = VERIFY_RESULT(tablet->GetLeader());
RETURN_NOT_OK(SendPrepareDeleteTransactionTabletRequest(
tablet, leader->permanent_uuid(), deletion_msg, tablet_data.hide_only));
} else {
DeleteTabletReplicas(tablet.get(), deletion_msg, tablet_data.hide_only, KeepData::kFalse);
if (tablet_data.hide_only) {
LOG(INFO) << "Hiding tablet " << tablet->tablet_id();
tablet_lock.mutable_data()->pb.set_hide_hybrid_time(hide_hybrid_time.ToUint64());
*tablet_lock.mutable_data()->pb.mutable_retained_by_snapshot_schedules() =
retained_by_snapshot_schedules;
} else {
LOG(INFO) << "Deleting tablet " << tablet->tablet_id();
if (!transaction_status_tablets) {
tablet_lock.mutable_data()->set_state(SysTabletsEntryPB::DELETED, deletion_msg);
}
}
if (tablet_lock->ListedAsHidden() && !was_hidden) {
marked_as_hidden.push_back(tablet);
}
}

// Update all the tablet states in raft in bulk.
RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), tablet_infos));

// Record any hidden tablets before releasing tablet locks.
if (!marked_as_hidden.empty()) {
TEST_SYNC_POINT("CatalogManager::DeleteTabletListAndSendRequests::AddToMaps");
LockGuard lock(mutex_);
hidden_tablets_.insert(hidden_tablets_.end(), marked_as_hidden.begin(), marked_as_hidden.end());
// Also keep track of all tablets that were hid due to xCluster.
@@ -10932,13 +10922,12 @@ Status CatalogManager::DeleteTabletListAndSendRequests(
// We are hiding this tablet for PITR and not for xCluster.
continue;
}
HiddenReplicationParentTabletInfo info {
.table_id_ = tablet->table()->id(),
.parent_tablet_id_ = tablet_lock->pb.has_split_parent_tablet_id()
? tablet_lock->pb.split_parent_tablet_id()
: "",
.split_tablets_ = {children.Get(0), children.Get(1)}
};
HiddenReplicationParentTabletInfo info{
.table_id_ = tablet->table()->id(),
.parent_tablet_id_ = tablet_lock->pb.has_split_parent_tablet_id()
? tablet_lock->pb.split_parent_tablet_id()
: "",
.split_tablets_ = {children.Get(0), children.Get(1)}};
if (is_table_cdc_producer) {
retained_by_xcluster_.emplace(tablet->id(), info);
}
Expand All @@ -10949,6 +10938,24 @@ Status CatalogManager::DeleteTabletListAndSendRequests(
}
}

// Commit the change.
for (auto& tablet_data : tablets_data) {
auto& tablet = tablet_data.tablet;
auto& tablet_lock = tablet_data.lock;

tablet_lock.Commit();

LOG(INFO) << (tablet_data.hide_only ? "Hid" : "Deleted") << " tablet " << tablet->tablet_id();

if (transaction_status_tablets) {
auto leader = VERIFY_RESULT(tablet->GetLeader());
RETURN_NOT_OK(SendPrepareDeleteTransactionTabletRequest(
tablet, leader->permanent_uuid(), deletion_msg, tablet_data.hide_only));
} else {
DeleteTabletReplicas(tablet.get(), deletion_msg, tablet_data.hide_only, KeepData::kFalse);
}
}

return Status::OK();
}

