Skip to content

Commit

Permalink
[BACKPORT 2.14][#21371] docdb: Hidden split parent tablets showing as…
Browse files Browse the repository at this point in the history
… leaderless in master's leaderless tablet endpoint

Summary:
Original commit: 5f38333 / D33401
Customer Impact: Universe is showing the load balanced status as true, but upon checking replica info, it's discovered that there are leaderless tablets, which are actually hidden split parent tablets. If you have Point-In-Time Recovery (PITR) enabled, the tablets aren't actually deleted if you drop the table until the retention window ends; they are just put into a hidden state.

This issue can also occur in non-PITR scenarios where the split parent tablets are left undeleted for a long time (>120s). This can happen if one of the split children failed to register.

Cause: When the tablet is marked as TABLET_DATA_SPLIT_COMPLETED, metrics for this tablet are skipped from being sent to the master. Consequently, the leader lease is not updated on the master.

The fix is to also report metrics for tablets with state TABLET_DATA_SPLIT_COMPLETED.
Jira: DB-10268

Test Plan: MasterPathHandlersItest.TestHiddenSplitParentTablet

Reviewers: asrivastava, zdrudi

Reviewed By: asrivastava

Subscribers: bogdan, qhu, ybase

Tags: #jenkins-ready

Differential Revision: https://phorge.dev.yugabyte.com/D33716
  • Loading branch information
Huqicheng committed Apr 29, 2024
1 parent d2f6d93 commit fee5dbf
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 2 deletions.
49 changes: 49 additions & 0 deletions src/yb/integration-tests/master_path_handlers-itest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,10 @@ DECLARE_int32(follower_unavailable_considered_failed_sec);

DECLARE_int32(cleanup_split_tablets_interval_sec);
DECLARE_int32(catalog_manager_bg_task_wait_ms);
DECLARE_int32(tserver_heartbeat_metrics_interval_ms);
DECLARE_bool(enable_automatic_tablet_splitting);
DECLARE_int32(leaderless_tablet_alert_delay_secs);
DECLARE_bool(TEST_skip_deleting_split_tablets);

namespace yb {
namespace master {
Expand Down Expand Up @@ -323,6 +325,53 @@ TEST_F_EX(MasterPathHandlersItest, Forward, MultiMasterPathHandlersItest) {
}
}

class TabletSplitMasterPathHandlersItest : public MasterPathHandlersItest {
public:
void SetUp() override {
ANNOTATE_UNPROTECTED_WRITE(FLAGS_cleanup_split_tablets_interval_sec) = 1;
ANNOTATE_UNPROTECTED_WRITE(FLAGS_enable_automatic_tablet_splitting) = false;
ANNOTATE_UNPROTECTED_WRITE(FLAGS_tserver_heartbeat_metrics_interval_ms) = 1000;
MasterPathHandlersItest::SetUp();
}

void InsertRows(const client::TableHandle& table, int num_rows_to_insert) {
auto session = client_->NewSession();
session->SetTimeout(60s);
for (int i = 0; i < num_rows_to_insert; i++) {
auto insert = table.NewInsertOp();
auto req = insert->mutable_request();
QLAddInt32HashValue(req, i);
ASSERT_OK(session->TEST_ApplyAndFlush(insert));
}
}
};

// Undeleted split parent tablets shouldn't be shown as leaderless.
TEST_F_EX(
MasterPathHandlersItest, TestUndeletedParentTablet, TabletSplitMasterPathHandlersItest) {
const auto kLeaderlessTabletAlertDelaySecs = 5;
ANNOTATE_UNPROTECTED_WRITE(FLAGS_leaderless_tablet_alert_delay_secs) =
kLeaderlessTabletAlertDelaySecs;
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_skip_deleting_split_tablets) = true;

CreateTestTable(1 /* num_tablets */);

client::TableHandle table;
ASSERT_OK(table.Open(table_name, client_.get()));
InsertRows(table, /* num_rows_to_insert = */ 500);

auto& catalog_manager = ASSERT_RESULT(cluster_->GetLeaderMiniMaster())->catalog_manager();
auto tablet = catalog_manager.GetTableInfo(table->id())->GetTablets()[0];

ASSERT_OK(yb_admin_client_->FlushTables(
{table_name}, false /* add_indexes */, 30 /* timeout_secs */, false /* is_compaction */));
ASSERT_OK(catalog_manager.TEST_SplitTablet(tablet, 1 /* split_hash_code */));

SleepFor(kLeaderlessTabletAlertDelaySecs * 1s);
string result = GetLeaderlessTabletsString();
ASSERT_EQ(result.find(tablet->id()), string::npos);
}

class MasterPathHandlersExternalItest : public MasterPathHandlersBaseItest<ExternalMiniCluster> {
public:
void InitCluster() override {
Expand Down
3 changes: 1 addition & 2 deletions src/yb/tserver/tserver_metrics_heartbeat_data_provider.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,7 @@ void TServerMetricsHeartbeatDataProvider::DoAddData(
uncompressed_file_sizes += sizes.second;
num_files += tablet->GetCurrentVersionNumSSTFiles();
if (should_add_tablet_data && tablet_peer->log_available() &&
tablet_peer->tablet_metadata()->tablet_data_state() ==
tablet::TabletDataState::TABLET_DATA_READY) {
CanServeTabletData(tablet_peer->tablet_metadata()->tablet_data_state())) {
auto storage_metadata = req->add_storage_metadata();
storage_metadata->set_tablet_id(tablet_peer->tablet_id());
storage_metadata->set_sst_file_size(sizes.first);
Expand Down

0 comments on commit fee5dbf

Please sign in to comment.