Skip to content

Commit

Permalink
[BACKPORT 2.20][PLAT-9504][PLAT-11231] Check no tablets and ip not in…
Browse files Browse the repository at this point in the history
… master config on releaseInstance and before AnsibleDestroyServer subtask

Summary:
Original commit:

252ef97 / D30103

Add a new pre-check subtask to `ReleaseInstanceFromUniverse` and as a subtask right before any ansibleDestroyServer subtask called `CheckNodeSafeToDelete`.

`CheckNodeSafeToDelete` checks that there are no tserver tablets assigned to this node if applicable and there is no master on this node that is in the universe quorum. We base these checks off of the ip of the node.

We will fail the `ReleaseInstanceFromUniverse` task if this pre-check fails. In addition, we also fail if we are taking down the server and tserver still has tablets or the server's master process is still part of the quorum. This is used as an extra pre-caution but is a short-term fix until we have a cluster whitelist implemented from the db side.

Test Plan:
Create a 4 node rf3 universe (2 1 1 for the azs):

Perform a remove node -> release node for one of the nodes in the az that has 2 nodes. Make sure that on the release node, the task succeeds.

Test #2
Perform a remove node. Manually start up the tserver on the node and remove the blacklist using ybadmin command. Perform a release node from universe. Check that the release node task fails.

Test #3
Perform a remove node. Manually start up the master process on the node we just removed. Add the node back to the quorum using yb-admin. Perform a release node from the universe. Check that the release node task fails.

(Can also just change the node state of the node to `Removed` and then run the ReleaseNodeFromUniverse task)

Other testing:
- Check that we are able to do a full move, delete a universe.

Reviewers: sanketh, nsingh, yshchetinin

Reviewed By: nsingh

Subscribers: yugaware

Tags: #jenkins-ready

Differential Revision: https://phorge.dev.yugabyte.com/D33265
  • Loading branch information
charleswang234 committed Mar 18, 2024
1 parent d0def64 commit a36d3c5
Show file tree
Hide file tree
Showing 23 changed files with 371 additions and 122 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import com.yugabyte.yw.common.ConfigHelper;
import com.yugabyte.yw.common.ImageBundleUtil;
import com.yugabyte.yw.common.NodeManager;
import com.yugabyte.yw.common.NodeUIApiHelper;
import com.yugabyte.yw.common.PlatformExecutorFactory;
import com.yugabyte.yw.common.RestoreManagerYb;
import com.yugabyte.yw.common.ShellResponse;
Expand Down Expand Up @@ -83,6 +84,7 @@ public abstract class AbstractTaskBase implements ITask {
protected final BackupHelper backupHelper;
protected final AutoFlagUtil autoFlagUtil;
protected final ImageBundleUtil imageBundleUtil;
protected final NodeUIApiHelper nodeUIApiHelper;

@Inject
protected AbstractTaskBase(BaseTaskDependencies baseTaskDependencies) {
Expand All @@ -106,6 +108,7 @@ protected AbstractTaskBase(BaseTaskDependencies baseTaskDependencies) {
this.backupHelper = baseTaskDependencies.getBackupHelper();
this.autoFlagUtil = baseTaskDependencies.getAutoFlagUtil();
this.imageBundleUtil = baseTaskDependencies.getImageBundleUtil();
this.nodeUIApiHelper = baseTaskDependencies.getNodeUIApiHelper();
}

protected ITaskParams taskParams() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import com.yugabyte.yw.common.ConfigHelper;
import com.yugabyte.yw.common.ImageBundleUtil;
import com.yugabyte.yw.common.NodeManager;
import com.yugabyte.yw.common.NodeUIApiHelper;
import com.yugabyte.yw.common.PlatformExecutorFactory;
import com.yugabyte.yw.common.RestoreManagerYb;
import com.yugabyte.yw.common.TableManager;
Expand Down Expand Up @@ -54,4 +55,5 @@ public class BaseTaskDependencies {
private final AutoFlagUtil autoFlagUtil;
private final Commissioner commissioner;
private final ImageBundleUtil imageBundleUtil;
private final NodeUIApiHelper nodeUIApiHelper;
}
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,9 @@ public enum SubTaskGroupType {
UpdateProxyConfig,

// OS Patching.
OSPatching
OSPatching,

ValidatingNode
}

public List<SubTaskDetails> taskDetails;
Expand Down Expand Up @@ -610,6 +612,10 @@ public static SubTaskDetails createSubTask(SubTaskGroupType subTaskGroupType) {
title = "OS Patching";
description = "Performing OS Patching on the universe nodes";
break;
case ValidatingNode:
title = "Validating node status";
description = "Validating node's current state before proceeding";
break;
default:
LOG.warn("UserTaskDetails: Missing SubTaskDetails for : {}", subTaskGroupType);
return null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ public void run() {
nodesToBeRemoved,
params().isForceDelete,
true /* deleteNodeFromDB */,
true /* deleteRootVolumes */)
true /* deleteRootVolumes */,
false /* skipDestroyPrecheck */)
.setSubTaskGroupType(SubTaskGroupType.RemovingUnusedServers);

// Remove the cluster entry from the universe db entry.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ public void run() {
currentNodeDetails,
true /* isForceDelete */,
false /* deleteNode */,
true /* deleteRootVolumes */)
true /* deleteRootVolumes */,
false /* skipDestroyPrecheck */)
.setSubTaskGroupType(SubTaskGroupType.DeletingNode);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,8 @@ DnsManager.DnsCommandType.Delete, params().isForceDelete, universe)
universe.getNodes(),
params().isForceDelete,
true /* delete node */,
true /* deleteRootVolumes */)
true /* deleteRootVolumes */,
true /* skipDestroyPrecheck */)
.setSubTaskGroupType(SubTaskGroupType.RemovingUnusedServers);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -499,7 +499,8 @@ private void editCluster(
nodesToBeRemoved,
false /* isForceDelete */,
true /* deleteNode */,
true /* deleteRootVolumes */)
true /* deleteRootVolumes */,
false /* skipDestroyPrecheck */)
.setSubTaskGroupType(SubTaskGroupType.RemovingUnusedServers);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ public void run() {
nodesToBeRemoved,
params().isForceDelete,
true /* deleteNodeFromDB */,
true /* deleteRootVolumes */)
true /* deleteRootVolumes */,
false /* skipDestroyPrecheck */)
.setSubTaskGroupType(SubTaskGroupType.RemovingUnusedServers);

// Remove the cluster entry from the universe db entry.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,14 @@ public void validateParams(boolean isFirstTry) {
}
}

@Override
protected void createPrecheckTasks(Universe universe) {

NodeDetails currentNode = universe.getNode(taskParams().nodeName);
Collection<NodeDetails> currentNodeDetails = Collections.singleton(currentNode);
createCheckNodeSafeToDeleteTasks(universe, currentNodeDetails);
}

@Override
public void run() {
log.info(
Expand Down Expand Up @@ -105,16 +113,18 @@ public void run() {
createWaitForMasterLeaderTask().setSubTaskGroupType(SubTaskGroupType.ReleasingInstance);

if (instanceExists) {
// Set the node states to Removing.
// Set the node states to Terminating.
createSetNodeStateTasks(currentNodeDetails, NodeDetails.NodeState.Terminating)
.setSubTaskGroupType(SubTaskGroupType.ReleasingInstance);

// Create tasks to terminate that instance. Force delete and ignore errors.
createDestroyServerTasks(
universe,
currentNodeDetails,
true /* isForceDelete */,
false /* deleteNode */,
true /* deleteRootVolumes */)
true /* deleteRootVolumes */,
false /* skipDestroyPrecheck */)
.setSubTaskGroupType(SubTaskGroupType.ReleasingInstance);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,15 @@

package com.yugabyte.yw.commissioner.tasks;

import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.net.HostAndPort;
import com.yugabyte.yw.commissioner.BaseTaskDependencies;
import com.yugabyte.yw.commissioner.ITask.Retryable;
import com.yugabyte.yw.commissioner.UserTaskDetails.SubTaskGroupType;
import com.yugabyte.yw.commissioner.tasks.params.NodeTaskParams;
import com.yugabyte.yw.common.ApiHelper;
import com.yugabyte.yw.common.DnsManager;
import com.yugabyte.yw.common.NodeActionType;
import com.yugabyte.yw.common.NodeUIApiHelper;
import com.yugabyte.yw.common.PlacementInfoUtil;
import com.yugabyte.yw.common.RetryTaskUntilCondition;
import com.yugabyte.yw.common.config.GlobalConfKeys;
import com.yugabyte.yw.common.config.UniverseConfKeys;
import com.yugabyte.yw.common.nodeui.DumpEntitiesResponse;
import com.yugabyte.yw.forms.NodeActionFormData;
import com.yugabyte.yw.forms.UniverseDefinitionTaskParams.Cluster;
import com.yugabyte.yw.forms.UniverseDefinitionTaskParams.UserIntent;
Expand All @@ -43,7 +37,6 @@
import javax.inject.Inject;
import lombok.extern.slf4j.Slf4j;
import org.yb.util.TabletServerInfo;
import play.libs.Json;

// Allows the removal of a node from a universe. Ensures the task waits for the right set of
// server data move primitives. And stops using the underlying instance, though YW still owns it.
Expand All @@ -52,15 +45,11 @@
@Retryable
public class RemoveNodeFromUniverse extends UniverseDefinitionTaskBase {

private final ApiHelper apiHelper;

public static final String DUMP_ENTITIES_URL_SUFFIX = "/dump-entities";

@Inject
protected RemoveNodeFromUniverse(
BaseTaskDependencies baseTaskDependencies, NodeUIApiHelper apiHelper) {
protected RemoveNodeFromUniverse(BaseTaskDependencies baseTaskDependencies) {
super(baseTaskDependencies);
this.apiHelper = apiHelper;
}

@Override
Expand Down Expand Up @@ -300,38 +289,4 @@ public void performPrecheck() {
}
log.debug("Pre-check succeeded");
}

private Set<String> getTserverTablets(Universe universe, NodeDetails currentNode) {
// Wait for a maximum of 10 seconds for url to succeed.
NodeDetails masterLeaderNode = universe.getMasterLeaderNode();
HostAndPort masterLeaderHostPort =
HostAndPort.fromParts(
masterLeaderNode.cloudInfo.private_ip, masterLeaderNode.masterHttpPort);
String masterLeaderUrl =
String.format("http://%s%s", masterLeaderHostPort.toString(), DUMP_ENTITIES_URL_SUFFIX);

RetryTaskUntilCondition<DumpEntitiesResponse> waitForCheck =
new RetryTaskUntilCondition<>(
() -> {
log.debug("Making url request to endpoint: {}", masterLeaderUrl);
JsonNode masterLeaderDumpJson = apiHelper.getRequest(masterLeaderUrl);
DumpEntitiesResponse dumpEntities =
Json.fromJson(masterLeaderDumpJson, DumpEntitiesResponse.class);
return dumpEntities;
},
(d) -> {
if (d.getError() != null) {
log.warn("Url request to {} failed with error {}", masterLeaderUrl, d.getError());
return false;
}
return true;
});

DumpEntitiesResponse dumpEntitiesResponse = waitForCheck.retryWithBackoff(1, 2, 10);

HostAndPort currentNodeHP =
HostAndPort.fromParts(currentNode.cloudInfo.private_ip, currentNode.tserverRpcPort);

return dumpEntitiesResponse.getTabletsByTserverAddress(currentNodeHP);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import com.yugabyte.yw.commissioner.tasks.subtasks.ChangeAdminPassword;
import com.yugabyte.yw.commissioner.tasks.subtasks.ChangeMasterConfig;
import com.yugabyte.yw.commissioner.tasks.subtasks.CheckFollowerLag;
import com.yugabyte.yw.commissioner.tasks.subtasks.CheckNodeSafeToDelete;
import com.yugabyte.yw.commissioner.tasks.subtasks.CreateAlertDefinitions;
import com.yugabyte.yw.commissioner.tasks.subtasks.CreateTable;
import com.yugabyte.yw.commissioner.tasks.subtasks.DeleteBackup;
Expand Down Expand Up @@ -135,6 +136,7 @@
import com.yugabyte.yw.common.NodeManager;
import com.yugabyte.yw.common.PlacementInfoUtil;
import com.yugabyte.yw.common.PlatformServiceException;
import com.yugabyte.yw.common.RetryTaskUntilCondition;
import com.yugabyte.yw.common.ShellResponse;
import com.yugabyte.yw.common.UniverseInProgressException;
import com.yugabyte.yw.common.Util;
Expand All @@ -146,6 +148,7 @@
import com.yugabyte.yw.common.config.UniverseConfKeys;
import com.yugabyte.yw.common.gflags.AutoFlagUtil;
import com.yugabyte.yw.common.gflags.SpecificGFlags;
import com.yugabyte.yw.common.nodeui.DumpEntitiesResponse;
import com.yugabyte.yw.forms.BackupRequestParams;
import com.yugabyte.yw.forms.BackupTableParams;
import com.yugabyte.yw.forms.BulkImportParams;
Expand Down Expand Up @@ -374,6 +377,8 @@ public enum ServerType {
EITHER
}

public static final String DUMP_ENTITIES_URL_SUFFIX = "/dump-entities";

@Inject
protected UniverseTaskBase(BaseTaskDependencies baseTaskDependencies) {
super(baseTaskDependencies);
Expand Down Expand Up @@ -1542,16 +1547,25 @@ public SubTaskGroup createWaitForYbcServerTask(Collection<NodeDetails> nodeDetai
* @param isForceDelete if this is true, ignore ansible errors
* @param deleteNode if true, the node info is deleted from the universe db.
* @param deleteRootVolumes if true, the volumes are deleted.
* @param skipDestroyPrecheck if true, skips the pre-check validation subtask before destroying
* server.
*/
public SubTaskGroup createDestroyServerTasks(
Universe universe,
Collection<NodeDetails> nodes,
boolean isForceDelete,
boolean deleteNode,
boolean deleteRootVolumes) {
boolean deleteRootVolumes,
boolean skipDestroyPrecheck) {
SubTaskGroup subTaskGroup = createSubTaskGroup("AnsibleDestroyServers");
UserIntent userIntent = universe.getUniverseDetails().getPrimaryCluster().userIntent;
nodes = filterUniverseNodes(universe, nodes, n -> true);

// TODO: Update to use node whitelist when the db implements this.
if (!skipDestroyPrecheck) {
createCheckNodeSafeToDeleteTasks(universe, nodes);
}

for (NodeDetails node : nodes) {
// Check if the private ip for the node is set. If not, that means we don't have
// a clean state to delete the node. Log it, free up the onprem node
Expand Down Expand Up @@ -1993,6 +2007,98 @@ public SubTaskGroup createCheckFollowerLagTask(NodeDetails node, ServerType serv
return subTaskGroup;
}

/*
* Create subtask to determine that the node is not part of the universe quorum.
* Checks that no tablets exists on the tserver (if applicable) and node ip is not
* part of the quorum.
*
* @param universe universe for which the node belongs.
* @param node node we want to check.
*/
public void createCheckNodeSafeToDeleteTasks(Universe universe, Collection<NodeDetails> nodes) {
boolean clusterMembershipCheckEnabled =
confGetter.getConfForScope(universe, UniverseConfKeys.clusterMembershipCheckEnabled);
if (clusterMembershipCheckEnabled) {
SubTaskGroup subTaskGroup = createSubTaskGroup("CheckNodeSafeToDelete");
for (NodeDetails node : nodes) {
NodeTaskParams params = new NodeTaskParams();
params.setUniverseUUID(taskParams().getUniverseUUID());
params.nodeName = node.getNodeName();
CheckNodeSafeToDelete task = createTask(CheckNodeSafeToDelete.class);
task.initialize(params);
subTaskGroup.addSubTask(task);
}
getRunnableTask().addSubTaskGroup(subTaskGroup);
subTaskGroup.setSubTaskGroupType(SubTaskGroupType.ValidatingNode);
}
}

/*
* For a given node, finds the tablets assigned to its tserver (if relevant).
*
* @param universe universe for which the node belongs.
* @param currentNode node we want to check.
* @return a set of tablets for the associated tserver.
*/
public Set<String> getTserverTablets(Universe universe, NodeDetails currentNode) {
// Wait for a maximum of 10 seconds for url to succeed.
NodeDetails masterLeaderNode = universe.getMasterLeaderNode();
HostAndPort masterLeaderHostPort =
HostAndPort.fromParts(
masterLeaderNode.cloudInfo.private_ip, masterLeaderNode.masterHttpPort);
String masterLeaderUrl =
String.format("http://%s%s", masterLeaderHostPort.toString(), DUMP_ENTITIES_URL_SUFFIX);

RetryTaskUntilCondition<DumpEntitiesResponse> waitForCheck =
new RetryTaskUntilCondition<>(
() -> {
log.debug("Making url request to endpoint: {}", masterLeaderUrl);
JsonNode masterLeaderDumpJson = nodeUIApiHelper.getRequest(masterLeaderUrl);
DumpEntitiesResponse dumpEntities =
Json.fromJson(masterLeaderDumpJson, DumpEntitiesResponse.class);
return dumpEntities;
},
(d) -> {
if (d.getError() != null) {
log.warn("Url request to {} failed with error {}", masterLeaderUrl, d.getError());
return false;
}
return true;
});

DumpEntitiesResponse dumpEntitiesResponse = waitForCheck.retryWithBackoff(1, 2, 10);

HostAndPort currentNodeHP =
HostAndPort.fromParts(currentNode.cloudInfo.private_ip, currentNode.tserverRpcPort);

return dumpEntitiesResponse.getTabletsByTserverAddress(currentNodeHP);
}

/*
* Checks whether or not the node has a master process in the universe quorum
*
* @param universe universe for which the node belongs
* @param currentNode node we want to check for
* @return whether or not the node has a master process in the universe in the quorum
*/
protected boolean nodeInMasterConfig(Universe universe, NodeDetails node) {
String ip = node.cloudInfo.private_ip;
String masterAddresses = universe.getMasterAddresses();

try (YBClient client =
ybService.getClient(masterAddresses, universe.getCertificateNodetoNode())) {
ListMastersResponse response = client.listMasters();
List<ServerInfo> servers = response.getMasters();
return servers.stream().anyMatch(s -> s.getHost().equals(ip));
} catch (Exception e) {
String msg =
String.format(
"Error when fetching listMasters rpc for node %s - %s",
node.nodeName, e.getMessage());
throw new RuntimeException(msg, e);
}
}

/**
* Create tasks to execute Cluster CTL command against specific process in parallel
*
Expand Down
Loading

0 comments on commit a36d3c5

Please sign in to comment.