Permalink
Browse files

Failure detector fix for host swap : DDS-2536

  • Loading branch information...
1 parent e72630c commit 78ee7c142605ded927f2a5b97a1450347afccb91 Chinmay Soman committed Mar 23, 2012
@@ -153,4 +153,10 @@ public int hashCode() {
/**
 * Compares this node with another node by their ids, using the natural
 * ordering of {@link Integer}.
 *
 * @param other the node to compare against
 * @return a negative value, zero, or a positive value when this node's id
 *         is less than, equal to, or greater than the other node's id
 */
public int compareTo(Node other) {
    Integer ownId = Integer.valueOf(this.id);
    return ownId.compareTo(other.getId());
}
+
+ public boolean isEqualState(Node other) {
+ return id == other.getId() && host.equalsIgnoreCase(other.getHost())
+ && httpPort == other.getHttpPort() && socketPort == other.getSocketPort()
+ && adminPort == other.getAdminPort() && zoneId == other.getZoneId();
+ }
}
@@ -45,6 +45,9 @@
protected final Map<Node, NodeStatus> nodeStatusMap;
+ // Also maintain a map of nodes keyed by node ID (in order to handle host swaps)
+ protected final Map<Integer, Node> nodeMap;
+
protected final Logger logger = Logger.getLogger(getClass().getName());
protected AbstractFailureDetector(FailureDetectorConfig failureDetectorConfig) {
@@ -54,10 +57,12 @@ protected AbstractFailureDetector(FailureDetectorConfig failureDetectorConfig) {
this.failureDetectorConfig = failureDetectorConfig;
listeners = new ConcurrentHashMap<FailureDetectorListener, Object>();
nodeStatusMap = new ConcurrentHashMap<Node, NodeStatus>();
+ nodeMap = new ConcurrentHashMap<Integer, Node>();
for(Node node: failureDetectorConfig.getNodes()) {
nodeStatusMap.put(node, createNodeStatus(failureDetectorConfig.getTime()
.getMilliseconds()));
+ nodeMap.put(node.getId(), node);
}
}
@@ -212,14 +217,24 @@ protected void setUnavailable(Node node, UnreachableStoreException e) {
protected NodeStatus getNodeStatus(Node node) {
NodeStatus nodeStatus = nodeStatusMap.get(node);
+ Node currentTrackedNode = nodeMap.get(node.getId());
- if(nodeStatus == null) {
+ if(nodeStatus == null || !currentTrackedNode.isEqualState(node)) {
if(logger.isEnabledFor(Level.WARN))
logger.warn("creating new node status for node " + node.getId()
+ " for failure detector");
+ // If the host is being replaced, remove old tracking information
+ if(nodeStatus != null) {
+ nodeStatusMap.remove(currentTrackedNode);
+ nodeMap.remove(currentTrackedNode);
+ failureDetectorConfig.removeNode(currentTrackedNode);
+ }
+
nodeStatus = createNodeStatus(failureDetectorConfig.getTime().getMilliseconds());
nodeStatusMap.put(node, nodeStatus);
+ nodeMap.put(node.getId(), node);
+
if(!failureDetectorConfig.getNodes().contains(node)) {
failureDetectorConfig.addNode(node);
}
@@ -23,7 +23,6 @@
import java.util.HashSet;
import java.util.List;
-import com.google.common.collect.ImmutableSet;
import voldemort.client.ClientConfig;
import voldemort.cluster.Node;
import voldemort.server.VoldemortConfig;
@@ -32,6 +31,7 @@
import voldemort.utils.Utils;
import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSet;
/**
* FailureDetectorConfig simply holds all the data that was available to it upon
@@ -567,6 +567,11 @@ public synchronized void addNode(Node node) {
nodes.add(node);
}
+ /**
+  * Removes the given node from the nodes held by this configuration.
+  * The argument is first passed through {@code Utils.notNull}
+  * (presumably rejecting null; confirm against Utils).
+  *
+  * @param node the node to remove
+  */
+ public synchronized void removeNode(Node node) {
+     Utils.notNull(node);
+     this.nodes.remove(node);
+ }
+
/**
 * Returns the store verifier held by this configuration.
 *
 * @return the current {@code StoreVerifier}
 */
public StoreVerifier getStoreVerifier() {
    return this.storeVerifier;
}

0 comments on commit 78ee7c1

Please sign in to comment.