Skip to content
This repository
Browse code

Failure detector fix for host swap : DDS-2536

  • Loading branch information...
commit 78ee7c142605ded927f2a5b97a1450347afccb91 1 parent e72630c
authored March 22, 2012
6  src/java/voldemort/cluster/Node.java
@@ -153,4 +153,10 @@ public int hashCode() {
153 153
     public int compareTo(Node other) { // orders nodes by their integer id
154 154
         return Integer.valueOf(this.id).compareTo(other.getId());
155 155
     }
  156
+
  157
+    public boolean isEqualState(Node other) { // true only when id, host, all three ports and zone id match the other node
  158
+        return id == other.getId() && host.equalsIgnoreCase(other.getHost()) // host is compared ignoring case
  159
+               && httpPort == other.getHttpPort() && socketPort == other.getSocketPort()
  160
+               && adminPort == other.getAdminPort() && zoneId == other.getZoneId();
  161
+    }
156 162
 }
17  src/java/voldemort/cluster/failuredetector/AbstractFailureDetector.java
@@ -45,6 +45,9 @@
45 45
 
46 46
     protected final Map<Node, NodeStatus> nodeStatusMap;
47 47
 
  48
+    // Also maintain a map of nodes keyed by ID (in order to handle host swaps)
  49
+    protected final Map<Integer, Node> nodeMap;
  50
+
48 51
     protected final Logger logger = Logger.getLogger(getClass().getName());
49 52
 
50 53
     protected AbstractFailureDetector(FailureDetectorConfig failureDetectorConfig) {
@@ -54,10 +57,12 @@ protected AbstractFailureDetector(FailureDetectorConfig failureDetectorConfig) {
54 57
         this.failureDetectorConfig = failureDetectorConfig;
55 58
         listeners = new ConcurrentHashMap<FailureDetectorListener, Object>();
56 59
         nodeStatusMap = new ConcurrentHashMap<Node, NodeStatus>();
  60
+        nodeMap = new ConcurrentHashMap<Integer, Node>();
57 61
 
58 62
         for(Node node: failureDetectorConfig.getNodes()) {
59 63
             nodeStatusMap.put(node, createNodeStatus(failureDetectorConfig.getTime()
60 64
                                                                           .getMilliseconds()));
  65
+            nodeMap.put(node.getId(), node);
61 66
         }
62 67
     }
63 68
 
@@ -212,14 +217,24 @@ protected void setUnavailable(Node node, UnreachableStoreException e) {
212 217
 
213 218
     protected NodeStatus getNodeStatus(Node node) {
214 219
         NodeStatus nodeStatus = nodeStatusMap.get(node);
  220
+        Node currentTrackedNode = nodeMap.get(node.getId());
215 221
 
216  
-        if(nodeStatus == null) {
  222
+        if(nodeStatus == null || !currentTrackedNode.isEqualState(node)) {
217 223
             if(logger.isEnabledFor(Level.WARN))
218 224
                 logger.warn("creating new node status for node " + node.getId()
219 225
                             + " for failure detector");
220 226
 
  227
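+            // If the host is being replaced, remove old tracking information. NOTE(review): nodeMap is keyed by node ID (Integer), so the nodeMap.remove(currentTrackedNode) call below looks like a no-op; it likely should be nodeMap.remove(currentTrackedNode.getId()) — confirm.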
  228
+            if(nodeStatus != null) {
  229
+                nodeStatusMap.remove(currentTrackedNode);
  230
+                nodeMap.remove(currentTrackedNode);
  231
+                failureDetectorConfig.removeNode(currentTrackedNode);
  232
+            }
  233
+
221 234
             nodeStatus = createNodeStatus(failureDetectorConfig.getTime().getMilliseconds());
222 235
             nodeStatusMap.put(node, nodeStatus);
  236
+            nodeMap.put(node.getId(), node);
  237
+
223 238
             if(!failureDetectorConfig.getNodes().contains(node)) {
224 239
                 failureDetectorConfig.addNode(node);
225 240
             }
7  src/java/voldemort/cluster/failuredetector/FailureDetectorConfig.java
@@ -23,7 +23,6 @@
23 23
 import java.util.HashSet;
24 24
 import java.util.List;
25 25
 
26  
-import com.google.common.collect.ImmutableSet;
27 26
 import voldemort.client.ClientConfig;
28 27
 import voldemort.cluster.Node;
29 28
 import voldemort.server.VoldemortConfig;
@@ -32,6 +31,7 @@
32 31
 import voldemort.utils.Utils;
33 32
 
34 33
 import com.google.common.collect.ImmutableList;
  34
+import com.google.common.collect.ImmutableSet;
35 35
 
36 36
 /**
37 37
  * FailureDetectorConfig simply holds all the data that was available to it upon
@@ -567,6 +567,11 @@ public synchronized void addNode(Node node) {
567 567
         nodes.add(node);
568 568
     }
569 569
 
  570
+    public synchronized void removeNode(Node node) { // removes the node from this config's node collection; counterpart of addNode
  571
+        Utils.notNull(node); // presumably rejects a null argument — confirm against Utils.notNull
  572
+        nodes.remove(node);
  573
+    }
  574
+
570 575
     public StoreVerifier getStoreVerifier() { // accessor for the storeVerifier field
571 576
         return storeVerifier;
572 577
     }

0 notes on commit 78ee7c1

Please sign in to comment.
Something went wrong with that request. Please try again.