diff --git a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortSwapJob.java b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortSwapJob.java index bc29bcfe52..7344274a8f 100644 --- a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortSwapJob.java +++ b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortSwapJob.java @@ -119,10 +119,11 @@ public void run() throws Exception { dataDir, e); } + // While processing an admin request, HdfsFailedFetchLock can take a long time because of multiple HDFS operations, + // especially when the name node is in a different data center, so extend the admin socket timeout to 5 minutes. AdminClientConfig adminConfig = new AdminClientConfig().setMaxConnectionsPerNode(cluster.getNumberOfNodes()) - .setAdminConnectionTimeoutSec(15) .setMaxBackoffDelayMs(maxBackoffDelayMs) - .setAdminSocketTimeoutSec(60); + .setAdminSocketTimeoutSec(60 * 5); ClientConfig clientConfig = new ClientConfig().setBootstrapUrls(cluster.getBootStrapUrls()) .setConnectionTimeout(httpTimeoutMs, diff --git a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/swapper/HdfsFailedFetchLock.java b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/swapper/HdfsFailedFetchLock.java index 3ec3c584dd..b6ca432933 100644 --- a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/swapper/HdfsFailedFetchLock.java +++ b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/swapper/HdfsFailedFetchLock.java @@ -199,6 +199,7 @@ private void handleIOException(IOException e, String action, int attempt) @Override public synchronized void acquireLock() throws Exception { + logger.info("Try to acquire HDFS distributed lock."); if (lockAcquired) { logger.info("HdfsFailedFetchLock.acquireLock() called while it is already acquired!"); return; @@ -236,6 +237,7 @@ public synchronized void acquireLock() throws Exception { if (!this.lockAcquired) 
{ throw new VoldemortException(exceptionMessage(ACQUIRE_LOCK)); } + logger.info("HDFS distributed lock acquired."); } @Override diff --git a/src/java/voldemort/server/protocol/admin/AdminServiceRequestHandler.java b/src/java/voldemort/server/protocol/admin/AdminServiceRequestHandler.java index e111e46393..5c205051fa 100644 --- a/src/java/voldemort/server/protocol/admin/AdminServiceRequestHandler.java +++ b/src/java/voldemort/server/protocol/admin/AdminServiceRequestHandler.java @@ -2083,6 +2083,7 @@ private Message handleFetchFailure(VAdminProto.HandleFetchFailureRequest handleF for (Integer nodeId: nodesFailedInThisFetch) { logger.warn("Will disable store '" + storeName + "' on node " + nodeId); distributedLock.addDisabledNode(nodeId, storeName, pushVersion); + logger.warn("Store '" + storeName + "' is disabled on node " + nodeId); if (firstNode) { firstNode = false; } else {