Permalink
Browse files

Rebalancing tested for all failure modes locally.

  • Loading branch information...
bbansal committed Dec 10, 2009
1 parent 6c1f77f commit db4c0faeac2318d65c9a5fe2863f17778faf05e1
View
@@ -39,4 +39,4 @@ if [ -z $VOLD_OPTS ]; then
fi
export CLASSPATH
-java $VOLD_OPTS -cp $CLASSPATH $@
+java -Dlog4j.configuration=src/java/log4j.properties $VOLD_OPTS -cp $CLASSPATH $@
@@ -16,9 +16,9 @@
# limitations under the License.
#
-if [ $# -lt 2 ];
+if [ $# -lt 3 ];
then
- echo 'USAGE: bin/voldemort-shell.sh currentCluster.xml targetCluster.xml stores.xml'
+ echo 'USAGE: bin/voldemort-shell.sh currentCluster.xml targetCluster.xml stores.xml numParallelRebalancing'
exit 1
fi
View
@@ -45,4 +45,4 @@ if [ -z $VOLD_OPTS ]; then
VOLD_OPTS="-Xmx2G -server -Dcom.sun.management.jmxremote"
fi
-java $VOLD_OPTS -cp $CLASSPATH voldemort.server.VoldemortServer $@
+java -Dlog4j.configuration=src/java/log4j.properties $VOLD_OPTS -cp $CLASSPATH voldemort.server.VoldemortServer $@
@@ -13,6 +13,7 @@ log4j.logger.httpclient.wire=INFO
log4j.logger.org.mortbay.log=WARN
log4j.logger.voldemort.store.routed=INFO
log4j.logger.voldemort.server.niosocket=INFO
-log4j.logger.voldemort.utils=DEBUG
-log4j.logger.voldemort.client.rebalance=DEBUG
-log4j.logger.voldemort.server.protocol.admin=DEBUG
+log4j.logger.voldemort.utils=INFO
+log4j.logger.voldemort.client.rebalance=INFO
+log4j.logger.voldemort.server.protocol.admin=INFO
+log4j.logger.voldemort.server=INFO
@@ -91,8 +91,8 @@ public void run() {
while(rebalanceSubTaskList.size() > 0) {
RebalanceStealInfo rebalanceSubTask = rebalanceSubTaskList.remove(0);
- logger.debug("Starting rebalancing for stealerNode:" + stealerNode
- + " rebalanceInfo:" + rebalanceSubTask);
+ logger.info("Starting rebalancing for stealerNode:" + stealerNode
+ + " rebalanceInfo:" + rebalanceSubTask);
try {
@@ -101,8 +101,8 @@ public void run() {
// attempt to rebalance for all stores.
attemptRebalanceSubTask(rebalanceSubTask);
- logger.debug("Successfully finished RebalanceSubTask attempt:"
- + rebalanceSubTask);
+ logger.info("Successfully finished RebalanceSubTask attempt:"
+ + rebalanceSubTask);
} catch(Exception e) {
logger.warn("rebalancing task (" + rebalanceSubTask
+ ") failed with exception:", e);
@@ -275,7 +275,7 @@ public VoldemortConfig(Props props) {
// rebalancing parameters
this.maxRebalancingAttempt = props.getInt("max.rebalancing.attempts", 3);
this.rebalancingTimeoutInSeconds = props.getInt("rebalancing.timeout.seconds", 60 * 60);
- this.rebalancingServicePeriod = props.getInt("rebalancing.service.period.ms", 5 * 60 * 1000);
+ this.rebalancingServicePeriod = props.getInt("rebalancing.service.period.ms", 1000);
// network class loader disabled by default.
this.enableNetworkClassLoader = props.getBoolean("enable.network.classloader", false);
@@ -264,7 +264,13 @@ public void handleUpdatePartitionEntries(VAdminProto.UpdatePartitionEntriesReque
Versioned<byte[]> value = ProtoUtils.decodeVersioned(partitionEntry.getVersioned());
if(filter.accept(key, value)) {
- storageEngine.put(key, value);
+ try {
+ storageEngine.put(key, value);
+
+ } catch(ObsoleteVersionException e) {
+ // log and ignore
+ logger.debug("updateEntries (Streaming put) threw ObsoleteVersionException .. Ignoring.");
+ }
if(throttler != null) {
throttler.maybeThrottle(entrySize(Pair.create(key, value)));
@@ -298,12 +304,6 @@ public void handleUpdatePartitionEntries(VAdminProto.UpdatePartitionEntriesReque
throw new VoldemortException("Rebalance service is not enabled for node:"
+ metadataStore.getNodeId());
- if(!rebalancer.acquireRebalancingPermit()) {
- throw new VoldemortException("Node:"
- + metadataStore.getNodeId()
- + " is already rebalancing cannot start new rebalancing request.");
- }
-
RebalanceStealInfo rebalanceStealInfo = new RebalanceStealInfo(request.getStealerId(),
request.getDonorId(),
request.getPartitionsList(),
@@ -313,6 +313,12 @@ public void handleUpdatePartitionEntries(VAdminProto.UpdatePartitionEntriesReque
int requestId = rebalancer.rebalanceLocalNode(request.getCurrentStore(),
rebalanceStealInfo);
+ if(-1 == requestId) {
+ throw new VoldemortException("Node:"
+ + metadataStore.getNodeId()
+ + " is already rebalancing cannot start new rebalancing request.");
+ }
+
response.setRequestId(requestId)
.setDescription(rebalanceStealInfo.toString())
.setStatus("started")
@@ -363,7 +369,7 @@ public void operate() {
entry.getSecond());
} catch(ObsoleteVersionException e) {
// log and ignore
- logger.warn("FetchAndUpdate threw ObsoleteVersionException .. Ignoring.");
+ logger.debug("FetchAndUpdate threw ObsoleteVersionException .. Ignoring.");
}
throttler.maybeThrottle(entrySize(entry));
@@ -26,7 +26,6 @@
private final AtomicBoolean rebalancePermit = new AtomicBoolean(false);
private final MetadataStore metadataStore;
- private final AdminClient adminClient;
private final AsyncOperationRunner asyncRunner;
private final VoldemortConfig config;
@@ -36,53 +35,52 @@ public Rebalancer(MetadataStore metadataStore,
this.metadataStore = metadataStore;
this.asyncRunner = asyncRunner;
this.config = config;
- this.adminClient = RebalanceUtils.createTempAdminClient(config, metadataStore.getCluster());
}
public void start() {
// add startup time stuff here.
}
- /**
- * After the current operation finishes, no longer gossip.
- */
- public void stop() {
- try {
- adminClient.stop();
- } catch(Exception e) {
- logger.error("Error while closing adminClient.", e);
- }
- }
+ public void stop() {}
- public boolean acquireRebalancingPermit() {
+ private boolean acquireRebalancingPermit() {
if(rebalancePermit.compareAndSet(false, true))
return true;
return false;
}
- public void releaseRebalancingPermit() {
+ private void releaseRebalancingPermit() {
if(!rebalancePermit.compareAndSet(true, false)) {
throw new VoldemortException("Invalid state rebalancePermit must be true here.");
}
}
public void run() {
+ logger.debug("rebalancer run() called.");
if(VoldemortState.REBALANCING_MASTER_SERVER.equals(metadataStore.getServerState())
&& acquireRebalancingPermit()) {
+
+ // free permit here for rebalanceLocalNode to acquire.
+ releaseRebalancingPermit();
+
RebalanceStealInfo stealInfo = metadataStore.getRebalancingStealInfo();
- logger.warn("Rebalance server found incomplete rebalancing attempt restarting "
- + stealInfo);
-
- if(stealInfo.getAttempt() < config.getMaxRebalancingAttempt()) {
- attemptRebalance(stealInfo);
- } else {
- logger.warn("Rebalancing for rebalancing task:" + stealInfo
- + " failed multiple times, Aborting more trials...");
- }
- // clean all rebalancing state
- metadataStore.cleanAllRebalancingState();
+ try {
+ logger.warn("Rebalance server found incomplete rebalancing attempt " + stealInfo
+ + " restarting ...");
+
+ if(stealInfo.getAttempt() < config.getMaxRebalancingAttempt()) {
+ attemptRebalance(stealInfo);
+ } else {
+ logger.warn("Rebalancing for rebalancing task:" + stealInfo
+ + " failed multiple times, Aborting more trials...");
+ metadataStore.cleanAllRebalancingState();
+ }
+ } catch(Exception e) {
+ logger.error("RebalanceService rebalancing attempt " + stealInfo
+ + " failed with exception", e);
+ }
}
}
@@ -91,8 +89,14 @@ private void attemptRebalance(RebalanceStealInfo stealInfo) {
List<String> unbalanceStoreList = ImmutableList.copyOf(stealInfo.getUnbalancedStoreList());
for(String storeName: unbalanceStoreList) {
+ AdminClient adminClient = RebalanceUtils.createTempAdminClient(config,
+ metadataStore.getCluster());
try {
int rebalanceAsyncId = rebalanceLocalNode(storeName, stealInfo);
+ if(-1 == rebalanceAsyncId) {
+ logger.warn("rebalancer is already running, aborting this rebalanceService run() ..");
+ return;
+ }
adminClient.waitForCompletion(stealInfo.getStealerId(),
rebalanceAsyncId,
@@ -102,6 +106,8 @@ private void attemptRebalance(RebalanceStealInfo stealInfo) {
stealInfo.getUnbalancedStoreList().remove(storeName);
} catch(Exception e) {
logger.warn("rebalanceSubTask:" + stealInfo + " failed for store:" + storeName, e);
+ } finally {
+ adminClient.stop();
}
}
}
@@ -121,6 +127,10 @@ private void attemptRebalance(RebalanceStealInfo stealInfo) {
* @return taskId for asynchronous task.
*/
public int rebalanceLocalNode(final String storeName, final RebalanceStealInfo stealInfo) {
+
+ if(!acquireRebalancingPermit())
+ return -1;
+
int requestId = asyncRunner.getUniqueRequestId();
asyncRunner.submitOperation(requestId, new AsyncOperation(requestId, stealInfo.toString()) {
@@ -129,7 +139,10 @@ public int rebalanceLocalNode(final String storeName, final RebalanceStealInfo s
@Override
public void operate() throws Exception {
+ AdminClient adminClient = RebalanceUtils.createTempAdminClient(config,
+ metadataStore.getCluster());
try {
+ logger.info("Rebalancer: rebalance " + stealInfo + " starting.");
checkCurrentState(metadataStore, stealInfo);
setRebalancingState(metadataStore, stealInfo);
@@ -139,32 +152,39 @@ public void operate() throws Exception {
storeName,
stealInfo.getPartitionList(),
null);
- logger.debug("rebalance internal async Id:" + fetchAndUpdateAsyncId);
adminClient.waitForCompletion(metadataStore.getNodeId(),
fetchAndUpdateAsyncId,
24 * 60 * 60,
TimeUnit.SECONDS);
- logger.info("rebalance " + stealInfo + " completed successfully.");
+
+ logger.info("Rebalancer: rebalance " + stealInfo + " completed successfully.");
// clean state only if successful.
metadataStore.cleanAllRebalancingState();
} finally {
-
+ // free the permit in all cases.
+ releaseRebalancingPermit();
+ adminClient.stop();
+ fetchAndUpdateAsyncId = -1;
}
}
@Override
@JmxGetter(name = "asyncTaskStatus")
public AsyncOperationStatus getStatus() {
- if(-1 != fetchAndUpdateAsyncId && !asyncRunner.isComplete(fetchAndUpdateAsyncId))
- updateStatus(asyncRunner.getStatus(fetchAndUpdateAsyncId));
+ if(-1 != fetchAndUpdateAsyncId)
+ try {
+ updateStatus(asyncRunner.getStatus(fetchAndUpdateAsyncId));
+ } catch(Exception e) {
+ // ignore : handle race condition between asyncRunner
+ // removing value and fetchAndUpdate setting to -1
+ }
return super.getStatus();
}
});
- logger.debug("rebalance node request_id:" + requestId);
return requestId;
}

0 comments on commit db4c0fa

Please sign in to comment.