Skip to content

Commit

Permalink
LIHADOOP-43781: TonY should support enforcing maximum total number of…
Browse files Browse the repository at this point in the history
… tasks (#174)

* LIHADOOP-43781: TonY should support enforcing maximum total number of tasks

* Added test for zero instances

* Fix typo: tony.max-tasks -> tony.task.max-tasks and fixed TestTonyConfigurationFields

* Added support for max instances per task type

* Added comments for TonyConfigurationKeys.getMaxInstancesKey
  • Loading branch information
erwa committed Feb 5, 2019
1 parent 69bdf38 commit a60967f
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 0 deletions.
31 changes: 31 additions & 0 deletions tony-core/src/main/java/com/linkedin/tony/TonyClient.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import com.google.common.collect.ImmutableSet;
import com.linkedin.tony.rpc.TaskUrl;
import com.linkedin.tony.rpc.impl.ApplicationRpcClient;
import com.linkedin.tony.tensorflow.TensorFlowContainerRequest;
import com.linkedin.tony.util.HdfsUtils;
import com.linkedin.tony.util.Utils;
import com.linkedin.tony.util.VersionInfo;
Expand Down Expand Up @@ -274,6 +275,7 @@ public boolean init(String[] args) throws ParseException, IOException {
}

initTonyConf(tonyConf, cliParser);
validateTonyConf(tonyConf);

String amMemoryString = tonyConf.get(TonyConfigurationKeys.AM_MEMORY,
TonyConfigurationKeys.DEFAULT_AM_MEMORY);
Expand Down Expand Up @@ -384,6 +386,35 @@ public void initTonyConf(Configuration tonyConf, CommandLine cliParser) throws I
tonyConf.addResource(new Path(tonyConfDir + File.separatorChar + Constants.TONY_SITE_CONF));
}

/**
 * Checks the task instance counts requested in {@code tonyConf} against the configured limits.
 * Each task type is compared with its own max-instances setting, and the sum over all task types
 * is compared with {@link TonyConfigurationKeys#TONY_MAX_TOTAL_INSTANCES}. A limit that is unset
 * or negative is not enforced.
 * @param tonyConf the configuration to validate
 * @throws RuntimeException if a per-task-type limit or the total limit is exceeded
 */
@VisibleForTesting
static void validateTonyConf(Configuration tonyConf) {
  Map<String, TensorFlowContainerRequest> requestsByTaskType = Utils.parseContainerRequests(tonyConf);

  // Per-task-type limits: a missing key reads as -1, which disables the check for that type.
  for (Map.Entry<String, TensorFlowContainerRequest> request : requestsByTaskType.entrySet()) {
    String taskType = request.getKey();
    int requested = request.getValue().getNumInstances();
    int limit = tonyConf.getInt(TonyConfigurationKeys.getMaxInstancesKey(taskType), -1);
    boolean limitEnforced = limit >= 0;
    if (limitEnforced && requested > limit) {
      throw new RuntimeException("Job requested " + requested + " " + taskType + " task instances "
          + "but the limit is " + limit + " " + taskType + " task instances.");
    }
  }

  // Overall limit across every task type.
  int totalLimit = tonyConf.getInt(TonyConfigurationKeys.TONY_MAX_TOTAL_INSTANCES, -1);
  int totalRequested = 0;
  for (TensorFlowContainerRequest request : requestsByTaskType.values()) {
    totalRequested += request.getNumInstances();
  }
  if (totalLimit < 0 || totalRequested <= totalLimit) {
    return;
  }
  throw new RuntimeException("Job requested " + totalRequested + " total task instances but limit is "
      + totalLimit + ".");
}

/**
 * Returns the configuration held by this client.
 * @return the {@link Configuration} stored in the {@code tonyConf} field
 */
public Configuration getTonyConf() {
  return tonyConf;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@ private TonyConfigurationKeys() {
// Task configurations
public static final String TONY_TASK_PREFIX = TONY_PREFIX + "task.";

/**
 * Max total number of task instances that can be requested across all task types.
 * The limit is only enforced when the configured value is zero or greater; a negative
 * value (or leaving the key unset) means no total limit is applied.
 */
public static final String TONY_MAX_TOTAL_INSTANCES = TONY_TASK_PREFIX + "max-total-instances";

public static final String TASK_EXECUTOR_JVM_OPTS = TONY_TASK_PREFIX + "executor.jvm.opts";
public static final String DEFAULT_TASK_EXECUTOR_JVM_OPTS = "-Xmx1536m";

Expand Down Expand Up @@ -142,6 +147,15 @@ public static String getInstancesKey(String jobName) {
return String.format(TONY_PREFIX + "%s.instances", jobName);
}

/**
 * Builds the configuration key that caps how many {@code jobName} task instances a job may request.
 * The key is {@code TONY_PREFIX} followed by {@code jobName} and the suffix {@code .max-instances}.
 * @param jobName the task type for which to get the max instances config key
 * @return the max instances configuration key for the {@code jobName}
 */
public static String getMaxInstancesKey(String jobName) {
  // Pass the prefix as an argument rather than splicing it into the format string.
  return String.format("%s%s.max-instances", TONY_PREFIX, jobName);
}

public static int getDefaultInstances(String jobName) {
switch (jobName) {
case Constants.PS_JOB_NAME:
Expand All @@ -151,6 +165,7 @@ public static int getDefaultInstances(String jobName) {
return 0;
}
}

/**
 * Builds the memory configuration key for the given task type:
 * {@code TONY_PREFIX} followed by {@code jobName} and the suffix {@code .memory}.
 * @param jobName the task type for which to get the memory config key
 * @return the memory configuration key for the {@code jobName}
 */
public static String getMemoryKey(String jobName) {
  return String.format("%s%s.memory", TONY_PREFIX, jobName);
}
Expand Down
6 changes: 6 additions & 0 deletions tony-core/src/main/resources/tony-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,12 @@
</property>

<!-- Task configurations -->
<property>
<name>tony.task.max-total-instances</name>
<value>-1</value>
<description>Maximum total number of task instances that can be requested across all tony.X.instances configs. A negative value (the default) means no limit is enforced.</description>
</property>

<property>
<description>JVM opts for each TaskExecutor.</description>
<name>tony.task.executor.jvm.opts</name>
Expand Down
33 changes: 33 additions & 0 deletions tony-core/src/test/java/com/linkedin/tony/TestTonyClient.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
package com.linkedin.tony;

import java.util.HashMap;
import org.apache.hadoop.conf.Configuration;
import org.testng.annotations.Test;

import static org.testng.Assert.assertEquals;
Expand All @@ -28,4 +29,36 @@ public void testBuildCommand() {
+ " 1><LOG_DIR>/amstdout.log 2><LOG_DIR>/amstderr.log";
assertEquals(command, expected);
}

/**
 * Requests instances for two task types without setting any limit in this test;
 * validation is expected to complete without throwing.
 */
@Test
public void testValidateTonyConfValidConf() {
  final Configuration tonyConf = new Configuration();
  tonyConf.setInt("tony.bar.instances", 2);
  tonyConf.setInt("tony.foo.instances", 2);
  TonyClient.validateTonyConf(tonyConf);
}

/**
 * Sets the total instance limit to zero and configures no instance counts in this test;
 * validation is expected to complete without throwing.
 */
@Test
public void testValidateTonyConfZeroInstances() {
  final Configuration tonyConf = new Configuration();
  final int totalLimit = 0;
  tonyConf.setInt(TonyConfigurationKeys.TONY_MAX_TOTAL_INSTANCES, totalLimit);
  TonyClient.validateTonyConf(tonyConf);
}

/**
 * Requests 2 + 2 instances while the total limit is 3; validation must throw.
 */
@Test(expectedExceptions = RuntimeException.class)
public void testValidateTonyConfTooManyTotalInstances() {
  final Configuration tonyConf = new Configuration();
  tonyConf.setInt("tony.foo.instances", 2);
  tonyConf.setInt("tony.bar.instances", 2);
  // Total limit is one below the four instances requested above.
  tonyConf.setInt(TonyConfigurationKeys.TONY_MAX_TOTAL_INSTANCES, 3);
  TonyClient.validateTonyConf(tonyConf);
}

/**
 * Requests 2 {@code foo} instances while the {@code foo} limit is 1; validation must throw.
 */
@Test(expectedExceptions = RuntimeException.class)
public void testValidateTonyConfTooManyFooInstances() {
  final Configuration tonyConf = new Configuration();
  tonyConf.setInt("tony.foo.instances", 2);
  // Per-type limit is below the requested count for "foo".
  tonyConf.setInt(TonyConfigurationKeys.getMaxInstancesKey("foo"), 1);
  TonyClient.validateTonyConf(tonyConf);
}
}

0 comments on commit a60967f

Please sign in to comment.