Fix bottleneck in InternalResourceGroup::internalGenerateCpuQuota

internalGenerateCpuQuota() is called from InternalResourceGroupManager::refreshAndStartQueries(), which is called every millisecond to start queries. Under a high query submission rate internalGenerateCpuQuota() becomes a bottleneck when Math.multiplyExact() throws an exception (which is possible with the default value of cpuQuotaGenerationMillisPerSecond) while holding the monitor of the root resource group. Plenty of methods in the query starting path grabs the monitor of the root resource group, and since internalGenerateCpuQuota() locks the root for extended amount of time (due to filling in the stack traces when exceptions are thrown) the rate at which the coordinator starts queries drops significantly. This change fixes that by using the guava math methods, which don't throw exceptions on overflow. In a performance test before the fix, given a config that allows 200 concurrent queries the coordinator was only able to run < 50 queries in the cluster. After this fix it was able to saturate the entire cluster.
trinodb · Aug 6, 2018 · 2ac73e1 · 2ac73e1
1 parent 85013d8
commit 2ac73e1
Showing 1 changed file with 7 additions and 18 deletions.
diff --git a/...ain/src/main/java/com/facebook/presto/execution/resourceGroups/InternalResourceGroup.java b/...ain/src/main/java/com/facebook/presto/execution/resourceGroups/InternalResourceGroup.java
@@ -60,6 +60,9 @@
 import static com.google.common.base.Preconditions.checkArgument;
 import static com.google.common.base.Preconditions.checkState;
 import static com.google.common.collect.ImmutableList.toImmutableList;
+import static com.google.common.math.LongMath.saturatedAdd;
+import static com.google.common.math.LongMath.saturatedMultiply;
+import static com.google.common.math.LongMath.saturatedSubtract;
 import static io.airlift.units.DataSize.Unit.BYTE;
 import static java.lang.Math.min;
 import static java.util.Objects.requireNonNull;
@@ -718,12 +721,7 @@ private void queryFinished(QueryExecution query)
             if (query.getState() == QueryState.FINISHED || query.getQueryInfo().getErrorType() == USER_ERROR) {
                 InternalResourceGroup group = this;
                 while (group != null) {
-                    try {
-                        group.cpuUsageMillis = Math.addExact(group.cpuUsageMillis, query.getTotalCpuTime().toMillis());
-                    }
-                    catch (ArithmeticException e) {
-                        group.cpuUsageMillis = Long.MAX_VALUE;
-                    }
+                    group.cpuUsageMillis = saturatedAdd(group.cpuUsageMillis, query.getTotalCpuTime().toMillis());
                     group = group.parent.orElse(null);
                 }
             }
@@ -780,20 +778,11 @@ protected void internalGenerateCpuQuota(long elapsedSeconds)
     {
         checkState(Thread.holdsLock(root), "Must hold lock to generate cpu quota");
         synchronized (root) {
-            long newQuota;
-            try {
-                newQuota = Math.multiplyExact(elapsedSeconds, cpuQuotaGenerationMillisPerSecond);
-            }
-            catch (ArithmeticException e) {
-                newQuota = Long.MAX_VALUE;
-            }
-            try {
-                cpuUsageMillis = Math.subtractExact(cpuUsageMillis, newQuota);
-            }
-            catch (ArithmeticException e) {
+            long newQuota = saturatedMultiply(elapsedSeconds, cpuQuotaGenerationMillisPerSecond);
+            cpuUsageMillis = saturatedSubtract(cpuUsageMillis, newQuota);
+            if (cpuUsageMillis < 0 || cpuUsageMillis == Long.MAX_VALUE) {
                 cpuUsageMillis = 0;
             }
-            cpuUsageMillis = Math.max(0, cpuUsageMillis);
             for (InternalResourceGroup group : subGroups.values()) {
                 group.internalGenerateCpuQuota(elapsedSeconds);
             }