Skip to content

Commit

Permalink
Improve JMX stats and add HTTP endpoint for runaway splits diagnostics
Browse files Browse the repository at this point in the history
  • Loading branch information
Varun Gajjala committed Sep 7, 2018
1 parent c4244f7 commit 5dfd1ed
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 31 deletions.
Expand Up @@ -68,7 +68,6 @@
import static java.util.concurrent.Executors.newCachedThreadPool;
import static java.util.concurrent.Executors.newSingleThreadScheduledExecutor;
import static java.util.concurrent.TimeUnit.MICROSECONDS;
import static java.util.concurrent.TimeUnit.MINUTES;
import static java.util.concurrent.TimeUnit.NANOSECONDS;

@ThreadSafe
Expand All @@ -77,7 +76,7 @@ public class TaskExecutor
private static final Logger log = Logger.get(TaskExecutor.class);

// print out split call stack if it has been running for a certain amount of time
private static final Duration LONG_SPLIT_WARNING_THRESHOLD = new Duration(1000, TimeUnit.SECONDS);
private static final Duration LONG_SPLIT_WARNING_THRESHOLD = new Duration(600, TimeUnit.SECONDS);

private static final AtomicLong NEXT_RUNNER_ID = new AtomicLong();

Expand Down Expand Up @@ -197,7 +196,6 @@ public synchronized void start()
for (int i = 0; i < runnerThreads; i++) {
addRunnerThread();
}
splitMonitorExecutor.scheduleWithFixedDelay(this::monitorActiveSplits, 1, 1, MINUTES);
}

@PreDestroy
Expand Down Expand Up @@ -427,25 +425,6 @@ private synchronized PrioritizedSplitRunner pollNextSplitWorker()
return null;
}

private void monitorActiveSplits()
{
for (RunningSplitInfo splitInfo : runningSplitInfos) {
Duration duration = Duration.succinctNanos(ticker.read() - splitInfo.getStartTime());
if (duration.compareTo(LONG_SPLIT_WARNING_THRESHOLD) < 0) {
return;
}
if (splitInfo.isPrinted()) {
continue;
}
splitInfo.setPrinted();

String currentMaxActiveSplit = splitInfo.getThreadId();
Exception exception = new Exception("Long running split");
exception.setStackTrace(splitInfo.getThread().getStackTrace());
log.warn(exception, "Split thread %s has been running longer than %s", currentMaxActiveSplit, duration);
}
}

private class TaskRunner
implements Runnable
{
Expand Down Expand Up @@ -775,14 +754,52 @@ private synchronized int getRunningTasksForLevel(int level)
return count;
}

public String getMaxActiveSplitsInfo()
{
// Sample output:
//
// 2 splits have been continuously active for more than 600.00ms seconds
//
// "20180907_054754_00000_88xi4.1.0-2" tid=99
// at java.util.Formatter$FormatSpecifier.<init>(Formatter.java:2708)
// at java.util.Formatter.parse(Formatter.java:2560)
// at java.util.Formatter.format(Formatter.java:2501)
// at ... (more lines of stacktrace)
//
// "20180907_054754_00000_88xi4.1.0-3" tid=106
// at java.util.Formatter$FormatSpecifier.<init>(Formatter.java:2709)
// at java.util.Formatter.parse(Formatter.java:2560)
// at java.util.Formatter.format(Formatter.java:2501)
// at ... (more line of stacktrace)
StringBuilder stackTrace = new StringBuilder();
int maxActiveSplitCount = 0;
String message = "%s splits have been continuously active for more than %s seconds\n";
for (RunningSplitInfo splitInfo : runningSplitInfos) {
Duration duration = Duration.succinctNanos(ticker.read() - splitInfo.getStartTime());
if (duration.compareTo(LONG_SPLIT_WARNING_THRESHOLD) >= 0) {
maxActiveSplitCount++;
stackTrace.append("\n");
stackTrace.append(String.format("\"%s\" tid=%s", splitInfo.getThreadId(), splitInfo.getThread().getId())).append("\n");
for (StackTraceElement traceElement : splitInfo.getThread().getStackTrace()) {
stackTrace.append("\tat ").append(traceElement).append("\n");
}
}
}

return String.format(message, maxActiveSplitCount, LONG_SPLIT_WARNING_THRESHOLD).concat(stackTrace.toString());
}

@Managed
public long getMaxActiveSplitTime()
public long getRunAwaySplitCount()
{
Iterator<RunningSplitInfo> iterator = runningSplitInfos.iterator();
if (iterator.hasNext()) {
return NANOSECONDS.toMillis(ticker.read() - iterator.next().getStartTime());
int count = 0;
for (RunningSplitInfo splitInfo : runningSplitInfos) {
Duration duration = Duration.succinctNanos(ticker.read() - splitInfo.getStartTime());
if (duration.compareTo(LONG_SPLIT_WARNING_THRESHOLD) > 0) {
count++;
}
}
return 0;
return count;
}

private static class RunningSplitInfo
Expand Down
Expand Up @@ -281,6 +281,8 @@ protected void setup(Binder binder)
// task execution
jaxrsBinder(binder).bind(TaskResource.class);
newExporter(binder).export(TaskResource.class).withGeneratedName();
jaxrsBinder(binder).bind(TaskExecutorResource.class);
newExporter(binder).export(TaskExecutorResource.class).withGeneratedName();
binder.bind(TaskManagementExecutor.class).in(Scopes.SINGLETON);
binder.bind(SqlTaskManager.class).in(Scopes.SINGLETON);
binder.bind(TaskManager.class).to(Key.get(SqlTaskManager.class));
Expand Down
@@ -0,0 +1,44 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.server;

import com.facebook.presto.execution.executor.TaskExecutor;

import javax.inject.Inject;
import javax.ws.rs.GET;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import javax.ws.rs.core.MediaType;

import static java.util.Objects.requireNonNull;

@Path("/v1/maxActiveSplits")
public class TaskExecutorResource
{
private final TaskExecutor taskExecutor;

@Inject
public TaskExecutorResource(
TaskExecutor taskExecutor)
{
this.taskExecutor = requireNonNull(taskExecutor, "taskExecutor is null");
}

@GET
@Produces(MediaType.TEXT_PLAIN)
public String getMaxActiveSplit()
{
return taskExecutor.getMaxActiveSplitsInfo();
}
}
Expand Up @@ -74,8 +74,10 @@ public void testTasksComplete()
beginPhase.arriveAndAwaitAdvance();
assertEquals(driver1.getCompletedPhases(), 0);
assertEquals(driver2.getCompletedPhases(), 0);
ticker.increment(10, MILLISECONDS);
assertEquals(taskExecutor.getMaxActiveSplitTime(), 10);
ticker.increment(60, SECONDS);
assertEquals(taskExecutor.getRunAwaySplitCount(), 0);
ticker.increment(600, SECONDS);
assertEquals(taskExecutor.getRunAwaySplitCount(), 2);
verificationComplete.arriveAndAwaitAdvance();

// advance one phase and verify
Expand Down Expand Up @@ -129,8 +131,8 @@ public void testTasksComplete()
assertEquals(driver3.getLastPhase(), 12);

// no splits remaining
ticker.increment(30, MILLISECONDS);
assertEquals(taskExecutor.getMaxActiveSplitTime(), 0);
ticker.increment(610, SECONDS);
assertEquals(taskExecutor.getRunAwaySplitCount(), 0);
}
finally {
taskExecutor.stop();
Expand Down

0 comments on commit 5dfd1ed

Please sign in to comment.