Skip to content
Permalink
Browse files

Errors during liveness checking do not terminate TLC.

The error/exception - such as an OutOfMemoryError - shows up on the
console, but does not terminate TLC.  A user thus might miss the
error/exception and incorrectly assume that the spec satisfies the
liveness properties.

The effect of this fix can be studied with unit test
CodePlexBug09EWD749FL2Test that terminates with a meaningful "ran
out of memory" message instead of showing TLC regular success
message despite going out of memory (run test with -Xmx56m).

Fixes Github issue #1
#1

[Bug][TLC][Changelog]
  • Loading branch information
lemmy committed Mar 24, 2020
1 parent 64140e5 commit db269535b7cebccb4270b8a505ac9be605021522
@@ -53,6 +53,7 @@
public static final int GENERAL = 1000;
public static final int SYSTEM_OUT_OF_MEMORY = 1001;
public static final int SYSTEM_OUT_OF_MEMORY_TOO_MANY_INIT = 1002;
public static final int SYSTEM_OUT_OF_MEMORY_LIVENESS = 1003;
public static final int SYSTEM_STACK_OVERFLOW = 1005;

public static final int WRONG_COMMANDLINE_PARAMS_SIMULATOR = 1101;
@@ -339,6 +339,11 @@ private static String getMessage0(int messageClass, int messageCode, String[] pa
+ "pool (heap) may fix this. But it won't help if some state has an enormous\n"
+ "number of successor states, or if TLC must compute the value of a huge set.");
break;
case EC.SYSTEM_OUT_OF_MEMORY_LIVENESS:
b.append("Java ran out of memory during liveness checking. Running Java with a larger memory\n"
+ "allocation pool (heap) may fix this. But it won't help if paths in the liveness graph\n"
+ "have an enormous number of states.");
break;
case EC.SYSTEM_OUT_OF_MEMORY_TOO_MANY_INIT:
b.append("Out Of Memory. There are probably too many initial states.");
break;
@@ -9,6 +9,13 @@
import java.util.Enumeration;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import tlc2.TLC;
import tlc2.TLCGlobals;
@@ -210,26 +217,57 @@ protected int check0(final ITool tool, final boolean finalCheck) throws Interrup
final BlockingQueue<ILiveChecker> queue = new ArrayBlockingQueue<ILiveChecker>(checker.length);
queue.addAll(Arrays.asList(checker));

int slen = checker.length;
int wNum = Math.min(slen, TLCGlobals.getNumWorkers());

if (wNum == 1) {
LiveWorker worker = new LiveWorker(tool, 0, 1, this, queue, finalCheck);
worker.run();
} else {
final LiveWorker[] workers = new LiveWorker[wNum];
for (int i = 0; i < wNum; i++) {
workers[i] = new LiveWorker(tool, i, wNum, this, queue, finalCheck);
workers[i].start();
}
for (int i = 0; i < wNum; i++) {
workers[i].join();

/*
* A LiveWorker below can either complete a unit of work a) without finding a
* liveness violation, b) finds a violation, or c) fails to check because of an
* exception/error (such as going out of memory). In case an LW fails to check,
* we still wait for all other LWs to complete. A subset of the LWs might have
* found a violation. In other words, the OOM of an LW has lower precedence than
* a violation found by another LW. However, if any LW fails to check, we terminate
* model checking after all LWs completed.
*/
final int wNum = Math.min(checker.length, TLCGlobals.getNumWorkers());
final ExecutorService pool = Executors.newFixedThreadPool(wNum);
// CS is really just a container around the set of Futures returned by the pool. It saves us from
// creating a low-level array.
final CompletionService<Boolean> completionService = new ExecutorCompletionService<Boolean>(pool);

for (int i = 0; i < wNum; i++) {
completionService.submit(new LiveWorker(tool, i, wNum, this, queue, finalCheck));
}
// Wait for all LWs to complete.
pool.shutdown();
pool.awaitTermination(Long.MAX_VALUE, TimeUnit.DAYS); // wait forever

// Check if any one of the LWs found a violation (ignore failures for now).
ExecutionException ee = null;
for (int i = 0; i < wNum; i++) {
try {
final Future<Boolean> future = completionService.take();
if (future.get()) {
MP.printMessage(EC.TLC_CHECKING_TEMPORAL_PROPS_END,
TLC.convertRuntimeToHumanReadable(System.currentTimeMillis() - startTime));
return EC.TLC_TEMPORAL_PROPERTY_VIOLATED;
}
} catch (final ExecutionException e) {
// handled below!
ee = e;
}
}

if (LiveWorker.hasErrFound()) {
MP.printMessage(EC.TLC_CHECKING_TEMPORAL_PROPS_END, TLC.convertRuntimeToHumanReadable(System.currentTimeMillis() - startTime));
return EC.TLC_TEMPORAL_PROPERTY_VIOLATED;
// Terminate if any one of the LWs failed c)
if (ee != null) {
final Throwable cause = ee.getCause();
if (cause instanceof OutOfMemoryError) {
MP.printError(EC.SYSTEM_OUT_OF_MEMORY_LIVENESS, cause);
} else if (cause instanceof StackOverflowError) {
MP.printError(EC.SYSTEM_STACK_OVERFLOW, cause);
} else if (cause != null) {
MP.printError(EC.GENERAL, cause);
} else {
MP.printError(EC.GENERAL, ee);
}
System.exit(1);
}

// Reset after checking unless it's the final check:
@@ -21,7 +21,6 @@
import tlc2.tool.EvalException;
import tlc2.tool.ITool;
import tlc2.tool.TLCStateInfo;
import tlc2.util.IdThread;
import tlc2.util.IntStack;
import tlc2.util.LongVec;
import tlc2.util.MemIntQueue;
@@ -39,7 +38,7 @@
* <li>In case of a violation, reconstructs and prints the error trace.</li>
* </ul>
*/
public class LiveWorker extends IdThread {
public class LiveWorker implements Callable<Boolean> {

/**
* A marker that is pushed onto the dfsStack during SCC depth-first-search
@@ -68,27 +67,33 @@

private final ITool tool;

private final int id;

public LiveWorker(final ITool tool, int id, int numWorkers, final ILiveCheck liveCheck, final BlockingQueue<ILiveChecker> queue, final boolean finalCheck) {
super(id);
this.id = id;
this.tool = tool;
this.numWorkers = numWorkers;
this.liveCheck = liveCheck;
this.queue = queue;
this.isFinalCheck = finalCheck;

// Set the name to something more indicative than "Thread-4711".
this.setName("TLCLiveWorkerThread-" + String.format("%03d", id));
}

/**
* Returns true iff an error has already been found.
*/
public static boolean hasErrFound() {
private static boolean hasErrFound() {
synchronized (workerLock) {
return (errFoundByThread != -1);
}
}

// True iff this LiveWorker found a liveness violation.zs
private static boolean hasErrFound(final int id) {
synchronized (workerLock) {
return (errFoundByThread == id);
}
}

/**
* Returns true iff either an error has not been found or the error is found
* by this thread.
@@ -99,9 +104,9 @@ public static boolean hasErrFound() {
private/* static synchronized */boolean setErrFound() {
synchronized (workerLock) {
if (errFoundByThread == -1) {
errFoundByThread = this.myGetId(); // GetId();
errFoundByThread = this.id; // GetId();
return true;
} else if (errFoundByThread == this.myGetId()) { // (* GetId()) {
} else if (errFoundByThread == this.id) { // (* GetId()) {
return true;
}
return false;
@@ -198,8 +203,8 @@ private final void checkSccs(final ITool tool) throws IOException, InterruptedEx
final int alen = this.oos.getCheckAction().length;

// Tarjan's stack
// Append thread id to name for unique disk files during concurrent SCC search
final IntStack dfsStack = getStack(liveCheck.getMetaDir(), "dfs" + this.myGetId());
// Append thread id to name for unique disk files during concurrent SCC search
final IntStack dfsStack = getStack(liveCheck.getMetaDir(), "dfs" + this.id);

// comStack is only being added to during the deep first search. It is passed
// to the checkComponent method while in DFS though. Note that the nodes pushed
@@ -208,7 +213,7 @@ private final void checkSccs(final ITool tool) throws IOException, InterruptedEx
//
// See tlc2.tool.liveness.LiveWorker.DetailedFormatter.toString(MemIntStack)
// which is useful during debugging.
final IntStack comStack = getStack(liveCheck.getMetaDir(), "com" + this.myGetId());
final IntStack comStack = getStack(liveCheck.getMetaDir(), "com" + this.id);

// Generate the SCCs and check if they contain a "bad" cycle.
while (nodeQueue.size() > 0) {
@@ -1212,49 +1217,39 @@ private GraphNode dfsPostFix(final long state, final int tidx, final TableauNode
return curNode;
}

/* (non-Javadoc)
* @see java.lang.Thread#run()
*/
public final void run() {
try {
while (true) {
// Use poll() to get the next checker from the queue or null if
// there is none. Do *not* block when there are no more checkers
// available. Nobody is going to add new checkers to the queue.
final ILiveChecker checker = queue.poll();
if (checker == null || hasErrFound()) {
// Another thread has either found an error (violation of a
// liveness property) OR there is no more work (checker) to
// be done.
break;
}
public final Boolean call() throws IOException, InterruptedException, ExecutionException {
while (true) {
// Use poll() to get the next checker from the queue or null if
// there is none. Do *not* block when there are no more checkers
// available. Nobody is going to add new checkers to the queue.
final ILiveChecker checker = queue.poll();
if (checker == null || hasErrFound()) {
// Another thread has either found an error (violation of a
// liveness property) OR there is no more work (checker) to
// be done.
break;
}

this.oos = checker.getSolution();
this.dg = checker.getDiskGraph();
this.dg.createCache();
PossibleErrorModel[] pems = this.oos.getPems();
for (int i = 0; i < pems.length; i++) {
if (!hasErrFound()) {
this.pem = pems[i];
this.checkSccs(tool);
}
this.oos = checker.getSolution();
this.dg = checker.getDiskGraph();
this.dg.createCache();
PossibleErrorModel[] pems = this.oos.getPems();
for (int i = 0; i < pems.length; i++) {
if (!hasErrFound()) {
this.pem = pems[i];
this.checkSccs(tool);
}
this.dg.destroyCache();
// Record the size of the disk graph at the time its checked. This
// information is later used to decide if it it makes sense to
// run the next check on the larger but still *partial* graph.
this.dg.recordSize();
// If assertions are on (e.g. during unit testing) make sure
// that the disk graph's invariants hold.
assert this.dg.checkInvariants(oos.getCheckState().length, oos.getCheckAction().length);
}
} catch (Exception e) {
MP.printError(EC.GENERAL, "checking liveness", e); // LL changed
// call 7 April
// 2012
// Assert.printStack(e);
return;
this.dg.destroyCache();
// Record the size of the disk graph at the time its checked. This
// information is later used to decide if it it makes sense to
// run the next check on the larger but still *partial* graph.
this.dg.recordSize();
// If assertions are on (e.g. during unit testing) make sure
// that the disk graph's invariants hold.
assert this.dg.checkInvariants(oos.getCheckState().length, oos.getCheckAction().length);
}
return hasErrFound(this.id);
}

public String toDotViz(final long state, final int tidx, TableauNodePtrTable tnpt) throws IOException {
@@ -111,11 +111,6 @@ public void resetElems() {

/* Double the table when the table is full by the threshhold. */
private final void grow() {
// TODO This grows unbound up to an OOM exception or fails to increase
// the array when length reaches Integer.MAX_VALUE.
// In case of Integer.MAX_VALUE, all subsequent put(long,long)
// invocations will fail to put the new key causing TLC to live-lock.
// This can possibly be replaced with a variant of DiskFPSet.
try {
final int newLength = 2 * this.length + 1;
final long[] oldKeys = this.keys;
@@ -147,9 +142,12 @@ private final void grow() {
}
this.length = newLength;
this.thresh = (int) (newLength * 0.75);
} catch (Throwable t) {
} catch (OutOfMemoryError t) {
// Handle OOM error locally because grow is on the code path of safety checking
// (LiveCheck#addInit/addNext...).
System.gc();
MP.printError(EC.SYSTEM_OUT_OF_MEMORY, t);
System.exit(1);
}
}

0 comments on commit db26953

Please sign in to comment.
You can’t perform that action at this time.