diff --git a/include/taco/lower/lowerer_impl.h b/include/taco/lower/lowerer_impl.h
index afc936145..52fd91165 100644
--- a/include/taco/lower/lowerer_impl.h
+++ b/include/taco/lower/lowerer_impl.h
@@ -383,6 +383,11 @@ class LowererImpl : public util::Uncopyable {
   /// tensors, instead of the full tensor.
   ir::Expr searchForStartOfWindowPosition(Iterator iterator, ir::Expr start, ir::Expr end);
 
+  /// Expression that returns the end of a window to iterate over
+  /// in a compressed iterator. It is used when operating over windows of
+  /// tensors, instead of the full tensor.
+  ir::Expr searchForEndOfWindowPosition(Iterator iterator, ir::Expr start, ir::Expr end);
+
   /// Statement that guards against going out of bounds of the window that
   /// the input iterator was configured with.
   ir::Stmt upperBoundGuardForWindowPosition(Iterator iterator, ir::Expr access);
diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp
index d2ca6225c..25a3593ab 100644
--- a/src/lower/lowerer_impl.cpp
+++ b/src/lower/lowerer_impl.cpp
@@ -1022,7 +1022,12 @@ Stmt LowererImpl::lowerForallPosition(Forall forall, Iterator iterator,
     // variable from the windowed space.
     if (iterator.isWindowed()) {
       coordinateArray = this->projectWindowedPositionToCanonicalSpace(iterator, coordinateArray);
-      boundsGuard = this->upperBoundGuardForWindowPosition(iterator, coordinate);
+      // If this forall is being parallelized via CPU threads (OpenMP), then we can't
+      // emit a `break` statement, since OpenMP doesn't support breaking out of a
+      // parallel loop. Instead, we'll bound the top of the loop and omit the check.
+      if (forall.getParallelUnit() != ParallelUnit::CPUThread) {
+        boundsGuard = this->upperBoundGuardForWindowPosition(iterator, coordinate);
+      }
     }
     declareCoordinate = VarDecl::make(coordinate, coordinateArray);
   }
@@ -1060,7 +1065,14 @@ Stmt LowererImpl::lowerForallPosition(Forall forall, Iterator iterator,
     // If we have a window on this iterator, then search for the start of
     // the window rather than starting at the beginning of the level.
     if (iterator.isWindowed()) {
-      startBound = this->searchForStartOfWindowPosition(iterator, startBound, endBound);
+      auto startBoundCopy = startBound;
+      startBound = this->searchForStartOfWindowPosition(iterator, startBound, endBound);
+      // As discussed above, if this position loop is parallelized over CPU
+      // threads (OpenMP), then we need to have an explicit upper bound to
+      // the for loop, instead of breaking out of the loop in the middle.
+      if (forall.getParallelUnit() == ParallelUnit::CPUThread) {
+        endBound = this->searchForEndOfWindowPosition(iterator, startBoundCopy, endBound);
+      }
     }
   } else {
    taco_iassert(iterator.isOrdered() && iterator.getParent().isOrdered());
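To make the effect of these two hunks concrete, here is a rough sketch of the kernel shape the lowerer aims to emit for a position loop over a window [windowLo, windowHi) of a compressed vector under ParallelUnit::CPUThread. This is not code from the diff; all names (pos, crd, vals, windowLo, windowHi, sumWindow) are invented for illustration, and only taco_binarySearchAfter is an actual TACO runtime helper (its assumed contract is sketched after the next hunk):

// Sketch of emitted code (hypothetical names): sum the values inside a
// window [windowLo, windowHi) of a compressed vector level.
void sumWindow(const int* pos, int* crd, const double* vals,
               int windowLo, int windowHi, double* out) {
  // A sequential kernel can guard the loop body and `break` once crd[p]
  // walks past the window, but OpenMP forbids breaking out of a parallel
  // loop, so both ends of the window are located up front by binary search.
  int pStart = taco_binarySearchAfter(crd, pos[0], pos[1], windowLo);
  int pEnd = taco_binarySearchAfter(crd, pos[0], pos[1], windowHi);
  double sum = 0.0;
  #pragma omp parallel for reduction(+ : sum)
  for (int p = pStart; p < pEnd; p++) {
    sum += vals[p];  // crd[p] - windowLo is the canonical coordinate
  }
  *out = sum;
}

The sequential path is unchanged by the diff: it still emits only the start search plus the upper-bound guard.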
@@ -2795,6 +2807,19 @@ Expr LowererImpl::searchForStartOfWindowPosition(Iterator iterator, ir::Expr sta
   return Call::make("taco_binarySearchAfter", args, Datatype::UInt64);
 }
 
+Expr LowererImpl::searchForEndOfWindowPosition(Iterator iterator, ir::Expr start, ir::Expr end) {
+  taco_iassert(iterator.isWindowed());
+  vector<Expr> args = {
+      // Search over the `crd` array of the level,
+      iterator.getMode().getModePack().getArray(1),
+      // between the start and end position,
+      start, end,
+      // for the end of the window.
+      iterator.getWindowUpperBound(),
+  };
+  return Call::make("taco_binarySearchAfter", args, Datatype::UInt64);
+}
+
 Stmt LowererImpl::upperBoundGuardForWindowPosition(Iterator iterator, ir::Expr access) {
   taco_iassert(iterator.isWindowed());
   return ir::IfThenElse::make(
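Both the start and end searches bottom out in the taco_binarySearchAfter runtime helper. For reference, here is a sketch of the contract assumed above; this is a paraphrase of the helper's behavior, not the implementation shipped in TACO's runtime header:

// Returns the smallest position in [arrayStart, arrayEnd) whose value is
// >= target, or arrayEnd if every value is smaller. Requires `array` to
// be sorted, which holds for the `crd` array of an ordered level.
int taco_binarySearchAfter(int* array, int arrayStart, int arrayEnd, int target) {
  if (arrayStart >= arrayEnd || array[arrayStart] >= target) {
    return arrayStart;
  }
  int lo = arrayStart;  // invariant: array[lo] < target
  int hi = arrayEnd;    // invariant: the answer lies in (lo, hi]
  while (hi - lo > 1) {
    int mid = lo + (hi - lo) / 2;
    if (array[mid] < target) {
      lo = mid;
    } else {
      hi = mid;
    }
  }
  return hi;
}

Searching with the window's upper bound as the target therefore yields an exclusive end position, which is exactly what the bounded OpenMP loop needs.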
diff --git a/test/tests-windowing.cpp b/test/tests-windowing.cpp
index 95e75d34e..ae50b1271 100644
--- a/test/tests-windowing.cpp
+++ b/test/tests-windowing.cpp
@@ -12,14 +12,14 @@ using namespace taco;
 // tensor with a mix of IndexVars and WindowedIndexVars.
 TEST(windowing, mixIndexing) {
   auto dim = 10;
-  Tensor<int> a("a", {dim, dim, dim, dim, dim}, {Dense, Dense, Dense, Dense, Dense});
+  Tensor<int> a("a", {dim, dim, dim, dim, dim}, Format{Dense, Dense, Dense, Dense, Dense});
   IndexVar i, j, k, l, m;
   auto w1 = a(i, j(1, 3), k, l(4, 5), m(6, 7));
   auto w2 = a(i(1, 3), j(2, 4), k, l, m(3, 5));
 }
 
 TEST(windowing, boundsChecks) {
-  Tensor<int> a("a", {5}, {Dense});
+  Tensor<int> a("a", {5}, Format{Dense});
   IndexVar i("i");
   ASSERT_THROWS_EXCEPTION_WITH_ERROR([&]() { a(i(-1, 4)); }, "slice lower bound");
   ASSERT_THROWS_EXCEPTION_WITH_ERROR([&]() { a(i(0, 10)); }, "slice upper bound");
@@ -29,10 +29,10 @@ TEST(windowing, boundsChecks) {
 // in the same expression.
 TEST(windowing, sliceMultipleWays) {
   auto dim = 10;
-  Tensor<int> a("a", {dim}, {Dense});
-  Tensor<int> b("b", {dim}, {Sparse});
-  Tensor<int> c("c", {dim}, {Dense});
-  Tensor<int> expected("expected", {dim}, {Dense});
+  Tensor<int> a("a", {dim}, Format{Dense});
+  Tensor<int> b("b", {dim}, Format{Sparse});
+  Tensor<int> c("c", {dim}, Format{Dense});
+  Tensor<int> expected("expected", {dim}, Format{Dense});
   for (int i = 0; i < dim; i++) {
     a.insert({i}, i);
     b.insert({i}, i);
@@ -51,28 +51,28 @@ TEST(windowing, sliceMultipleWays) {
 // of the input tensors and formats for each of the tensors in the computation.
 struct basic : public TestWithParam<std::tuple<int, ModeFormat, ModeFormat, ModeFormat>> {};
 TEST_P(basic, windowing){
-  Tensor<int> expectedAdd("expectedAdd", {2, 2}, {Dense, Dense});
+  Tensor<int> expectedAdd("expectedAdd", {2, 2}, Format{Dense, Dense});
   expectedAdd.insert({0, 0}, 14);
   expectedAdd.insert({0, 1}, 17);
   expectedAdd.insert({1, 0}, 17);
   expectedAdd.insert({1, 1}, 20);
   expectedAdd.pack();
-  Tensor<int> expectedMul("expectedMul", {2, 2}, {Dense, Dense});
+  Tensor<int> expectedMul("expectedMul", {2, 2}, Format{Dense, Dense});
   expectedMul.insert({0, 0}, 64);
   expectedMul.insert({0, 1}, 135);
   expectedMul.insert({1, 0}, 135);
   expectedMul.insert({1, 1}, 240);
   expectedMul.pack();
 
-  Tensor<int> d("d", {2, 2}, {Dense, Dense});
+  Tensor<int> d("d", {2, 2}, Format{Dense, Dense});
   // The test is parameterized by a dimension, and formats for the different tensors.
   auto dim = std::get<0>(GetParam());
   auto x = std::get<1>(GetParam());
   auto y = std::get<2>(GetParam());
   auto z = std::get<3>(GetParam());
-  Tensor<int> a("a", {dim, dim}, {Dense, x});
-  Tensor<int> b("b", {dim, dim}, {Dense, y});
-  Tensor<int> c("c", {dim, dim}, {Dense, z});
+  Tensor<int> a("a", {dim, dim}, Format{Dense, x});
+  Tensor<int> b("b", {dim, dim}, Format{Dense, y});
+  Tensor<int> c("c", {dim, dim}, Format{Dense, z});
   for (int i = 0; i < dim; i++) {
     for (int j = 0; j < dim; j++) {
       a.insert({i, j}, i + j);
@@ -111,7 +111,7 @@ INSTANTIATE_TEST_CASE_P(
 struct slicedOutput : public TestWithParam<std::tuple<ModeFormat, ModeFormat>> {};
 TEST_P(slicedOutput, windowing) {
   auto dim = 10;
-  Tensor<int> expected("expected", {10, 10}, {Dense, Dense});
+  Tensor<int> expected("expected", {10, 10}, Format{Dense, Dense});
   expected.insert({8, 8}, 12);
   expected.insert({8, 9}, 14);
   expected.insert({9, 8}, 14);
@@ -119,9 +119,9 @@ TEST_P(slicedOutput, windowing) {
   expected.pack();
   auto x = std::get<0>(GetParam());
   auto y = std::get<1>(GetParam());
-  Tensor<int> a("a", {dim, dim}, {Dense, x});
-  Tensor<int> b("b", {dim, dim}, {Dense, y});
-  Tensor<int> c("c", {dim, dim}, {Dense, Dense});
+  Tensor<int> a("a", {dim, dim}, Format{Dense, x});
+  Tensor<int> b("b", {dim, dim}, Format{Dense, y});
+  Tensor<int> c("c", {dim, dim}, Format{Dense, Dense});
   for (int i = 0; i < dim; i++) {
     for (int j = 0; j < dim; j++) {
       a.insert({i, j}, i + j);
@@ -152,15 +152,15 @@ TEST_P(matrixMultiply, windowing) {
   auto dim = 10;
   auto windowDim = 4;
 
-  Tensor<int> a("a", {windowDim, windowDim}, {Dense, Dense});
-  Tensor<int> b("b", {windowDim, windowDim}, {Dense, Dense});
-  Tensor<int> c("c", {windowDim, windowDim}, {Dense, Dense});
-  Tensor<int> expected("expected", {windowDim, windowDim}, {Dense, Dense});
+  Tensor<int> a("a", {windowDim, windowDim}, Format{Dense, Dense});
+  Tensor<int> b("b", {windowDim, windowDim}, Format{Dense, Dense});
+  Tensor<int> c("c", {windowDim, windowDim}, Format{Dense, Dense});
+  Tensor<int> expected("expected", {windowDim, windowDim}, Format{Dense, Dense});
 
   auto x = std::get<0>(GetParam());
   auto y = std::get<1>(GetParam());
-  Tensor<int> aw("aw", {dim, dim}, {Dense, x});
-  Tensor<int> bw("bw", {dim, dim}, {Dense, y});
+  Tensor<int> aw("aw", {dim, dim}, Format{Dense, x});
+  Tensor<int> bw("bw", {dim, dim}, Format{Dense, y});
   for (int i = 0; i < dim; i++) {
     for (int j = 0; j < dim; j++) {
       aw.insert({i, j}, i + j);
@@ -198,17 +198,17 @@ struct workspace : public TestWithParam<std::tuple<ModeFormat, ModeFormat>> {};
 TEST_P(workspace, windowing) {
   auto dim = 10;
   size_t windowDim = 4;
-  Tensor<int> d("d", {static_cast<int>(windowDim)}, {Dense});
-  Tensor<int> expected("expected", {static_cast<int>(windowDim)}, {Dense});
+  Tensor<int> d("d", {static_cast<int>(windowDim)}, Format{Dense});
+  Tensor<int> expected("expected", {static_cast<int>(windowDim)}, Format{Dense});
   expected.insert({0}, 8); expected.insert({1}, 11);
   expected.insert({2}, 14); expected.insert({3}, 17);
   expected.pack();
 
   auto x = std::get<0>(GetParam());
   auto y = std::get<1>(GetParam());
-  Tensor<int> a("a", {dim}, {x});
-  Tensor<int> b("b", {dim}, {y});
-  Tensor<int> c("c", {dim}, {Dense});
+  Tensor<int> a("a", {dim}, Format{x});
+  Tensor<int> b("b", {dim}, Format{y});
+  Tensor<int> c("c", {dim}, Format{Dense});
   for (int i = 0; i < dim; i++) {
     a.insert({i}, i);
     b.insert({i}, i);
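The transformations test below covers windowing under scheduling commands. As a point of reference for the lowering changes above, a schedule that would drive the new CPUThread path might look like the following sketch. This is hypothetical driver code, not taken from this diff; it assumes TACO's IndexStmt::pos and IndexStmt::parallelize scheduling API and that pos accepts the windowed access:

// Hypothetical example: iterate a window of a sparse vector by position
// and parallelize that position loop over CPU threads, which makes the
// lowerer emit an explicit end bound instead of a `break`.
Tensor<int> a("a", {10}, Format{Sparse});
Tensor<int> b("b", {10}, Format{Dense});
Tensor<int> c("c", {6}, Format{Dense});
IndexVar i("i"), ipos("ipos");
c(i) = a(i(2, 8)) + b(i(2, 8));
IndexStmt stmt = c.getAssignment().concretize();
stmt = stmt.pos(i, ipos, a(i(2, 8)))
           .parallelize(ipos, ParallelUnit::CPUThread,
                        OutputRaceStrategy::NoRaces);
c.compile(stmt);
c.assemble();
c.compute();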
@@ -237,7 +237,7 @@ INSTANTIATE_TEST_CASE_P(
 // transformations and different mode formats.
 TEST(windowing, transformations) {
   auto dim = 10;
-  Tensor<int> expected("expected", {2, 2}, {Dense, Dense});
+  Tensor<int> expected("expected", {2, 2}, Format{Dense, Dense});
   expected.insert({0, 0}, 12);
   expected.insert({0, 1}, 14);
   expected.insert({1, 0}, 14);
@@ -300,20 +300,20 @@ TEST_P(assignment, windowing) {
   IndexVar i, j;
 
   // First assign a window of A to a window of B.
-  Tensor<int> B("B", {dim, dim}, {Dense, Dense});
+  Tensor<int> B("B", {dim, dim}, Format{Dense, Dense});
   B(i(2, 4), j(3, 5)) = A(i(4, 6), j(5, 7));
   B.evaluate();
-  Tensor<int> expected("expected", {dim, dim}, {Dense, Dense});
+  Tensor<int> expected("expected", {dim, dim}, Format{Dense, Dense});
   expected.insert({2, 3}, 9); expected.insert({2, 4}, 10);
   expected.insert({3, 3}, 10); expected.insert({3, 4}, 11);
   expected.pack();
   ASSERT_TRUE(equals(B, expected)) << B << std::endl << expected << std::endl;
 
   // Assign a window of A to b.
-  B = Tensor<int>("B", {2, 2}, {Dense, Dense});
+  B = Tensor<int>("B", {2, 2}, Format{Dense, Dense});
   B(i, j) = A(i(4, 6), j(5, 7));
   B.evaluate();
-  expected = Tensor<int>("expected", {2, 2}, {Dense, Dense});
+  expected = Tensor<int>("expected", {2, 2}, Format{Dense, Dense});
   expected.insert({0, 0}, 9); expected.insert({0, 1}, 10);
   expected.insert({1, 0}, 10); expected.insert({1, 1}, 11);
   expected.pack();
@@ -324,10 +324,10 @@ TEST_P(assignment, windowing) {
   A.insert({0, 0}, 0); A.insert({0, 1}, 1);
   A.insert({1, 0}, 1); A.insert({1, 1}, 2);
   A.pack();
-  B = Tensor<int>("B", {dim, dim}, {Dense, Dense});
+  B = Tensor<int>("B", {dim, dim}, Format{Dense, Dense});
   B(i(4, 6), j(5, 7)) = A(i, j);
   B.evaluate();
-  expected = Tensor<int>("expected", {dim, dim}, {Dense, Dense});
+  expected = Tensor<int>("expected", {dim, dim}, Format{Dense, Dense});
   expected.insert({4, 5}, 0); expected.insert({4, 6}, 1);
   expected.insert({5, 5}, 1); expected.insert({5, 6}, 2);
   expected.pack();
@@ -347,16 +347,16 @@ TEST_P(cuda, windowing) {
     return;
   }
   auto dim = 10;
-  Tensor<int> expected("expected", {2, 2}, {Dense, Dense});
+  Tensor<int> expected("expected", {2, 2}, Format{Dense, Dense});
   expected.insert({0, 0}, 12);
   expected.insert({0, 1}, 14);
   expected.insert({1, 0}, 14);
   expected.insert({1, 1}, 16);
   expected.pack();
   auto x = std::get<0>(GetParam());
   auto y = std::get<1>(GetParam());
-  Tensor<int> a("a", {dim, dim}, {Dense, x});
-  Tensor<int> b("b", {dim, dim}, {Dense, y});
-  Tensor<int> c("c", {2, 2}, {Dense, Dense});
+  Tensor<int> a("a", {dim, dim}, Format{Dense, x});
+  Tensor<int> b("b", {dim, dim}, Format{Dense, y});
+  Tensor<int> c("c", {2, 2}, Format{Dense, Dense});
   for (int i = 0; i < dim; i++) {
     for (int j = 0; j < dim; j++) {