fix conjugate-transpose for matrices of certain sizes (issue #19200) #46736

Status: Merged (4 commits, Mar 4, 2021)
tensorflow/core/kernels/conv_2d_gpu.h (61 changes: 35 additions & 26 deletions)

@@ -695,21 +695,21 @@ constexpr bool TileSizeOnNonLongSideFrontier(int TileLongSide,
 }
 
 // Helper function to launch a batch narrow matrix transpose kernel.
-template <typename T, int TileLongSide, int TileShortSide>
+template <typename T, int TileLongSide, int TileShortSide, bool conjugate>
 void LaunchBatchNarrowMatrixTransposeKernel(
     const GPUDevice& d, int tile_size_i, int tile_size_j, int total_tiles_count,
     const T* input, const Dimension<3>& input_dims, T* output) {
   constexpr int NumThreads = TileLongSide;
   if (tile_size_i <= TileLongSide && tile_size_j <= TileShortSide) {
     TF_CHECK_OK(GpuLaunchKernel(
         SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileLongSide,
-                                              TileShortSide>,
+                                              TileShortSide, conjugate>,
         total_tiles_count, NumThreads, 0, d.stream(), input, input_dims,
         output));
   } else {
     TF_CHECK_OK(GpuLaunchKernel(
         SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileShortSide,
-                                              TileLongSide>,
+                                              TileLongSide, conjugate>,
         total_tiles_count, NumThreads, 0, d.stream(), input, input_dims,
         output));
   }
@@ -731,7 +731,7 @@ void LaunchBatchNarrowMatrixTransposeKernel(
 // can only increment the short side len.
 // - It lies on the long side frontier. We launch the kernel without checking if
 // the request is satisfied or not.
-template <typename T, int TileLongSide, int TileShortSide,
+template <typename T, int TileLongSide, int TileShortSide, bool conjugate,
           typename dummy = void>
 struct BatchNarrowMatrixTransposeDispatcher {
   static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j,
@@ -745,7 +745,8 @@ struct BatchNarrowMatrixTransposeDispatcher {
         std::min(tile_size_i, tile_size_j) <= TileShortSide;
 
     if (request_satisfied) {
-      LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
+      LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide,
+                                             conjugate>(
           d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
           output);
       return;
@@ -758,22 +759,26 @@
         std::max(tile_size_i, tile_size_j) > TileLongSide;
 
     if (long_side_request_not_satisfied) {
-      BatchNarrowMatrixTransposeDispatcher<
-          T, TileLongSide * 2, TileShortSide>::DoIt(d, tile_size_i, tile_size_j,
-                                                    total_tiles_count, input,
-                                                    input_dims, output);
+      BatchNarrowMatrixTransposeDispatcher<T, TileLongSide * 2, TileShortSide,
+                                           conjugate>::DoIt(d, tile_size_i,
+                                                            tile_size_j,
+                                                            total_tiles_count,
+                                                            input, input_dims,
+                                                            output);
     } else {
-      BatchNarrowMatrixTransposeDispatcher<
-          T, TileLongSide, TileShortSide + 1>::DoIt(d, tile_size_i, tile_size_j,
-                                                    total_tiles_count, input,
-                                                    input_dims, output);
+      BatchNarrowMatrixTransposeDispatcher<T, TileLongSide, TileShortSide + 1,
+                                           conjugate>::DoIt(d, tile_size_i,
+                                                            tile_size_j,
+                                                            total_tiles_count,
+                                                            input, input_dims,
+                                                            output);
     }
   }
 };
 
-template <typename T, int TileLongSide, int TileShortSide>
+template <typename T, int TileLongSide, int TileShortSide, bool conjugate>
 struct BatchNarrowMatrixTransposeDispatcher<
-    T, TileLongSide, TileShortSide,
+    T, TileLongSide, TileShortSide, conjugate,
     typename std::enable_if<TileSizeOnNonLongSideFrontier(
                                 TileLongSide, TileShortSide, sizeof(T)),
                             void>::type> {
@@ -788,7 +793,8 @@ struct BatchNarrowMatrixTransposeDispatcher<
         std::min(tile_size_i, tile_size_j) <= TileShortSide;
 
     if (request_satisfied) {
-      LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
+      LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide,
+                                             conjugate>(
           d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
           output);
       return;
@@ -797,16 +803,18 @@
     // If the execution reaches here, then the kernel was not launched; since
     // we are on the non long side frontier, we increment the short dimension
     // and try again.
-    BatchNarrowMatrixTransposeDispatcher<
-        T, TileLongSide, TileShortSide + 1>::DoIt(d, tile_size_i, tile_size_j,
-                                                  total_tiles_count, input,
-                                                  input_dims, output);
+    BatchNarrowMatrixTransposeDispatcher<T, TileLongSide, TileShortSide + 1,
+                                         conjugate>::DoIt(d, tile_size_i,
+                                                          tile_size_j,
+                                                          total_tiles_count,
+                                                          input, input_dims,
+                                                          output);
   }
 };
 
-template <typename T, int TileLongSide, int TileShortSide>
+template <typename T, int TileLongSide, int TileShortSide, bool conjugate>
 struct BatchNarrowMatrixTransposeDispatcher<
-    T, TileLongSide, TileShortSide,
+    T, TileLongSide, TileShortSide, conjugate,
     typename std::enable_if<TileSizeOnLongSideFrontier(
                                 TileLongSide, TileShortSide, sizeof(T)),
                             void>::type> {
@@ -817,7 +825,8 @@ struct BatchNarrowMatrixTransposeDispatcher<
         (TileLongSide & (TileLongSide - 1)) == 0,
         "The length of the longer side of the tile is always a power of 2.");
 
-    LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
+    LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide,
+                                           conjugate>(
         d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
         output);
   }
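
An aside on the structure being modified: the DoIt overloads above implement a compile-time search over tile shapes. The general case doubles the long side while the request exceeds it, or bumps the short side by one; the enable_if specializations handle the frontier cases, and on the long-side frontier the kernel is launched without further checks. The PR's change is to thread a bool conjugate template parameter through every step of that recursion. Below is a standalone sketch of the same pattern with invented names and the two frontier predicates collapsed into one simplified check (the real predicates, such as TileSizeOnLongSideFrontier, encode shared-memory limits); it is an illustration, not the PR's code. Compile with -std=c++14.

#include <cstdio>
#include <type_traits>

// Invented stand-in for the real frontier predicates.
constexpr bool OnFrontier(int LongSide, int ShortSide) {
  return LongSide >= 8 || ShortSide >= 4;
}

// General case: launch if the requested tile fits, otherwise grow one side
// and recurse. The conjugate parameter is threaded through unchanged, which
// is what this PR adds to the real dispatcher.
template <int LongSide, int ShortSide, bool conjugate, typename = void>
struct Dispatcher {
  static void DoIt(int req_long, int req_short) {
    if (req_long <= LongSide && req_short <= ShortSide) {
      std::printf("launch<%d,%d,conj=%d>\n", LongSide, ShortSide, conjugate);
      return;
    }
    if (req_long > LongSide) {
      Dispatcher<LongSide * 2, ShortSide, conjugate>::DoIt(req_long, req_short);
    } else {
      Dispatcher<LongSide, ShortSide + 1, conjugate>::DoIt(req_long, req_short);
    }
  }
};

// Frontier case: stop recursing and launch without further checks, like the
// long-side-frontier specialization in the real code.
template <int LongSide, int ShortSide, bool conjugate>
struct Dispatcher<
    LongSide, ShortSide, conjugate,
    typename std::enable_if<OnFrontier(LongSide, ShortSide)>::type> {
  static void DoIt(int, int) {
    std::printf("launch<%d,%d,conj=%d> (frontier)\n", LongSide, ShortSide,
                conjugate);
  }
};

int main() {
  // A 6x3 request starting from a 2x2 candidate doubles the long side twice
  // (2 -> 4 -> 8) and lands on the frontier.
  Dispatcher<2, 2, true>::DoIt(6, 3);
  return 0;
}
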
@@ -890,11 +899,11 @@ struct TransposeElemType<4> {
 };
 template <>
 struct TransposeElemType<8> {
-  using type = uint64;
+  using type = float2;

[Review thread on the line "using type = float2;"]

allenlavoie (Member), Feb 22, 2021:
Is this necessary for correctness? Would we run into similar issues with complex<float16> and TransposeElemType<4>?

Contributor (PR author):
Yes, this is necessary. Instead of simply conjugating, we call maybe_conj, which is specialized for the complex types on GPU (float2 and double2).

I thought about TransposeElemType<4> when fixing this, and came to the conclusion that TensorFlow doesn't have a complex type that uses float16; std::complex<float16> doesn't exist, as far as I can tell.

To be honest, I'm not entirely sure why TransposeElemType is there in the first place. If I had to guess, I would say it exists to make this code work for non-builtin types (i.e., types other than TensorFlow's built-in numeric types). However, it is only used in one very specific place, so it would fail to achieve that.

Thanks for the comment; it made me notice I hadn't fixed the issue for complex128, but that is sorted now.

Member:
Oh, right, we just have DT_COMPLEX64 and DT_COMPLEX128. Sounds OK then. If you want to get rid of TransposeElemType, that's fine too (we might find a failing test).

I do think it's another good reason to have a unit test. If someone comes along and adds a smaller complex type, it would at least be possible to find and fix this issue.

[End of review thread]

 };
 template <>
 struct TransposeElemType<16> {
-  using type = float4;
+  using type = double2;
 };
 
 // A helper function to make RunSwapDimension1And2InTensor3 concise. This
@@ -975,7 +984,7 @@ void SwapDimension1And2InTensor3WithNarrowMatrices(
 
   using ElemType = typename TransposeElemType<sizeof(T)>::type;
   static_assert(alignof(T) >= alignof(ElemType), "Unexpected data alignment.");
-  BatchNarrowMatrixTransposeDispatcher<ElemType, 32, 2>::DoIt(
+  BatchNarrowMatrixTransposeDispatcher<ElemType, 32, 2, conjugate>::DoIt(
       d, requested_tile_size_i, requested_tile_size_j, total_tiles_count,
       reinterpret_cast<const ElemType*>(input), input_dims,
       reinterpret_cast<ElemType*>(output));
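
The review thread above says conjugation happens through maybe_conj, specialized for the GPU complex payload types float2 and double2; that definition is not part of this diff. The following is a minimal sketch of such a helper, assuming a no-op generic case and a float2 specialization (CUDA's built-in float2 is replicated as a plain struct so the sketch compiles on the host; this is an illustration, not TensorFlow's literal code):

#include <cassert>

struct float2 { float x, y; };  // host stand-in for CUDA's built-in float2

// Generic case: "conjugating" a real or opaque element type is a no-op, so
// the extra template parameter costs nothing for float/double/uint tensors.
template <typename T, bool conjugate>
struct maybe_conj {
  static T run(T v) { return v; }
};

// complex64 data is transposed as float2 (see TransposeElemType<8> above),
// so a float2 specialization is where conjugation actually happens; double2
// would get the analogous specialization for complex128.
template <>
struct maybe_conj<float2, true> {
  static float2 run(float2 v) { return {v.x, -v.y}; }
};

int main() {
  float2 z{1.0f, 2.0f};
  float2 zc = maybe_conj<float2, true>::run(z);   // imaginary part negated
  float2 zn = maybe_conj<float2, false>::run(z);  // passed through unchanged
  assert(zc.x == 1.0f && zc.y == -2.0f);
  assert(zn.x == 1.0f && zn.y == 2.0f);
  return 0;
}
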
tensorflow/python/kernel_tests/array_ops_test.py (19 changes: 19 additions & 0 deletions)

@@ -122,6 +122,25 @@ def testTensorWithStaticRankLessThanTwoRaisesBecauseNotAMatrix(self):
     with self.assertRaisesRegex(ValueError, "should be a "):
       array_ops.matrix_transpose(vector)
 
+  def testNarrowMatrixConjugateTranspose(self):
+    for dtype in (dtypes.float32, dtypes.float64):
+      for conjugate in (True, False):
+        with self.subTest(complex_type=dtype, conjugate=conjugate):
+          vector = math_ops.complex(constant_op.constant(0, dtype=dtype),
+                                    math_ops.range(96, dtype=dtype))
+          column_vector = array_ops.expand_dims(vector, axis=-1)
+          row_vector = array_ops.expand_dims(vector, axis=0)
+          narrow_matrix = array_ops.tile(column_vector, [1, 2])  # [96, 2]
+          expected_transposed = array_ops.tile(row_vector, [2, 1])  # [2, 96]
+          if conjugate:
+            expected_transposed = -expected_transposed
+
+          transposed = array_ops.matrix_transpose(narrow_matrix,
+                                                  conjugate=conjugate)
+
+          self.assertEqual((2, 96), transposed.get_shape())
+          self.assertAllEqual(expected_transposed, transposed)
+
 
 class BooleanMaskTest(test_util.TensorFlowTestCase):
 
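
A note on the sign trick in the test above: expected_transposed is negated rather than conjugated, which is valid only because every entry is built with a zero real part, so conj(0 + bi) = 0 - bi = -(0 + bi). A standalone check of that identity (illustration only):

#include <cassert>
#include <complex>

int main() {
  // Purely imaginary, like the test data built by math_ops.complex(0, ...).
  std::complex<float> z(0.0f, 5.0f);
  assert(std::conj(z) == -z);
  return 0;
}
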