* Add GPU implementation of LogDeterminant op.
* Switch the GPU implementation of Determinant to use the more numerically stable kernel as well (see the standalone sketch after the file summary below).
* Change the behavior of Determinant on matrices with (numerically) infinite determinants to match numpy.linalg.det: return inf for a matrix with an infinite determinant.
* Misc. cleanup in code working around missing support for complex arithmetic in the NVCC compiler.

PiperOrigin-RevId: 173277377
tensorflower-gardener committed Oct 24, 2017
1 parent 377dd3d commit 1bbec9e
Showing 10 changed files with 414 additions and 231 deletions.
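
As context for the stability and inf-behavior notes in the commit message, here is a minimal standalone C++ sketch (illustrative only, not code from this commit). Multiplying the diagonal of U directly can overflow to inf, which is what Determinant now returns (matching numpy.linalg.det); carrying a sign and a log-magnitude instead, as a log-determinant formulation does, keeps the result representable:

```cpp
// Standalone illustration (not TensorFlow code) of why accumulating
// log|u_ii| is more robust than multiplying the diagonal of U directly.
#include <cmath>
#include <cstdio>

int main() {
  const int n = 400;
  const double diag_value = 1e3;  // Stand-in for each diagonal entry of U.

  // Naive determinant: the running product overflows to inf.
  double det = 1.0;
  for (int i = 0; i < n; ++i) det *= diag_value;

  // Log-determinant form: sign and log-magnitude stay finite.
  double sign = 1.0;
  double log_abs_det = 0.0;
  for (int i = 0; i < n; ++i) {
    sign *= (diag_value < 0.0 ? -1.0 : 1.0);
    log_abs_det += std::log(std::abs(diag_value));
  }

  // Like numpy.linalg.det, the plain determinant reports inf once it
  // overflows; the (sign, log|det|) pair remains usable.
  std::printf("det = %g\n", det);                          // inf
  std::printf("sign = %g, log|det| = %g\n", sign, log_abs_det);
  return 0;
}
```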
10 changes: 0 additions & 10 deletions tensorflow/core/kernels/cuda_solvers.h
@@ -410,16 +410,6 @@ class DeviceLapackInfo : public ScratchSpace<int> {

namespace functor {

// Helper functor to compute the product of diagonal elements in all matrices
// in a flattened batch.
template <typename Device, typename Scalar>
struct DeterminantFromPivotedLUFunctor {
void operator()(const Device& device,
typename TTypes<Scalar, 3>::ConstTensor lu_factor,
const int* pivots, typename TTypes<Scalar, 1>::Tensor output,
int* info);
};
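
For reference, a standalone sketch (not TensorFlow code) of the flattened-batch indexing this helper relies on: with a row-major [batch, n, n] layout, the diagonal of matrix o starts at offset o * n * n and advances with a stride of n + 1.

```cpp
// Illustrative only: the diagonal-product indexing for a flattened batch.
#include <cstdio>
#include <vector>

int main() {
  const int batch = 2, n = 3;
  std::vector<double> lu_factor(batch * n * n, 1.0);
  // Give the two matrices distinct diagonals: (2, 2, 2) and (3, 3, 3).
  for (int o = 0; o < batch; ++o)
    for (int i = 0; i < n; ++i) lu_factor[o * n * n + i * (n + 1)] = 2.0 + o;

  for (int o = 0; o < batch; ++o) {
    double prod = 1.0;
    int idx = o * n * n;  // Start of the diagonal of matrix o.
    for (int i = 0; i < n; ++i, idx += n + 1) prod *= lu_factor[idx];
    std::printf("matrix %d: prod(diag) = %g\n", o, prod);  // 8, then 27.
  }
  return 0;
}
```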

// Helper functor to set a batch of matrices to the identity.
// TODO(rmlarsen): Use this kernel to replace the horribly inefficient tf.eye
// op.
156 changes: 3 additions & 153 deletions tensorflow/core/kernels/cuda_solvers_gpu.cu.cc
@@ -29,159 +29,11 @@ namespace functor {

typedef Eigen::GpuDevice GPUDevice;

namespace {

// Hacks around missing support for complex arithmetic in nvcc.
template <typename Scalar>
__device__ inline Scalar Multiply(Scalar x, Scalar y) {
return x * y;
}

template <>
__device__ inline cuComplex Multiply(cuComplex x, cuComplex y) {
return cuCmulf(x, y);
}

template <>
__device__ inline cuDoubleComplex Multiply(cuDoubleComplex x,
cuDoubleComplex y) {
return cuCmul(x, y);
}

template <typename Scalar>
__device__ inline Scalar Negate(Scalar x) {
return -x;
}

template <>
__device__ inline cuComplex Negate(cuComplex x) {
return make_cuComplex(-cuCrealf(x), -cuCimagf(x));
}

template <>
__device__ inline cuDoubleComplex Negate(cuDoubleComplex x) {
return make_cuDoubleComplex(-cuCreal(x), -cuCimag(x));
}

template <typename Scalar>
__device__ inline bool IsFinite(Scalar x) {
return Eigen::numext::isfinite(x);
}

template <>
__device__ inline bool IsFinite(cuComplex x) {
return Eigen::numext::isfinite(cuCrealf(x)) &&
Eigen::numext::isfinite(cuCimagf(x));
}

template <>
__device__ inline bool IsFinite(cuDoubleComplex x) {
return Eigen::numext::isfinite(cuCreal(x)) &&
Eigen::numext::isfinite(cuCimag(x));
}

template <typename Scalar>
struct Const {
template <typename RealScalar>
__device__ static inline Scalar make_const(const RealScalar x) {
return Scalar(x);
}
};

template <>
struct Const<cuComplex> {
template <typename RealScalar>
__device__ static inline cuComplex make_const(const RealScalar x) {
return make_cuComplex(x, 0.0f);
}
};

template <>
struct Const<cuDoubleComplex> {
template <typename RealScalar>
__device__ static inline cuDoubleComplex make_const(const RealScalar x) {
return make_cuDoubleComplex(x, 0.0f);
}
};

} // namespace
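
To show how these helpers compose, here is a toy kernel (not part of this commit) that works for float, double, cuComplex, and cuDoubleComplex alike by routing all arithmetic through Multiply, Negate, and Const rather than operators that nvcc did not provide for the CUDA complex structs:

```cpp
// Illustrative only: computes out[i] = -(2 * in[i]) for any supported Scalar,
// using the type-generic helpers defined in the anonymous namespace above.
template <typename Scalar>
__global__ void ScaleAndNegateKernel(int n, const Scalar* in, Scalar* out) {
  const Scalar two = Const<Scalar>::make_const(2.0);
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    out[i] = Negate(Multiply(two, in[i]));
  }
}
```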

template <typename Scalar>
__global__ void DeterminantFromPivotedLUKernel(int nthreads, int n,
const Scalar* lu_factor,
const int* all_pivots,
Scalar* dst, int* info) {
const int matrix_size = n * n;
const int stride = n + 1;
// We only parallelize over batches here. Performance is not critical,
// since this cheap O(n) kernel always follows an O(n^3) LU factorization.
// The main purpose is to avoid having to copy the LU decomposition to
// host memory.
CUDA_1D_KERNEL_LOOP(o_idx, nthreads) {
// Compute the order of the permutation from the number of transpositions
// encoded in the pivot array, see:
// http://icl.cs.utk.edu/lapack-forum/viewtopic.php?f=2&t=340
const int* pivots = all_pivots + o_idx * n;
int order = 0;
for (int i = 0; i < n - 1; ++i) {
// Notice: Internally, the cuBlas code uses Fortran convention (1-based)
// indexing so we expect pivots[i] == i + 1 for rows that were not moved.
order += pivots[i] != (i + 1);
}

// Compute the product of the diagonal elements of U from the partially
// pivoted LU factorization.
// TODO(rmlarsen): This naive implementation (matching that in Eigen used
// for the CPU kernel) is pathetically unstable. Should we implement
// log-determinant instead (a different set of ops altogether) or something
// like the method used in the old LINPACK code:
// http://www.netlib.org/linpack/dgedi.f ?
int i_idx = matrix_size * o_idx;
Scalar prod = lu_factor[i_idx];
for (int i = 1; i < n; ++i) {
i_idx += stride;
prod = Multiply(prod, lu_factor[i_idx]);
}
// Finally set the determinant to (-1)^order * prod(diag(U)).
dst[o_idx] = order % 2 ? Negate(prod) : prod;

// We write a magic value into the info array if the result was infinite.
if (!IsFinite(prod)) {
info[o_idx] = kint32min;
}
}
}
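
As a worked example of the pivot-parity rule used in the loop above (standalone host code, not part of the kernel): with cuBLAS's 1-based pivots, every entry with pivots[i] != i + 1 records one row transposition, and the sign of the permutation is (-1)^order.

```cpp
// Illustrative only: recovering the permutation sign from a pivot array.
#include <cstdio>

int main() {
  // Pivots (1-based, cuBLAS convention) from a hypothetical 4x4 LU
  // factorization: two entries differ from i + 1, so the permutation is even.
  const int n = 4;
  const int pivots[n] = {3, 2, 4, 4};

  int order = 0;
  for (int i = 0; i < n - 1; ++i) order += (pivots[i] != i + 1);
  const double sign = (order % 2) ? -1.0 : 1.0;
  std::printf("order = %d, sign = %g\n", order, sign);  // order = 2, sign = 1
  return 0;
}
```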

template <typename Scalar>
struct DeterminantFromPivotedLUFunctor<GPUDevice, Scalar> {
void operator()(const GPUDevice& device,
typename TTypes<Scalar, 3>::ConstTensor lu_factor,
const int* pivots, typename TTypes<Scalar, 1>::Tensor output,
int* info) {
using CudaType = typename CUDAComplexT<Scalar>::type;
const int64 num_matrices = output.size();
const int64 n = lu_factor.dimension(2);
const CudaType* lu_factor_ptr =
reinterpret_cast<const CudaType*>(lu_factor.data());
CudaType* output_ptr = reinterpret_cast<CudaType*>(output.data());
CudaLaunchConfig config = GetCudaLaunchConfig(num_matrices, device);
DeterminantFromPivotedLUKernel<<<
config.block_count, config.thread_per_block, 0, device.stream()>>>(
config.virtual_thread_count, n, lu_factor_ptr, pivots, output_ptr,
info);
}
};

template struct DeterminantFromPivotedLUFunctor<GPUDevice, float>;
template struct DeterminantFromPivotedLUFunctor<GPUDevice, double>;
template struct DeterminantFromPivotedLUFunctor<GPUDevice, complex64>;
template struct DeterminantFromPivotedLUFunctor<GPUDevice, complex128>;

template <typename Scalar>
__global__ void EyeKernel(Cuda3DLaunchConfig config, int batch_size, int m,
int n, Scalar* matrix_batch_ptr) {
const int matrix_size = m * n;
-const Scalar one = Const<Scalar>::make_const(1.0);
+const Scalar one = Scalar(1);
CUDA_AXIS_KERNEL_LOOP(batch, config.virtual_thread_count, x) {
if (batch >= batch_size) {
break;
@@ -205,16 +57,14 @@ template <typename Scalar>
struct EyeFunctor<GPUDevice, Scalar> {
void operator()(const GPUDevice& device,
typename TTypes<Scalar, 3>::Tensor matrix_batch) {
-using CudaType = typename CUDAComplexT<Scalar>::type;
const int batch_size = matrix_batch.dimension(0);
const int m = matrix_batch.dimension(1);
const int n = matrix_batch.dimension(2);
-CudaType* matrix_batch_ptr =
-    reinterpret_cast<CudaType*>(matrix_batch.data());
Cuda3DLaunchConfig config = GetCuda3DLaunchConfig(batch_size, m, n, device,
EyeKernel<Scalar>, 0, 0);
EyeKernel<<<config.block_count, config.thread_per_block, 0,
-device.stream()>>>(config, batch_size, m, n, matrix_batch_ptr);
+device.stream()>>>(config, batch_size, m, n,
+                   matrix_batch.data());
}
};
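
A hypothetical call site for this functor (the variable names are assumptions for illustration, not code from this commit), filling every matrix in a [batch, m, n] device tensor with the identity:

```cpp
// Assumes `device` is an Eigen::GpuDevice and `matrix_batch` is a
// TTypes<float, 3>::Tensor mapping GPU memory of shape [batch, m, n].
functor::EyeFunctor<Eigen::GpuDevice, float> make_identity;
make_identity(device, matrix_batch);
```

Keeping the functor templated on the device type lets the same call site dispatch to other device specializations if they are added later.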

