Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Kokkos + KokkosKernels Promotion to 2.8.00 #4329

Merged
merged 7 commits into from
Feb 6, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 9 additions & 9 deletions packages/ifpack2/example/RelaxationWithEquilibration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ deep_copy (Epetra_Vector& X_e,

host_view_type X_e_lcl (X_e_lcl_raw, lclNumRows);
if (X_t.template need_sync<memory_space> ()) {
auto X_t_lcl_2d = X_t.template getLocalView<host_memory_space> ();
auto X_t_lcl_2d = X_t.getLocalViewHost ();
auto X_t_lcl = Kokkos::subview (X_t_lcl_2d, Kokkos::ALL (), 0);
Kokkos::deep_copy (X_e_lcl, X_t_lcl);
}
Expand Down Expand Up @@ -247,8 +247,8 @@ deep_copy (Tpetra::Vector<double, LO, GO, NT>& X_t,

host_view_type X_e_lcl (X_e_lcl_raw, lclNumRows);
if (X_t.template need_sync<memory_space> ()) {
X_t.template modify<host_memory_space> ();
auto X_t_lcl_2d = X_t.template getLocalView<host_memory_space> ();
X_t.modify_host ();
auto X_t_lcl_2d = X_t.getLocalViewHost ();
auto X_t_lcl = Kokkos::subview (X_t_lcl_2d, Kokkos::ALL (), 0);
Kokkos::deep_copy (X_t_lcl, X_e_lcl);
}
Expand Down Expand Up @@ -565,11 +565,11 @@ typename MV::dot_type accurate_dot (const MV& X, const MV& Y)
using dot_type = typename MV::dot_type;

const LO lclNumRows = X.getLocalLength ();
const_cast<MV&> (X).template sync<Kokkos::HostSpace> ();
auto X_lcl_2d = X.template getLocalView<Kokkos::HostSpace> ();
const_cast<MV&> (X).sync_host ();
auto X_lcl_2d = X.getLocalViewHost();
auto X_lcl = Kokkos::subview (X_lcl_2d, Kokkos::ALL (), 0);
const_cast<MV&> (Y).template sync<Kokkos::HostSpace> ();
auto Y_lcl_2d = Y.template getLocalView<Kokkos::HostSpace> ();
const_cast<MV&> (Y).sync_host ();
auto Y_lcl_2d = Y.getLocalViewHost();
auto Y_lcl = Kokkos::subview (Y_lcl_2d, Kokkos::ALL (), 0);

long double sum = 0.0;
Expand Down Expand Up @@ -973,8 +973,8 @@ copyGatheredMultiVector (Tpetra::MultiVector<SC, LO, GO, NT>& X,
using dense_matrix_type = HostDenseMatrix<SC, LO, GO, NT>;
using dev_memory_space = typename Tpetra::MultiVector<SC, LO, GO, NT>::device_type::memory_space;

X.template sync<Kokkos::HostSpace> ();
auto X_lcl = X.template getLocalView<Kokkos::HostSpace> ();
X.sync_host ();
auto X_lcl = X.getLocalViewHost ();
dense_matrix_type X_copy (label, X.getLocalLength (), X.getNumVectors ());
Kokkos::deep_copy (X_copy, X_lcl);

Expand Down
20 changes: 20 additions & 0 deletions packages/kokkos-kernels/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,25 @@
# Change Log

## [2.8.00](https://github.com/kokkos/kokkos-kernels/tree/2.8.00) (2019-02-05)
[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/2.7.24...2.8.00)

**Implemented enhancements:**

- Capability, Tests: C++14 Support and Testing [\#351](https://github.com/kokkos/kokkos-kernels/issues/351)
- Capability: Batched getrs [\#332](https://github.com/kokkos/kokkos-kernels/issues/332)
- More Kernel Labels for KokkosBlas [\#239](https://github.com/kokkos/kokkos-kernels/issues/239)
- Name all parallel kernels and regions [\#124](https://github.com/kokkos/kokkos-kernels/issues/124)

**Fixed bugs:**

- BLAS TPL: BLAS underscore mangling [\#369](https://github.com/kokkos/kokkos-kernels/issues/369)
- BLAS TPL, Complex: Promotion 2.7.24 broke MV unit tests in Tpetra with complex types [\#360](https://github.com/kokkos/kokkos-kernels/issues/360)
- GEMM: GEMM uses wrong function for computing shared memory allocation size [\#368](https://github.com/kokkos/kokkos-kernels/issues/368)
- BuildSystem: BLAS TPL macro not properly enabled with MKL BLAS [\#347](https://github.com/kokkos/kokkos-kernels/issues/347)
- BuildSystem: make clean - errors [\#353](https://github.com/kokkos/kokkos-kernels/issues/353)
- Compiler Workaround: Internal compiler error in KokkosBatched::Experimental::TeamGemm [\#349](https://github.com/kokkos/kokkos-kernels/issues/349)
- KokkosBlas: Some KokkosBlas kernels assume default execution space [\#14](https://github.com/kokkos/kokkos-kernels/issues/14)

## [2.7.24](https://github.com/kokkos/kokkos-kernels/tree/2.7.24) (2018-11-04)
[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/2.7.00...2.7.24)

Expand Down
9 changes: 9 additions & 0 deletions packages/kokkos-kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,15 @@ IF (KOKKOSKERNELS_ENABLE_TPL_CUBLAS)
LIST(APPEND TPL_LIST "CUBLAS")
ENDIF()

# ==================================================================
# Fortran Complex BLAS
# ==================================================================

IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL)
INCLUDE(CheckHostBlasReturnComplex.cmake)
CHECK_HOST_BLAS_RETURN_COMPLEX(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX)
ENDIF()

# ==================================================================
# CMake Summary
# ==================================================================
Expand Down
38 changes: 38 additions & 0 deletions packages/kokkos-kernels/CheckHostBlasReturnComplex.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
INCLUDE(CheckCXXSourceRuns)

# CHECK_HOST_BLAS_RETURN_COMPLEX(VARNAME)
#
# Builds and runs a small C++ program, linked against ${TPL_BLAS_LIBRARIES},
# that declares the Fortran BLAS routine zdotc as returning
# std::complex<double> by value and calls it. CHECK_CXX_SOURCE_RUNS stores
# the outcome in the variable whose name is passed as VARNAME: true when the
# program builds, runs and exits with status 0.
FUNCTION(CHECK_HOST_BLAS_RETURN_COMPLEX VARNAME)

# Link the test program against the host BLAS. This SET happens inside the
# function, so it does not modify the caller's CMAKE_REQUIRED_LIBRARIES.
SET(CMAKE_REQUIRED_LIBRARIES ${TPL_BLAS_LIBRARIES})

# Test program. ${F77_BLAS_MANGLE} is spliced directly after the macro name,
# so it presumably holds the parameter list and body of the mangling macro
# as supplied by the enclosing build (NOTE(review): confirm it is always set
# when this check runs).
#
# The program computes zdotc(x, x) for x = (i, 2i). Despite their names, the
# constants ONE and TWO are the purely imaginary values i and 2i, so the
# conjugated dot product is conj(i)*i + conj(2i)*2i = 1 + 4 = 5. The program
# exits with 0 only when the real part of the returned value is exactly 5.
SET(SOURCE
"
#include <complex>

#define F77_BLAS_MANGLE${F77_BLAS_MANGLE}

extern \"C\" {
std::complex<double> F77_BLAS_MANGLE(zdotc,ZDOTC)(
const int* n,
const std::complex<double> x[], const int* incx,
const std::complex<double> y[], const int* incy);
}

int main() {
const int NUM=2;
const int INC=1;
std::complex<double> f[NUM];
const std::complex<double>
ONE = std::complex<double>(0.0,1.0),
TWO = std::complex<double>(0.0,2.0);
f[0] = ONE;
f[1] = TWO;
std::complex<double> ret
= F77_BLAS_MANGLE(zdotc,ZDOTC)(&NUM, f, &INC, f, &INC);
return (ret.real() == double(5.0) ? 0 : 1);
}
"
)

# Compile, link and run the program; the result lands in ${VARNAME}.
CHECK_CXX_SOURCE_RUNS("${SOURCE}" ${VARNAME})

ENDFUNCTION()
45 changes: 45 additions & 0 deletions packages/kokkos-kernels/Makefile.kokkos-kernels
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,40 @@ tmp := $(shell echo "----------------------------------------------*/" >> Kokkos
tmp := $(shell echo "\#ifndef KOKKOSKERNELS_CONFIG_H_" >> KokkosKernels_config.tmp)
tmp := $(shell echo "\#define KOKKOSKERNELS_CONFIG_H_" >> KokkosKernels_config.tmp)


#==== User-settable options for Fortran mangling macros =================
# With the Makefile build there is no configure-time detection, so we rely on
# the user's input in KOKKOSKERNELS_OPTIONS:
#   blas-mangle__  : symbol is the lower-case name plus two underscores
#   blas-mangle_   : symbol is the lower-case name plus one underscore
#   blas-mangle    : symbol is the lower-case name, undecorated
# Each grep pattern is a prefix of the one before it, so the checks are nested
# from most specific to least specific; the first match wins.

# default mangling scheme with a single under score
KOKKOSKERNELS_FORTRAN_GLOBAL = name\#\#_
KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_DOUBLE_UNDERSCORES := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-mangle__" | wc -l))
ifeq ($(KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_DOUBLE_UNDERSCORES), 1)
  KOKKOSKERNELS_FORTRAN_GLOBAL = name\#\#__
else
  KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_SINGLE_UNDERSCORE := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-mangle_" | wc -l))
  ifeq ($(KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_SINGLE_UNDERSCORE), 1)
    KOKKOSKERNELS_FORTRAN_GLOBAL = name\#\#_
  else
    KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_NO_UNDERSCORE := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-mangle" | wc -l))
    ifeq ($(KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_NO_UNDERSCORE), 1)
      # No decoration: emit the bare name. The token-pasting operator must
      # not be the last token of a macro replacement list, so it cannot be
      # left dangling after the name the way the underscore variants use it.
      KOKKOSKERNELS_FORTRAN_GLOBAL = name
    endif
  endif
endif

# Append the mangling macro to the generated config header. The guard lets a
# definition of F77_BLAS_MANGLE made before the header is included win.
tmp := $(shell echo "" >> KokkosKernels_config.tmp)
tmp := $(shell echo "/* ---------------------------------------------" >> KokkosKernels_config.tmp)
tmp := $(shell echo "Fortran BLAS mangling:" >> KokkosKernels_config.tmp)
tmp := $(shell echo " ---------------------------------------------*/" >> KokkosKernels_config.tmp)
tmp := $(shell echo "\#if !defined(F77_BLAS_MANGLE)" >> KokkosKernels_config.tmp )
tmp := $(shell echo "\#define F77_BLAS_MANGLE(name,NAME) $(KOKKOSKERNELS_FORTRAN_GLOBAL)" >> KokkosKernels_config.tmp)
tmp := $(shell echo "\#endif" >> KokkosKernels_config.tmp )

# "blas-return-complex" in KOKKOSKERNELS_OPTIONS defines
# KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX in the generated header; the Makefile
# build takes the user's word for it and runs no check of its own.
KOKKOSKERNELS_INTERNAL_TPL_BLAS_RETURN_COMPLEX := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-return-complex" | wc -l))
ifeq ($(KOKKOSKERNELS_INTERNAL_TPL_BLAS_RETURN_COMPLEX), 1)
  tmp := $(shell echo "\#define KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX" >> KokkosKernels_config.tmp )
endif

#==== ETI Macros Scalars =================================================
KOKKOSKERNELS_INTERNAL_INST_SCALARS =
KOKKOSKERNELS_INTERNAL_INST_DOUBLE=$(strip $(shell echo "$(KOKKOSKERNELS_SCALARS)" | grep "double" | wc -l))
Expand Down Expand Up @@ -401,6 +435,17 @@ ifeq (${KOKKOSKERNELS_INTERNAL_ENABLE_CUBLAS}, 1)
KOKKOSKERNELS_INTERNAL_SRC_BLAS += ${KOKKOSKERNELS_PATH}/src/impl/tpls/KokkosBlas_Cuda_tpl.cpp
endif

# Fold the BLAS and MKL switches into a single "host BLAS" flag: the host TPL
# wrapper source is added to the BLAS sources when either one is enabled.
KOKKOSKERNELS_INTERNAL_ENABLE_HOST_BLAS=0
ifeq (${KOKKOSKERNELS_INTERNAL_ENABLE_BLAS}, 1)
KOKKOSKERNELS_INTERNAL_ENABLE_HOST_BLAS=1
endif
ifeq (${KOKKOSKERNELS_INTERNAL_ENABLE_MKL}, 1)
KOKKOSKERNELS_INTERNAL_ENABLE_HOST_BLAS=1
endif
ifeq (${KOKKOSKERNELS_INTERNAL_ENABLE_HOST_BLAS}, 1)
KOKKOSKERNELS_INTERNAL_SRC_BLAS += ${KOKKOSKERNELS_PATH}/src/impl/tpls/KokkosBlas_Host_tpl.cpp
endif

KOKKOSKERNELS_INTERNAL_HEADERS = $(wildcard ${KOKKOSKERNELS_PATH}/src/impl/*.hpp)
KOKKOSKERNELS_INTERNAL_HEADERS += $(wildcard ${KOKKOSKERNELS_PATH}/src/impl/generated_specializations_hpp/*/*eti_spec*.hpp)

Expand Down
9 changes: 9 additions & 0 deletions packages/kokkos-kernels/cmake/KokkosKernels_config.h.in
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
#ifndef KOKKOSKERNELS_CONFIG_H
#define KOKKOSKERNELS_CONFIG_H


/* Define Fortran mangle from Trilinos macro definition */
#ifndef F77_BLAS_MANGLE
# define F77_BLAS_MANGLE@F77_BLAS_MANGLE@
#endif

/* Define if fortran blas 1 function can return complex type */
#cmakedefine KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX

/* Define if building in debug mode */
#cmakedefine HAVE_KOKKOSKERNELS_DEBUG

Expand Down
1 change: 1 addition & 0 deletions packages/kokkos-kernels/master_history.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ tag: 2.5.00 date: 12/15/2017 master: e4c645e9 develop: 04d58766
tag: 2.6.00 date: 03/07/2018 master: 00b16484 develop: f81778ce
tag: 2.7.00 date: 05/24/2018 master: 6e8e97a9 develop: 692114a6
tag: 2.7.24 date: 11/05/2018 master: 1a7b524b develop: fab89e37
tag: 2.8.00 date: 02/05/2019 master: a6e05e06 develop: 6a790321
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,7 @@ namespace KokkosBatched {
void operator()(const TeamTagV1 &, const MemberType &member) const {
const int kbeg = (member.league_rank()*(member.team_size()*VectorLength) +
member.team_rank()*VectorLength);
Kokkos::parallel_for
(Kokkos::ThreadVectorRange(member, VectorLength),
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength),
[&](const int &k) {
const int kk = kbeg + k;
if (kk < int(_c.extent(0))) {
Expand All @@ -93,8 +92,7 @@ namespace KokkosBatched {
KOKKOS_INLINE_FUNCTION
void operator()(const TeamTagV2 &, const MemberType &member) const {
const int kbeg = member.league_rank()*VectorLength;
Kokkos::parallel_for
(Kokkos::ThreadVectorRange(member, VectorLength),
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength),
[&](const int &k) {
const int kk = kbeg + k;
if (kk < int(_c.extent(0))) {
Expand All @@ -116,8 +114,7 @@ namespace KokkosBatched {
ScratchViewType<ViewType> sb(member.team_scratch(lvl), VectorLength, _b.extent(1), _b.extent(2));

const int kbeg = member.league_rank()*VectorLength;
Kokkos::parallel_for
(Kokkos::ThreadVectorRange(member, VectorLength),
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength),
[&](const int &k) {
const int kk = kbeg + k;
if (kk < int(_c.extent(0))) {
Expand All @@ -142,14 +139,12 @@ namespace KokkosBatched {
KOKKOS_INLINE_FUNCTION
void operator()(const TeamTagHandmade &, const MemberType &member) const {
const int kbeg = member.league_rank()*VectorLength;
Kokkos::parallel_for
(Kokkos::ThreadVectorRange(member, VectorLength),
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength),
[&](const int &k) {
const int kk = kbeg + k;
if (kk < int(_c.extent(0))) {
const int m = _c.extent(1), n = _c.extent(2), q = _a.extent(2);
Kokkos::parallel_for
(Kokkos::TeamThreadRange(member,0,m*n),
Kokkos::parallel_for(Kokkos::TeamThreadRange(member,0,m*n),
[&](const int &ij) {
const int i = ij%m, j = ij/m;
typename ViewType::non_const_value_type cval = 0;
Expand Down Expand Up @@ -315,7 +310,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for("GEMM: RangePolicy version", policy, functor_type(a,b,c));
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::RangeTag", policy, functor_type(a,b,c));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down Expand Up @@ -382,7 +377,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for("GEMM: TeamPolicy version 1", policy,functor_type(a,b,c));
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV1", policy,functor_type(a,b,c));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down Expand Up @@ -455,7 +450,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for("GEMM: TeamPolicy version 2", policy, functor_type(a,b,c));
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV2", policy, functor_type(a,b,c));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down Expand Up @@ -532,7 +527,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for("GEMM: TeamPolicy version 3", policy, functor_type(a,b,c));
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV3", policy, functor_type(a,b,c));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down Expand Up @@ -604,7 +599,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for("GEMM: TeamPolicy handmade", policy, functor_type(a,b,c));
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyHandmade", policy, functor_type(a,b,c));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ namespace KokkosBatched {
amat_simd("amat_simd", N, BlkSize, BlkSize),
bmat_simd("bmat_simd", N, BlkSize, BlkSize);

Kokkos::parallel_for
(Kokkos::RangePolicy<HostSpaceType>(0, N*VectorLength),
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmHost::Pack",
Kokkos::RangePolicy<HostSpaceType>(0, N*VectorLength),
KOKKOS_LAMBDA(const int k) {
const int k0 = k/VectorLength, k1 = k%VectorLength;
for (int i=0;i<BlkSize;++i)
Expand Down Expand Up @@ -128,8 +128,8 @@ namespace KokkosBatched {
HostSpaceType::fence();
timer.reset();

Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmHost::CblasOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());
auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL());
Expand Down Expand Up @@ -385,8 +385,8 @@ namespace KokkosBatched {
HostSpaceType::fence();
timer.reset();

Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmHost::libxswmmOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());
auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL());
Expand Down Expand Up @@ -515,8 +515,8 @@ namespace KokkosBatched {
HostSpaceType::fence();
timer.reset();

Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmHost::SIMDSerialOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());
auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,8 @@ namespace KokkosBatched {
HostSpaceType::fence();
timer.reset();

Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::GemvHost::CblasOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());
for (int j=0;j<NumVecs;++j) {
Expand Down Expand Up @@ -174,8 +174,8 @@ namespace KokkosBatched {
HostSpaceType::fence();
timer.reset();

Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::GemvHost::SerialOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());

Expand Down Expand Up @@ -249,8 +249,8 @@ namespace KokkosBatched {
HostSpaceType::fence();
timer.reset();

Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::GemvHost::SIMDSerialOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());

Expand Down