Skip to content

Commit

Permalink
Snapshot of kokkos-kernels.git from commit 4ee5f3c6dbd0981f6d8c7a9b2b…
Browse files Browse the repository at this point in the history
…1763439cb56039

From repository at git@github.com:kokkos/kokkos-kernels.git

At commit:
commit 4ee5f3c6dbd0981f6d8c7a9b2b1763439cb56039
Merge: 94456cf 6a79032
Author: Nathan Ellingwood <ndellin@sandia.gov>
Date:   Tue Feb 5 17:13:18 2019 -0700

    Merge branch 'develop' for 2.8.00

    Part of Kokkos C++ Performance Portability Programming EcoSystem 2.8
  • Loading branch information
ndellingwood committed Feb 6, 2019
1 parent 2a24058 commit c5c8608
Show file tree
Hide file tree
Showing 158 changed files with 3,847 additions and 666 deletions.
20 changes: 20 additions & 0 deletions packages/kokkos-kernels/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,25 @@
# Change Log

## [2.8.00](https://github.com/kokkos/kokkos-kernels/tree/2.8.00) (2019-02-05)
[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/2.7.24...2.8.00)

**Implemented enhancements:**

- Capability, Tests: C++14 Support and Testing [\#351](https://github.com/kokkos/kokkos-kernels/issues/351)
- Capability: Batched getrs [\#332](https://github.com/kokkos/kokkos-kernels/issues/332)
- More Kernel Labels for KokkosBlas [\#239](https://github.com/kokkos/kokkos-kernels/issues/239)
- Name all parallel kernels and regions [\#124](https://github.com/kokkos/kokkos-kernels/issues/124)

**Fixed bugs:**

- BLAS TPL: BLAS underscore mangling [\#369](https://github.com/kokkos/kokkos-kernels/issues/369)
- BLAS TPL, Complex: Promotion 2.7.24 broke MV unit tests in Tpetra with complex types [\#360](https://github.com/kokkos/kokkos-kernels/issues/360)
- GEMM: GEMM uses wrong function for computing shared memory allocation size [\#368](https://github.com/kokkos/kokkos-kernels/issues/368)
- BuildSystem: BLAS TPL macro not properly enabled with MKL BLAS [\#347](https://github.com/kokkos/kokkos-kernels/issues/347)
- BuildSystem: make clean - errors [\#353](https://github.com/kokkos/kokkos-kernels/issues/353)
- Compiler Workaround: Internal compiler error in KokkosBatched::Experimental::TeamGemm [\#349](https://github.com/kokkos/kokkos-kernels/issues/349)
- KokkosBlas: Some KokkosBlas kernels assume default execution space [\#14](https://github.com/kokkos/kokkos-kernels/issues/14)

## [2.7.24](https://github.com/kokkos/kokkos-kernels/tree/2.7.24) (2018-11-04)
[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/2.7.00...2.7.24)

Expand Down
9 changes: 9 additions & 0 deletions packages/kokkos-kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,15 @@ IF (KOKKOSKERNELS_ENABLE_TPL_CUBLAS)
LIST(APPEND TPL_LIST "CUBLAS")
ENDIF()

# ==================================================================
# Fortran Complex BLAS
# ==================================================================
# When a host BLAS TPL (BLAS or MKL) is enabled, load the helper module and
# run CHECK_HOST_BLAS_RETURN_COMPLEX, which reports its result through the
# variable KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX.

IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL)
INCLUDE(CheckHostBlasReturnComplex.cmake)
CHECK_HOST_BLAS_RETURN_COMPLEX(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX)
ENDIF()

# ==================================================================
# CMake Summary
# ==================================================================
Expand Down
38 changes: 38 additions & 0 deletions packages/kokkos-kernels/CheckHostBlasReturnComplex.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
INCLUDE(CheckCXXSourceRuns)

# CHECK_HOST_BLAS_RETURN_COMPLEX(VARNAME)
#
# Builds and runs a small C++ program that calls the host BLAS routine zdotc
# through the F77_BLAS_MANGLE macro, declared as returning
# std::complex<double> by value. The outcome of CHECK_CXX_SOURCE_RUNS is
# reported through the variable named by VARNAME.
#
# Variables read from the calling scope:
#   TPL_BLAS_LIBRARIES - libraries the test program is linked against.
#   F77_BLAS_MANGLE    - text appended directly after "#define F77_BLAS_MANGLE"
#                        in the test program (parameter list plus replacement).
FUNCTION(CHECK_HOST_BLAS_RETURN_COMPLEX VARNAME)

# Link the test program against the BLAS libraries.
SET(CMAKE_REQUIRED_LIBRARIES ${TPL_BLAS_LIBRARIES})

# Test program: with f = (i, 2i), zdotc(f, f) = conj(i)*i + conj(2i)*2i = 5.
# The program exits with 0 only when the real part of the returned value is
# exactly 5.0; any other value yields exit code 1.
SET(SOURCE
"
#include <complex>
#define F77_BLAS_MANGLE${F77_BLAS_MANGLE}
extern \"C\" {
std::complex<double> F77_BLAS_MANGLE(zdotc,ZDOTC)(
const int* n,
const std::complex<double> x[], const int* incx,
const std::complex<double> y[], const int* incy);
}
int main() {
const int NUM=2;
const int INC=1;
std::complex<double> f[NUM];
const std::complex<double>
ONE = std::complex<double>(0.0,1.0),
TWO = std::complex<double>(0.0,2.0);
f[0] = ONE;
f[1] = TWO;
std::complex<double> ret
= F77_BLAS_MANGLE(zdotc,ZDOTC)(&NUM, f, &INC, f, &INC);
return (ret.real() == double(5.0) ? 0 : 1);
}
"
)

# Compile, link and execute the program above.
CHECK_CXX_SOURCE_RUNS("${SOURCE}" ${VARNAME})

ENDFUNCTION()
45 changes: 45 additions & 0 deletions packages/kokkos-kernels/Makefile.kokkos-kernels
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,40 @@ tmp := $(shell echo "----------------------------------------------*/" >> Kokkos
tmp := $(shell echo "\#ifndef KOKKOSKERNELS_CONFIG_H_" >> KokkosKernels_config.tmp)
tmp := $(shell echo "\#define KOKKOSKERNELS_CONFIG_H_" >> KokkosKernels_config.tmp)


#==== User-settable options for Fortran mangling macros =================
# With the Makefile build, the mangling scheme is taken from the user's
# KOKKOSKERNELS_OPTIONS. Recognized option strings:
#   blas-mangle__ : append two underscores to the routine name
#   blas-mangle_  : append one underscore to the routine name
#   blas-mangle   : use the routine name unchanged
# The longest option string is tested first because each shorter string is a
# prefix of the longer ones and would match them as well.
#
# KOKKOSKERNELS_FORTRAN_GLOBAL holds the replacement text of the generated
# F77_BLAS_MANGLE(name,NAME) macro.

# Default mangling scheme: a single trailing underscore.
KOKKOSKERNELS_FORTRAN_GLOBAL = name\#\#_
KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_DOUBLE_UNDERSCORES := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-mangle__" | wc -l))
ifeq ($(KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_DOUBLE_UNDERSCORES), 1)
  KOKKOSKERNELS_FORTRAN_GLOBAL = name\#\#__
else
  KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_SINGLE_UNDERSCORE := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-mangle_" | wc -l))
  ifeq ($(KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_SINGLE_UNDERSCORE), 1)
    KOKKOSKERNELS_FORTRAN_GLOBAL = name\#\#_
  else
    KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_NO_UNDERSCORE := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-mangle" | wc -l))
    ifeq ($(KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_NO_UNDERSCORE), 1)
      # No suffix: emit the bare name. A macro replacement list may not end
      # with the token-pasting operator, so a trailing pair of hash signs
      # here would make the generated header fail to preprocess.
      KOKKOSKERNELS_FORTRAN_GLOBAL = name
    endif
  endif
endif

# Append the Fortran BLAS mangling macro to the generated config header
# (KokkosKernels_config.tmp). The macro is wrapped in an "if not defined"
# guard, so an F77_BLAS_MANGLE definition made before the header is included
# is left untouched.
tmp := $(shell echo "" >> KokkosKernels_config.tmp)
tmp := $(shell echo "/* ---------------------------------------------" >> KokkosKernels_config.tmp)
tmp := $(shell echo "Fortran BLAS mangling:" >> KokkosKernels_config.tmp)
tmp := $(shell echo " ---------------------------------------------*/" >> KokkosKernels_config.tmp)
tmp := $(shell echo "\#if !defined(F77_BLAS_MANGLE)" >> KokkosKernels_config.tmp )
tmp := $(shell echo "\#define F77_BLAS_MANGLE(name,NAME) $(KOKKOSKERNELS_FORTRAN_GLOBAL)" >> KokkosKernels_config.tmp)
tmp := $(shell echo "\#endif" >> KokkosKernels_config.tmp )

# Emit KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX into the generated header only
# when KOKKOSKERNELS_OPTIONS contains the string "blas-return-complex".
# The grep/wc pipeline yields 1 when the option string is present, else 0.
KOKKOSKERNELS_INTERNAL_TPL_BLAS_RETURN_COMPLEX := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-return-complex" | wc -l))
ifeq ($(KOKKOSKERNELS_INTERNAL_TPL_BLAS_RETURN_COMPLEX), 1)
tmp := $(shell echo "\#define KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX" >> KokkosKernels_config.tmp )
endif

#==== ETI Macros Scalars =================================================
KOKKOSKERNELS_INTERNAL_INST_SCALARS =
KOKKOSKERNELS_INTERNAL_INST_DOUBLE=$(strip $(shell echo "$(KOKKOSKERNELS_SCALARS)" | grep "double" | wc -l))
Expand Down Expand Up @@ -401,6 +435,17 @@ ifeq (${KOKKOSKERNELS_INTERNAL_ENABLE_CUBLAS}, 1)
KOKKOSKERNELS_INTERNAL_SRC_BLAS += ${KOKKOSKERNELS_PATH}/src/impl/tpls/KokkosBlas_Cuda_tpl.cpp
endif

KOKKOSKERNELS_INTERNAL_ENABLE_HOST_BLAS=0
ifeq (${KOKKOSKERNELS_INTERNAL_ENABLE_BLAS}, 1)
KOKKOSKERNELS_INTERNAL_ENABLE_HOST_BLAS=1
endif
ifeq (${KOKKOSKERNELS_INTERNAL_ENABLE_MKL}, 1)
KOKKOSKERNELS_INTERNAL_ENABLE_HOST_BLAS=1
endif
ifeq (${KOKKOSKERNELS_INTERNAL_ENABLE_HOST_BLAS}, 1)
KOKKOSKERNELS_INTERNAL_SRC_BLAS += ${KOKKOSKERNELS_PATH}/src/impl/tpls/KokkosBlas_Host_tpl.cpp
endif

KOKKOSKERNELS_INTERNAL_HEADERS = $(wildcard ${KOKKOSKERNELS_PATH}/src/impl/*.hpp)
KOKKOSKERNELS_INTERNAL_HEADERS += $(wildcard ${KOKKOSKERNELS_PATH}/src/impl/generated_specializations_hpp/*/*eti_spec*.hpp)

Expand Down
9 changes: 9 additions & 0 deletions packages/kokkos-kernels/cmake/KokkosKernels_config.h.in
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
#ifndef KOKKOSKERNELS_CONFIG_H
#define KOKKOSKERNELS_CONFIG_H


/* Define the Fortran BLAS name-mangling macro from the Trilinos macro definition */
#ifndef F77_BLAS_MANGLE
# define F77_BLAS_MANGLE@F77_BLAS_MANGLE@
#endif

/* Define if Fortran BLAS level-1 functions can return a complex type */
#cmakedefine KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX

/* Define if building in debug mode */
#cmakedefine HAVE_KOKKOSKERNELS_DEBUG

Expand Down
1 change: 1 addition & 0 deletions packages/kokkos-kernels/master_history.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ tag: 2.5.00 date: 12/15/2017 master: e4c645e9 develop: 04d58766
tag: 2.6.00 date: 03/07/2018 master: 00b16484 develop: f81778ce
tag: 2.7.00 date: 05/24/2018 master: 6e8e97a9 develop: 692114a6
tag: 2.7.24 date: 11/05/2018 master: 1a7b524b develop: fab89e37
tag: 2.8.00 date: 02/05/2019 master: a6e05e06 develop: 6a790321
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,7 @@ namespace KokkosBatched {
void operator()(const TeamTagV1 &, const MemberType &member) const {
const int kbeg = (member.league_rank()*(member.team_size()*VectorLength) +
member.team_rank()*VectorLength);
Kokkos::parallel_for
(Kokkos::ThreadVectorRange(member, VectorLength),
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength),
[&](const int &k) {
const int kk = kbeg + k;
if (kk < int(_c.extent(0))) {
Expand All @@ -93,8 +92,7 @@ namespace KokkosBatched {
KOKKOS_INLINE_FUNCTION
void operator()(const TeamTagV2 &, const MemberType &member) const {
const int kbeg = member.league_rank()*VectorLength;
Kokkos::parallel_for
(Kokkos::ThreadVectorRange(member, VectorLength),
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength),
[&](const int &k) {
const int kk = kbeg + k;
if (kk < int(_c.extent(0))) {
Expand All @@ -116,8 +114,7 @@ namespace KokkosBatched {
ScratchViewType<ViewType> sb(member.team_scratch(lvl), VectorLength, _b.extent(1), _b.extent(2));

const int kbeg = member.league_rank()*VectorLength;
Kokkos::parallel_for
(Kokkos::ThreadVectorRange(member, VectorLength),
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength),
[&](const int &k) {
const int kk = kbeg + k;
if (kk < int(_c.extent(0))) {
Expand All @@ -142,14 +139,12 @@ namespace KokkosBatched {
KOKKOS_INLINE_FUNCTION
void operator()(const TeamTagHandmade &, const MemberType &member) const {
const int kbeg = member.league_rank()*VectorLength;
Kokkos::parallel_for
(Kokkos::ThreadVectorRange(member, VectorLength),
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength),
[&](const int &k) {
const int kk = kbeg + k;
if (kk < int(_c.extent(0))) {
const int m = _c.extent(1), n = _c.extent(2), q = _a.extent(2);
Kokkos::parallel_for
(Kokkos::TeamThreadRange(member,0,m*n),
Kokkos::parallel_for(Kokkos::TeamThreadRange(member,0,m*n),
[&](const int &ij) {
const int i = ij%m, j = ij/m;
typename ViewType::non_const_value_type cval = 0;
Expand Down Expand Up @@ -315,7 +310,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for("GEMM: RangePolicy version", policy, functor_type(a,b,c));
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::RangeTag", policy, functor_type(a,b,c));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down Expand Up @@ -382,7 +377,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for("GEMM: TeamPolicy version 1", policy,functor_type(a,b,c));
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV1", policy,functor_type(a,b,c));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down Expand Up @@ -455,7 +450,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for("GEMM: TeamPolicy version 2", policy, functor_type(a,b,c));
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV2", policy, functor_type(a,b,c));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down Expand Up @@ -532,7 +527,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for("GEMM: TeamPolicy version 3", policy, functor_type(a,b,c));
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV3", policy, functor_type(a,b,c));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down Expand Up @@ -604,7 +599,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for("GEMM: TeamPolicy handmade", policy, functor_type(a,b,c));
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyHandmade", policy, functor_type(a,b,c));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ namespace KokkosBatched {
amat_simd("amat_simd", N, BlkSize, BlkSize),
bmat_simd("bmat_simd", N, BlkSize, BlkSize);

Kokkos::parallel_for
(Kokkos::RangePolicy<HostSpaceType>(0, N*VectorLength),
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmHost::Pack",
Kokkos::RangePolicy<HostSpaceType>(0, N*VectorLength),
KOKKOS_LAMBDA(const int k) {
const int k0 = k/VectorLength, k1 = k%VectorLength;
for (int i=0;i<BlkSize;++i)
Expand Down Expand Up @@ -128,8 +128,8 @@ namespace KokkosBatched {
HostSpaceType::fence();
timer.reset();

Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmHost::CblasOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());
auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL());
Expand Down Expand Up @@ -385,8 +385,8 @@ namespace KokkosBatched {
HostSpaceType::fence();
timer.reset();

Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmHost::libxswmmOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());
auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL());
Expand Down Expand Up @@ -515,8 +515,8 @@ namespace KokkosBatched {
HostSpaceType::fence();
timer.reset();

Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::GemmHost::SIMDSerialOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());
auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,8 @@ namespace KokkosBatched {
HostSpaceType::fence();
timer.reset();

Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::GemvHost::CblasOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());
for (int j=0;j<NumVecs;++j) {
Expand Down Expand Up @@ -174,8 +174,8 @@ namespace KokkosBatched {
HostSpaceType::fence();
timer.reset();

Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::GemvHost::SerialOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());

Expand Down Expand Up @@ -249,8 +249,8 @@ namespace KokkosBatched {
HostSpaceType::fence();
timer.reset();

Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::GemvHost::SIMDSerialOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for(policy, functor_type(a));
Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::RangeTag", policy, functor_type(a));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down Expand Up @@ -345,7 +345,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for(policy, functor_type(a));
Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV1", policy, functor_type(a));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down Expand Up @@ -414,7 +414,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for(policy, functor_type(a));
Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV2", policy, functor_type(a));

DeviceSpaceType::fence();
const double t = timer.seconds();
Expand Down Expand Up @@ -486,7 +486,7 @@ namespace KokkosBatched {
DeviceSpaceType::fence();
timer.reset();

Kokkos::parallel_for(policy.set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)),
Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV3", policy.set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)),
functor_type(a));

DeviceSpaceType::fence();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ namespace KokkosBatched {
Kokkos::View<VectorType***,Kokkos::LayoutRight,HostSpaceType>
amat_simd("amat_simd", N, BlkSize, BlkSize); //, a("a", N, BlkSize, BlkSize);

Kokkos::parallel_for
(Kokkos::RangePolicy<HostSpaceType>(0, N*VectorLength),
Kokkos::parallel_for("KokkosBatched::PerfTest::LUHost::Pack",
Kokkos::RangePolicy<HostSpaceType>(0, N*VectorLength),
KOKKOS_LAMBDA(const int k) {
const int k0 = k/VectorLength, k1 = k%VectorLength;
for (int i=0;i<BlkSize;++i)
Expand Down Expand Up @@ -136,8 +136,8 @@ namespace KokkosBatched {
timer.reset();

Kokkos::RangePolicy<HostSpaceType,ScheduleType> policy(0, N*VectorLength);
Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::LUHost::LAPACKE_dgetrfOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());
auto pp = Kokkos::subview(p, k, Kokkos::ALL());
Expand Down Expand Up @@ -295,8 +295,8 @@ namespace KokkosBatched {
timer.reset();

Kokkos::RangePolicy<HostSpaceType,ScheduleType > policy(0, N);
Kokkos::parallel_for
(policy,
Kokkos::parallel_for("KokkosBatched::PerfTest::LUHost::SIMDSerialOpenMP",
policy,
KOKKOS_LAMBDA(const int k) {
auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL());

Expand Down

0 comments on commit c5c8608

Please sign in to comment.