Permalink
Browse files

Add AVX intrinsics to vectorize & speed up FP16-CPU computations (#574)

* Add AVX intrinsics to vectorize & speed up FP16-CPU computations

* Add compilation test & runtime identification of AVX/F16C instinsics
  • Loading branch information...
alsrgv committed Oct 23, 2018
1 parent 4efa3fb commit 156c61b41860b9523cf2acc984217a2a1e8dc811
Showing with 95 additions and 18 deletions.
  1. +78 −0 horovod/common/half.cc
  2. +7 −0 horovod/common/half.h
  3. +0 −16 horovod/common/operations.cc
  4. +10 −2 setup.py
@@ -0,0 +1,78 @@
// Copyright 2018 Uber Technologies, Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#if __AVX__ && __F16C__
#include <cpuid.h>
#include <immintrin.h>
#endif
#include "half.h"
namespace horovod {
namespace common {
#if __AVX__ && __F16C__
// Query CPUID to determine AVX and F16C runtime support.
bool is_avx_and_f16c() {
static bool initialized = false;
static bool result = false;
if (!initialized) {
unsigned int eax, ebx, ecx, edx;
if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
result = (ecx & bit_AVX) && (ecx & bit_F16C);
}
initialized = true;
}
return result;
}
#endif
// float16 custom data type summation operation.
void float16_sum(void* invec, void* inoutvec, int* len,
MPI_Datatype* datatype) {
// cast invec and inoutvec to your float16 type
auto* in = (unsigned short*)invec;
auto* inout = (unsigned short*)inoutvec;
int i = 0;
#if __AVX__ && __F16C__
if (is_avx_and_f16c()) {
for (; i < (*len / 8) * 8; i += 8) {
// convert in & inout to m256
__m256 in_m256 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(in + i)));
__m256 inout_m256 =
_mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(inout + i)));
// add them together to new_inout_m256
__m256 new_inout_m256 = _mm256_add_ps(in_m256, inout_m256);
// convert back and store in inout
__m128i new_inout_m128i = _mm256_cvtps_ph(new_inout_m256, 0);
_mm_storeu_si128((__m128i*)(inout + i), new_inout_m128i);
}
}
#endif
for (; i < *len; ++i) {
float in_float;
float inout_float;
HalfBits2Float(in + i, &in_float);
HalfBits2Float(inout + i, &inout_float);
inout_float += in_float;
Float2HalfBits(&inout_float, inout + i);
}
}
} // namespace common
} // namespace horovod
@@ -26,6 +26,11 @@
#ifndef HOROVOD_HALF_H
#define HOROVOD_HALF_H
#include <stdint.h>
#define OMPI_SKIP_MPICXX
#include "mpi.h"
namespace horovod {
namespace common {
@@ -127,6 +132,8 @@ inline void Float2HalfBits(float* src, unsigned short* dest) {
*dest = u;
}
void float16_sum(void* invec, void* inoutvec, int* len, MPI_Datatype* datatype);
} // namespace common
} // namespace horovod
@@ -1411,22 +1411,6 @@ void CheckForStalledTensors(HorovodGlobalState& state) {
}
}
// float16 custom data type summation operation.
void float16_sum(void* invec, void* inoutvec, int* len,
MPI_Datatype* datatype) {
// cast invec and inoutvec to your float16 type
auto* in = (unsigned short*)invec;
auto* inout = (unsigned short*)inoutvec;
for (int i = 0; i < *len; ++i) {
float in_float;
float inout_float;
HalfBits2Float(in + i, &in_float);
HalfBits2Float(inout + i, &inout_float);
inout_float += in_float;
Float2HalfBits(&inout_float, inout + i);
}
}
// The MPI background thread loop coordinates all the MPI processes and the
// tensor reductions. The design of the communicator mechanism is limited by a
// few considerations:
@@ -68,11 +68,18 @@ def check_tf_version():
def get_cpp_flags(build_ext):
last_err = None
default_flags = ['-std=c++11', '-fPIC', '-O2']
avx_flags = ['-mf16c', '-mavx']
if sys.platform == 'darwin':
# Darwin most likely will have Clang, which has libc++.
flags_to_try = [default_flags + ['-stdlib=libc++'], default_flags]
flags_to_try = [default_flags + ['-stdlib=libc++'] + avx_flags,
default_flags + avx_flags,
default_flags + ['-stdlib=libc++'],
default_flags]
else:
flags_to_try = [default_flags, default_flags + ['-stdlib=libc++']]
flags_to_try = [default_flags + avx_flags,
default_flags + ['-stdlib=libc++'] + avx_flags,
default_flags,
default_flags + ['-stdlib=libc++']]
for cpp_flags in flags_to_try:
try:
test_compile(build_ext, 'test_cpp_flags', extra_compile_preargs=cpp_flags,
@@ -421,6 +428,7 @@ def get_common_options(build_ext):
INCLUDES = []
SOURCES = ['horovod/common/common.cc',
'horovod/common/mpi_message.cc',
'horovod/common/half.cc',
'horovod/common/operations.cc',
'horovod/common/timeline.cc']
COMPILE_FLAGS = cpp_flags + shlex.split(mpi_flags)

0 comments on commit 156c61b

Please sign in to comment.