Skip to content

Commit

Permalink
convert to node-api
Browse files Browse the repository at this point in the history
  • Loading branch information
zbjornson committed Sep 18, 2023
1 parent e49e947 commit e8223f5
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 68 deletions.
30 changes: 12 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,6 @@ library uses the fastest available SIMD instructions ([PSHUFB (SSSE3) or VPSHUFB
(NEON)](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0489h/Cihjgdid.html)),
which process multiple array elements simultaneously.

Native code requires one of:
* MSVC 2015 or later
* Clang 3.4.x or later
* GCC 4.8.x or later
* ICC 16 or later

In the browser or when native code is unavailable, this library falls back to
the fastest JavaScript implementation. The JavaScript implementation is also
always explicitly available:
Expand All @@ -52,28 +46,28 @@ import {js} from "bswap"; // Use javascript implementation explicitly

Showing millions of elements processed per second when invoked with a
10,000-element array. (Run the benchmark suite to see results for varying array
lengths and other libraries.) Ran on an Intel i7-7700HQ 2.80 GHz processor (AVX2
supported) or Cavium ThunderX 2.0 GHz processor (ARM NEON); Node.js v8.x;
Windows 10 (MSVC) or Ubuntu 16.04 (GCC, Clang). (Note that a 10,000-element
lengths and other libraries.) Ran on an Intel i9-11900H 2.50 GHz processor (AVX2
supported) or Cavium ThunderX 2.0 GHz processor (ARM NEON); Node.js v16.x;
Windows 11 (MSVC) or Ubuntu 20.04 (GCC, Clang). (Note that a 10,000-element
Int16Array fits in L1 cache, whereas a 10,000-element Int32Array or Float64Array
does not.)

| compiler | C++ | JS | Native:JS | Node.js | Native:Node |
| --------- | -----: | ---: | --------: | ------: | ----------: |
| **16 bit types (Uint16Array, Int16Array)**
| MSVC 2015 | 32,286 | 625 | 51.7x | 12,141 | 2.7x |
| GCC 8.1 | 31,549 | (same) | 50.5x | 1,507 | 20.9x |
| Clang 6 | 30,238 | (same) | 48.4x | (same) | 20.1x |
| MSVC 2022 | 46,221 | 722 | 64.0x | 18,213 | 2.5x |
| GCC 9.4 | 40,945 | | 56.8x | 13,720 | 2.9x |
| Clang 15 | 47,398 | | 65.6x | | 3.5x |
| GCC-ARM | 2,677 | 183 | 14.6x | 297 | 9.0x |
| **32 bit types (Uint32Array, Int32Array, Float32Array)**
| MSVC 2015 | 12,558 | 342 | 36.7x | 5,840 | 2.2x |
| GCC 8.1 | 12,074 | (same) | 35.3x | 2,361 | 5.1x |
| Clang 6 | 12,587 | (same) | 36.8x | (same) | 5.3x |
| MSVC 2022 | 27,459 | 342 | 36.7x | 9,431 | 2.9x |
| GCC 9.4 | 23,613 | | 61.9x | 2,842 | 8.3x |
| Clang 15 | 29,013 | | 84.8x | | 10.2x |
| GCC-ARM | 670 | 94 | 7.1x | 249 | 2.7x |
| **64 bit types (Float64Array)**
| MSVC 2015 | 6,841 | 179 | 38.2x | 3,043 | 2.2x |
| GCC 8.1 | 6,528 | (same) | 36.5x | 1,790 | 3.6x |
| Clang 6 | 6,598 | (same) | 36.9x | (same) | 3.7x |
| MSVC 2022 | 9,005 | 179 | 38.2x | 4,348 | 2.1x |
| GCC 9.4 | 8,774 | | 49.1x | 2,642 | 3.3x |
| Clang 15 | 8,937 | | 49.9x | | 3.4x |
| GCC-ARM | 382 | 49 | 7.8x | 213 | 1.8x |

There's an AVX512 implementation that is disabled by default. On the Cascade
Expand Down
6 changes: 0 additions & 6 deletions binding.gyp
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,6 @@
{
"target_name": "bswap",
"sources": [ "src/bswap.cc" ],
"include_dirs" : [
"<!(node -p \"require('node-addon-api').include_dir\")"
],
"defines": [
"NAPI_DISABLE_CPP_EXCEPTIONS"
],
"cflags":[
"-fvisibility=hidden",
"-falign-loops=32", # See readme; significant improvement for some cases
Expand Down
4 changes: 1 addition & 3 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,7 @@
"url": "https://github.com/zbjornson/node-bswap/issues"
},
"homepage": "https://github.com/zbjornson/node-bswap#readme",
"dependencies": {
"node-addon-api": "^7.0.0"
},
"dependencies": {},
"devDependencies": {
"mocha": "^10.0.0"
},
Expand Down
125 changes: 84 additions & 41 deletions src/bswap.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include "napi.h"
#include "node_api.h"
#include <cstdint>
#include <cstddef>

Expand Down Expand Up @@ -40,8 +40,6 @@ static inline void swap(uint64_t* val) { *val = BSWAP_INTRINSIC_8(*val); }

template<typename STYPE, class VTYPE>
static void shuffle(STYPE* data, size_t elemLength) {
uint8_t* bytes = reinterpret_cast<uint8_t*>(data);

size_t elemIdx = 0;
constexpr size_t vectSize = VTYPE::size();

Expand Down Expand Up @@ -80,51 +78,95 @@ static void shuffle(STYPE* data, size_t elemLength) {
}

// JavaScript-callable entry point, templated on the SIMD vector wrapper VTYPE.
// It checks that the first argument is a typed array, fetches the array's
// element type, element count and data pointer, and hands the pointer and
// count to shuffle<> instantiated for the matching 16-, 32- or 64-bit unsigned
// integer type. The node-api version always returns nullptr; failures are
// reported by throwing a JavaScript error.
// NOTE(review): this span is taken from a commit diff view, so the removed
// node-addon-api lines and the added node-api lines appear interleaved.
template <class VTYPE>
void flipBytes(const Napi::CallbackInfo& info) {
napi_value flipBytes(napi_env env, napi_callback_info info) {
// TODO(perf): consider this to warm up the wider registers:
// asm volatile("vxorps ymm0, ymm0, ymm0" : : "ymm0")

if (!info[0].IsTypedArray()) {
Napi::Error::New(info.Env(), "Expected typed array").ThrowAsJavaScriptException();
return;
napi_status status;

// Fetch at most one argument; argc is updated to the number actually passed.
napi_value args[1];
size_t argc = 1;
status = napi_get_cb_info(env, info, &argc, args, nullptr, nullptr);
if (status != napi_ok) goto error;

// Calling with no arguments is reported with the same message as a non-typed-array argument.
if (argc < 1) {
napi_throw_error(env, NULL, "Expected typed array");
return nullptr;
}

bool isTypedArray;
status = napi_is_typedarray(env, args[0], &isTypedArray);
if (status != napi_ok) goto error;
if (!isTypedArray) {
napi_throw_error(env, NULL, "Expected typed array");
return nullptr;
}

auto arr = info[0].As<Napi::TypedArray>();
// Only the element type, element count and data pointer are requested; the
// remaining out-parameters are passed as nullptr.
napi_typedarray_type type;
size_t elemLength;
void* data;
napi_status ok = napi_get_typedarray_info(
info.Env(), arr, &type, &elemLength, &data, nullptr, nullptr);
if (ok != napi_ok) {
Napi::Error::New(info.Env(), "Failed to get typed array info").ThrowAsJavaScriptException();
return;
}
status = napi_get_typedarray_info(env, args[0], &type, &elemLength, &data, nullptr, nullptr);
if (status != napi_ok) goto error;

// Dispatch on element width: signed, unsigned and floating-point arrays of
// the same width share one shuffle<> instantiation.
switch (type) {
case napi_int16_array:
case napi_uint16_array:
return shuffle<uint16_t, VTYPE>(reinterpret_cast<uint16_t*>(data), elemLength);
shuffle<uint16_t, VTYPE>(reinterpret_cast<uint16_t*>(data), elemLength);
return nullptr;
case napi_int32_array:
case napi_uint32_array:
case napi_float32_array:
return shuffle<uint32_t, VTYPE>(reinterpret_cast<uint32_t*>(data), elemLength);
shuffle<uint32_t, VTYPE>(reinterpret_cast<uint32_t*>(data), elemLength);
return nullptr;
case napi_float64_array:
// The BigInt array enumerators are only referenced when NAPI_VERSION > 5.
#if (NAPI_VERSION > 5)
case napi_bigint64_array:
case napi_biguint64_array:
#endif // (NAPI_VERSION > 5)
return shuffle<uint64_t, VTYPE>(reinterpret_cast<uint64_t*>(data), elemLength);
#endif // (NAPI_VERSION > 5)
shuffle<uint64_t, VTYPE>(reinterpret_cast<uint64_t*>(data), elemLength);
return nullptr;
// 8-bit arrays and any unrecognized type return without calling shuffle.
case napi_int8_array:
case napi_uint8_array:
case napi_uint8_clamped_array:
default:
return;
return nullptr;
}

// Shared failure path for every node-api call above: throw the last recorded
// error message unless an exception is already pending.
// NOTE(review): error_info is dereferenced without checking the result of
// napi_get_last_error_info, and is_pending is read without checking
// napi_is_exception_pending -- confirm neither call can fail here.
error:
const napi_extended_error_info* error_info = nullptr;
napi_get_last_error_info(env, &error_info);
const char* err_message = error_info->error_message;
bool is_pending;
napi_is_exception_pending(env, &is_pending);
if (!is_pending) {
const char* message = err_message == nullptr ? "empty error message" : err_message;
napi_throw_error(env, nullptr, message);
}
return nullptr;
}

Napi::Object Init(Napi::Env env, Napi::Object exports) {
Napi::Function fn;
Napi::String ise;
// Evaluates a Node-API call and, if it did not return napi_ok, leaves the
// enclosing function by returning nullptr.
//
//   env   the napi_env used to query and raise the error
//   call  an expression yielding a napi_status
//
// On failure a JavaScript error carrying the last Node-API error message is
// thrown, unless an exception is already pending (in which case that exception
// is left in place). The enclosing function returns in BOTH cases, so code
// after the macro never runs with the outputs of a failed call.
// Must be used inside a function whose return type accepts nullptr.
#define NAPI_CALL(env, call)                                                   \
  do {                                                                         \
    const napi_status napi_call_status = (call);                               \
    if (napi_call_status != napi_ok) {                                         \
      const napi_extended_error_info* error_info = nullptr;                    \
      napi_get_last_error_info((env), &error_info);                            \
      /* Default to "not pending" so a failed query still raises an error. */  \
      bool is_pending = false;                                                 \
      napi_is_exception_pending((env), &is_pending);                           \
      if (!is_pending) {                                                       \
        /* error_info stays null if the lookup above failed. */                \
        const char* message =                                                  \
            (error_info != nullptr && error_info->error_message != nullptr)    \
                ? error_info->error_message                                    \
                : "empty error message";                                       \
        napi_throw_error((env), nullptr, message);                             \
      }                                                                        \
      return nullptr;                                                          \
    }                                                                          \
  } while (0)

napi_value Init(napi_env env, napi_value exports) {
napi_value fn;
napi_value ise;

// MSVC doesn't have any equivalent to -march=native, but it will emit
// instructions from any instruction set when intrinsics are used. This lets
Expand All @@ -133,39 +175,40 @@ Napi::Object Init(Napi::Env env, Napi::Object exports) {
#ifdef _MSC_VER
// Warning: Do not move the ternary outside of the function-creation call
// (Napi::Function::New / napi_create_function). Performance will tank.
# ifdef BSWAP_USE_AVX512
fn = Napi::Function::New(env,
NAPI_CALL(env, napi_create_function(env, "bswap", NAPI_AUTO_LENGTH,
supportsAVX512BW() ? flipBytes<Vec512> :
supportsAVX2() ? flipBytes<Vec256> : flipBytes<Vec128>);
ise = Napi::String::New(env,
supportsAVX512BW() ? "AVX512" : supportsAVX2() ? "AVX2" : "SSSE3");
supportsAVX2() ? flipBytes<Vec256> : flipBytes<Vec128>, NULL, &fn));
NAPI_CALL(env, napi_create_string_latin1(env,
supportsAVX512BW() ? "AVX512" : supportsAVX2() ? "AVX2" : "SSSE3",
NAPI_AUTO_LENGTH, &ise));
# else
fn = Napi::Function::New(env,
supportsAVX2() ? flipBytes<Vec256> : flipBytes<Vec128>);
ise = Napi::String::New(env,
supportsAVX2() ? "AVX2" : "SSSE3");
NAPI_CALL(env, napi_create_function(env, "bswap", NAPI_AUTO_LENGTH,
supportsAVX2() ? flipBytes<Vec256> : flipBytes<Vec128>, NULL, &fn));
NAPI_CALL(env, napi_create_string_latin1(env, supportsAVX2() ? "AVX2" : "SSSE3", NAPI_AUTO_LENGTH, &ise));
# endif // BSWAP_USE_AVX512
#else
// GNU-compatible compilers have -march=native, and refuse to emit
// instructions from an instruction set less than the -m flags allow.
# if defined(__AVX512BW__) && defined(BSWAP_USE_AVX512)
// Disabled by default because it is slower than AVX2.
fn = Napi::Function::New(env, flipBytes<Vec512>);
ise = Napi::String::New(env, "AVX512");
NAPI_CALL(env, napi_create_function(env, "bswap", NAPI_AUTO_LENGTH, flipBytes<Vec512>, NULL, &fn));
NAPI_CALL(env, napi_create_string_latin1(env, "AVX512", NAPI_AUTO_LENGTH, &ise));
# elif defined(__AVX2__)
fn = Napi::Function::New(env, flipBytes<Vec256>);
ise = Napi::String::New(env, "AVX2");
NAPI_CALL(env, napi_create_function(env, "bswap", NAPI_AUTO_LENGTH, flipBytes<Vec256>, NULL, &fn));
NAPI_CALL(env, napi_create_string_latin1(env, "AVX2", NAPI_AUTO_LENGTH, &ise));
# elif defined(__SSSE3__)
fn = Napi::Function::New(env, flipBytes<Vec128>);
ise = Napi::String::New(env, "SSSE3");
NAPI_CALL(env, napi_create_function(env, "bswap", NAPI_AUTO_LENGTH, flipBytes<Vec128>, NULL, &fn));
NAPI_CALL(env, napi_create_string_latin1(env, "SSSE3", NAPI_AUTO_LENGTH, &ise));
# elif defined(__ARM_NEON)
fn = Napi::Function::New(env, flipBytes<VecNeon>);
ise = Napi::String::New(env, "NEON");
NAPI_CALL(env, napi_create_function(env, "bswap", NAPI_AUTO_LENGTH, flipBytes<VecNeon>, NULL, &fn));
NAPI_CALL(env, napi_create_string_latin1(env, "NEON", NAPI_AUTO_LENGTH, &ise));
# endif
#endif

exports.Set(Napi::String::New(env, "bswap"), fn);
exports.Set(Napi::String::New(env, "ise"), ise);
NAPI_CALL(env, napi_set_named_property(env, exports, "bswap", fn));
NAPI_CALL(env, napi_set_named_property(env, exports, "ise", ise));

return exports;
}

NODE_API_MODULE(bswap, Init);
NAPI_MODULE(bswap, Init)

0 comments on commit e8223f5

Please sign in to comment.