Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug in SoftmaxOp, compatible with MXNet 2.0, add custom Op C++ helper, refactor omp context #96

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
84 changes: 0 additions & 84 deletions examples/ConstantOP.py

This file was deleted.

2 changes: 2 additions & 0 deletions mobula/glue/mx.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
if not hasattr(mx.nd.NDArray, 'wait_to_write'):
mx.nd.NDArray.wait_to_write = lambda self: _LIB.MXNDArrayWaitToWrite(
self.handle)
if not hasattr(mx.symbol.Symbol, 'simple_bind'):
mx.symbol.Symbol.simple_bind = mx.symbol.Symbol._simple_bind


def get_pointer(v):
Expand Down
6 changes: 1 addition & 5 deletions mobula/inc/context/naive_ctx.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@

#include <algorithm>
#include <condition_variable>
#include <map>
#include <thread>
#include <utility>
#include <vector>

namespace mobula {
Expand All @@ -15,8 +13,6 @@ namespace mobula {
static thread_local int thread_local_i;
static thread_local int thread_local_n;

#include <iostream>

class Barrier {
public:
explicit Barrier(size_t nthreads) : count_(nthreads), nthreads_(nthreads) {}
Expand Down Expand Up @@ -79,7 +75,7 @@ template <typename Func>
MOBULA_DEVICE void parfor(const size_t n, Func F) {
INDEX_TYPE_SWITCH(n, {
index_t start, end;
get_parfor_range(n, thread_local_n, thread_local_i, &start, &end);
get_parfor_range(n, get_num_threads(), get_thread_num(), &start, &end);
for (index_t i = start; i < end; ++i) {
F(i);
}
Expand Down
52 changes: 46 additions & 6 deletions mobula/inc/context/openmp_ctx.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,71 @@

#include <omp.h>

namespace mobula {
#include <algorithm>

#define KERNEL_RUN(a) (a)
#ifndef _MSC_VER
#define __pragma(id) _Pragma(#id)
#endif

namespace mobula {

#if HOST_NUM_THREADS > 1

template <typename Func>
class KernelRunner {
public:
explicit KernelRunner(Func func) : func_(func) {}
template <typename... Args>
void operator()(const int n, Args... args) {
const int nthreads = std::min(n, omp_get_max_threads());
#pragma omp parallel num_threads(nthreads)
{ func_(n, args...); }
}

private:
Func func_;
};

MOBULA_DEVICE inline int get_num_threads() { return omp_get_num_threads(); }

MOBULA_DEVICE inline int get_thread_num() { return omp_get_thread_num(); }

template <typename Func>
MOBULA_DEVICE void parfor(const size_t n, Func F) {
INDEX_TYPE_SWITCH(n, {
__pragma(omp parallel for) for (index_t i = 0; i < static_cast<index_t>(n);
++i) {
index_t start, end;
get_parfor_range(n, get_num_threads(), get_thread_num(), &start, &end);
for (index_t i = start; i < end; ++i) {
F(i);
}
});
}

inline void __syncthreads() {
#pragma omp barrier
inline void __syncthreads() { __pragma(omp barrier); }

#define KERNEL_RUN(a) (mobula::KernelRunner<decltype(&(a))>(&(a)))

#else // HOST_NUM_THREADS > 1 else

MOBULA_DEVICE inline int get_num_threads() { return 1; }

MOBULA_DEVICE inline int get_thread_num() { return 1; }

template <typename Func>
MOBULA_DEVICE void parfor(const size_t n, Func F) {
INDEX_TYPE_SWITCH(n, {
for (index_t i = 0; i < static_cast<index_t>(n); ++i) {
F(i);
}
});
}

inline void __syncthreads() {}

#define KERNEL_RUN(a) (a)

#endif // HOST_NUM_THREADS > 1

} // namespace mobula

#endif // MOBULA_INC_CONTEXT_OPENMP_CTX_H_
127 changes: 127 additions & 0 deletions mobula/inc/helper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#ifndef MOBULA_INC_HELPER_H_
#define MOBULA_INC_HELPER_H_

#include "defines.h"

namespace mobula {

// Reduce without residual value
template <typename T>
MOBULA_DEVICE void Reduce(const int n, const T *data, T *out,
void (*func)(T &, const T &), T init) {
const int num_threads = get_num_threads();
const int thread_num = get_thread_num();
// 1. Reduce to `num_threads` slots
T res = init;
int start, end;
get_parfor_range(n, num_threads, thread_num, &start, &end);
for (int i = start; i < end; ++i) {
func(res, data[i]);
}
out[thread_num] = res;
__syncthreads();
// 2. Reduce `num_threads` slots to the first slot
/*
* for example, if num_thread is 5,
* thread ids: 0000, 0001, 0010, 0011, 0100
* data index ids: 0000, 0001, 0010, 0011, 0100
* step1: reduce the rightest 1st,
* (0000, 0001) -> 0000 in thread 0000
* (0010, 0011) -> 0010 in thread 0010 (+2)
* (0100, ) -> 0100 in thread 0100
* step2: reduce the rightest 2nd,
* (0000, 0010) -> 0000 in thread 0000
* (0100, ) -> 0100 in thread 0100 (+4)
* step3: reduce the rightest 3rd,
* (0000, 0100) -> 0000 in thread 0000
*/
int bi = 1;
while (bi < num_threads) {
int mask = (bi << 1) - 1;
if ((thread_num & mask) == 0) {
// valid thread
int other_i = thread_num | bi;
if (other_i < num_threads) func(out[thread_num], out[other_i]);
}
bi <<= 1;
__syncthreads();
}
}

// Reduce with residual value
template <typename T>
MOBULA_DEVICE void Reduce(const int n, const T *data, T *out,
void (*func_reduce)(T &, const T &, T &),
void (*func_merge)(T &, T &, const T &, const T &),
T init) {
const int num_threads = get_num_threads();
const int thread_num = get_thread_num();
// 1. Reduce to `num_threads` slots
T res = init;
int start, end;
T residual = T(0);
get_parfor_range(n, num_threads, thread_num, &start, &end);
for (int i = start; i < end; ++i) {
func_reduce(res, data[i], residual);
}
out[thread_num] = res;
__syncthreads();
// 2. Merge the adjacent threads
if ((thread_num & 1) == 0) {
func_reduce(out[thread_num], out[thread_num + 1], residual);
out[thread_num + 1] = residual;
}
__syncthreads();
// 3. Record the residual in out
if (thread_num & 1) {
out[thread_num] += residual;
}
__syncthreads();
// 4. Reduce `num_threads` slots to the first slot
int bi = 1 << 1;
while (bi < num_threads) {
int mask = (bi << 1) - 1;
if ((thread_num & mask) == 0) {
// valid thread
int other_i = thread_num | bi;
if (other_i < num_threads)
func_merge(out[thread_num], out[thread_num + 1], out[other_i],
out[other_i + 1]);
}
bi <<= 1;
__syncthreads();
}
}

template <typename T>
MOBULA_DEVICE void max_func(T &dst, const T &src) {
if (src > dst) dst = src;
}

template <typename T>
MOBULA_DEVICE void add_func(T &dst, const T &src) {
dst += src;
}

template <typename T>
MOBULA_DEVICE void add_residual_reduce_func(T &dst, const T &src, T &residual) {
T y = src - residual;
T t = dst + y;
residual = (t - dst) - y;
dst = t;
}

template <typename T>
MOBULA_DEVICE void add_residual_merge_func(T &dst, T &dst_residual,
const T &src,
const T &src_residual) {
T t1 = dst + src;
T e = t1 - dst;
T t2 = ((src - e) + (dst - (t1 - e))) + dst_residual + src_residual;
dst = t1 + t2;
dst_residual = t2 - (dst - t1);
}

} // namespace mobula

#endif // MOBULA_INC_HELPER_H_
1 change: 1 addition & 0 deletions mobula/inc/mobula_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@

#include "defines.h"
#include "glue_mx.h"
#include "helper.h"

#endif // MOBULA_INC_MOBULA_OP_H_