wkcn · wkcn · Jul 15, 2020 · Jul 15, 2020 · Jul 15, 2020 · Jul 15, 2020
diff --git a/examples/ConstantOP.py b/examples/ConstantOP.py
diff --git a/mobula/glue/mx.py b/mobula/glue/mx.py
@@ -10,6 +10,8 @@
 if not hasattr(mx.nd.NDArray, 'wait_to_write'):
     mx.nd.NDArray.wait_to_write = lambda self: _LIB.MXNDArrayWaitToWrite(
         self.handle)
+if not hasattr(mx.symbol.Symbol, 'simple_bind'):
+    mx.symbol.Symbol.simple_bind = mx.symbol.Symbol._simple_bind
 
 
 def get_pointer(v):

diff --git a/mobula/inc/context/naive_ctx.h b/mobula/inc/context/naive_ctx.h
@@ -3,9 +3,7 @@
 
 #include <algorithm>
 #include <condition_variable>
-#include <map>
 #include <thread>
-#include <utility>
 #include <vector>
 
 namespace mobula {
@@ -15,8 +13,6 @@ namespace mobula {
 static thread_local int thread_local_i;
 static thread_local int thread_local_n;
 
-#include <iostream>
-
 class Barrier {
  public:
   explicit Barrier(size_t nthreads) : count_(nthreads), nthreads_(nthreads) {}
@@ -79,7 +75,7 @@ template <typename Func>
 MOBULA_DEVICE void parfor(const size_t n, Func F) {
   INDEX_TYPE_SWITCH(n, {
     index_t start, end;
-    get_parfor_range(n, thread_local_n, thread_local_i, &start, &end);
+    get_parfor_range(n, get_num_threads(), get_thread_num(), &start, &end);
     for (index_t i = start; i < end; ++i) {
       F(i);
     }

diff --git a/mobula/inc/context/openmp_ctx.h b/mobula/inc/context/openmp_ctx.h
@@ -3,31 +3,71 @@
 
 #include <omp.h>
 
-namespace mobula {
+#include <algorithm>
 
-#define KERNEL_RUN(a) (a)
 #ifndef _MSC_VER
 #define __pragma(id) _Pragma(#id)
 #endif
 
+namespace mobula {
+
+#if HOST_NUM_THREADS > 1
+
+template <typename Func>
+class KernelRunner {
+ public:
+  explicit KernelRunner(Func func) : func_(func) {}
+  template <typename... Args>
+  void operator()(const int n, Args... args) {
+    const int nthreads = std::min(n, omp_get_max_threads());
+#pragma omp parallel num_threads(nthreads)
+    { func_(n, args...); }
+  }
+
+ private:
+  Func func_;
+};
+
 MOBULA_DEVICE inline int get_num_threads() { return omp_get_num_threads(); }
 
 MOBULA_DEVICE inline int get_thread_num() { return omp_get_thread_num(); }
 
 template <typename Func>
 MOBULA_DEVICE void parfor(const size_t n, Func F) {
   INDEX_TYPE_SWITCH(n, {
-    __pragma(omp parallel for) for (index_t i = 0; i < static_cast<index_t>(n);
-                                     ++i) {
+    index_t start, end;
+    get_parfor_range(n, get_num_threads(), get_thread_num(), &start, &end);
+    for (index_t i = start; i < end; ++i) {
       F(i);
     }
   });
 }
 
-inline void __syncthreads() {
-#pragma omp barrier
+inline void __syncthreads() { __pragma(omp barrier); }
+
+#define KERNEL_RUN(a) (mobula::KernelRunner<decltype(&(a))>(&(a)))
+
+#else  // HOST_NUM_THREADS > 1 else
+
+MOBULA_DEVICE inline int get_num_threads() { return 1; }
+
+MOBULA_DEVICE inline int get_thread_num() { return 1; }
+
+template <typename Func>
+MOBULA_DEVICE void parfor(const size_t n, Func F) {
+  INDEX_TYPE_SWITCH(n, {
+    for (index_t i = 0; i < static_cast<index_t>(n); ++i) {
+      F(i);
+    }
+  });
 }
 
+inline void __syncthreads() {}
+
+#define KERNEL_RUN(a) (a)
+
+#endif  // HOST_NUM_THREADS > 1
+
 }  // namespace mobula
 
 #endif  // MOBULA_INC_CONTEXT_OPENMP_CTX_H_
diff --git a/mobula/inc/helper.h b/mobula/inc/helper.h
@@ -0,0 +1,127 @@
+#ifndef MOBULA_INC_HELPER_H_
+#define MOBULA_INC_HELPER_H_
+
+#include "defines.h"
+
+namespace mobula {
+
+// Reduce without residual value
+template <typename T>
+MOBULA_DEVICE void Reduce(const int n, const T *data, T *out,
+                          void (*func)(T &, const T &), T init) {
+  const int num_threads = get_num_threads();
+  const int thread_num = get_thread_num();
+  // 1. Reduce to `num_threads` slots
+  T res = init;
+  int start, end;
+  get_parfor_range(n, num_threads, thread_num, &start, &end);
+  for (int i = start; i < end; ++i) {
+    func(res, data[i]);
+  }
+  out[thread_num] = res;
+  __syncthreads();
+  // 2. Reduce `num_threads` slots to the first slot
+  /*
+   * for example, if num_thread is 5,
+   * thread ids: 0000, 0001, 0010, 0011, 0100
+   * data index ids: 0000, 0001, 0010, 0011, 0100
+   * step1: reduce the rightest 1st,
+   *   (0000, 0001) -> 0000 in thread 0000
+   *   (0010, 0011) -> 0010 in thread 0010 (+2)
+   *   (0100,     ) -> 0100 in thread 0100
+   * step2: reduce the rightest 2nd,
+   *   (0000, 0010) -> 0000 in thread 0000
+   *   (0100,     ) -> 0100 in thread 0100 (+4)
+   * step3: reduce the rightest 3rd,
+   *   (0000, 0100) -> 0000 in thread 0000
+   */
+  int bi = 1;
+  while (bi < num_threads) {
+    int mask = (bi << 1) - 1;
+    if ((thread_num & mask) == 0) {
+      // valid thread
+      int other_i = thread_num | bi;
+      if (other_i < num_threads) func(out[thread_num], out[other_i]);
+    }
+    bi <<= 1;
+    __syncthreads();
+  }
+}
+
+// Reduce with residual value
+template <typename T>
+MOBULA_DEVICE void Reduce(const int n, const T *data, T *out,
+                          void (*func_reduce)(T &, const T &, T &),
+                          void (*func_merge)(T &, T &, const T &, const T &),
+                          T init) {
+  const int num_threads = get_num_threads();
+  const int thread_num = get_thread_num();
+  // 1. Reduce to `num_threads` slots
+  T res = init;
+  int start, end;
+  T residual = T(0);
+  get_parfor_range(n, num_threads, thread_num, &start, &end);
+  for (int i = start; i < end; ++i) {
+    func_reduce(res, data[i], residual);
+  }
+  out[thread_num] = res;
+  __syncthreads();
+  // 2. Merge the adjacent threads
+  if ((thread_num & 1) == 0) {
+    func_reduce(out[thread_num], out[thread_num + 1], residual);
+    out[thread_num + 1] = residual;
+  }
+  __syncthreads();
+  // 3. Record the residual in out
+  if (thread_num & 1) {
+    out[thread_num] += residual;
+  }
+  __syncthreads();
+  // 4. Reduce `num_threads` slots to the first slot
+  int bi = 1 << 1;
+  while (bi < num_threads) {
+    int mask = (bi << 1) - 1;
+    if ((thread_num & mask) == 0) {
+      // valid thread
+      int other_i = thread_num | bi;
+      if (other_i < num_threads)
+        func_merge(out[thread_num], out[thread_num + 1], out[other_i],
+                   out[other_i + 1]);
+    }
+    bi <<= 1;
+    __syncthreads();
+  }
+}
+
+template <typename T>
+MOBULA_DEVICE void max_func(T &dst, const T &src) {
+  if (src > dst) dst = src;
+}
+
+template <typename T>
+MOBULA_DEVICE void add_func(T &dst, const T &src) {
+  dst += src;
+}
+
+template <typename T>
+MOBULA_DEVICE void add_residual_reduce_func(T &dst, const T &src, T &residual) {
+  T y = src - residual;
+  T t = dst + y;
+  residual = (t - dst) - y;
+  dst = t;
+}
+
+template <typename T>
+MOBULA_DEVICE void add_residual_merge_func(T &dst, T &dst_residual,
+                                           const T &src,
+                                           const T &src_residual) {
+  T t1 = dst + src;
+  T e = t1 - dst;
+  T t2 = ((src - e) + (dst - (t1 - e))) + dst_residual + src_residual;
+  dst = t1 + t2;
+  dst_residual = t2 - (dst - t1);
+}
+
+}  // namespace mobula
+
+#endif  // MOBULA_INC_HELPER_H_
diff --git a/mobula/inc/mobula_op.h b/mobula/inc/mobula_op.h
@@ -3,5 +3,6 @@
 
 #include "defines.h"
 #include "glue_mx.h"
+#include "helper.h"
 
 #endif  // MOBULA_INC_MOBULA_OP_H_