Apply modifications to account for RAFT changes (rapidsai#4077)

This PR applies modifications to the cuML codebase to account for changes in RAFT and RMM: - rapidsai/raft#283 - rapidsai/raft#285 - rapidsai/raft#286 - rapidsai/rmm#816 Authors: - Victor Lafargue (https://github.com/viclafargue) - Dante Gama Dessavre (https://github.com/dantegd) Approvers: - William Hicks (https://github.com/wphicks) - Micka (https://github.com/lowener) - Dante Gama Dessavre (https://github.com/dantegd) - Divye Gala (https://github.com/divyegala) URL: rapidsai#4077
vimarsh6739 · Aug 30, 2021 · 269b7b1 · 269b7b1
1 parent 8d1ecb4
commit 269b7b1
Show file tree

Hide file tree

Showing 260 changed files with 3,714 additions and 4,409 deletions.
diff --git a/cpp/bench/common/ml_benchmark.hpp b/cpp/bench/common/ml_benchmark.hpp
@@ -80,19 +80,15 @@ struct CudaEventTimer {
 
  private:
   ::benchmark::State* state;
-  cudaStream_t stream;
+  cudaStream_t stream = 0;
   cudaEvent_t start;
   cudaEvent_t stop;
 };  // end struct CudaEventTimer
 
 /** Main fixture to be inherited and used by all other c++ benchmarks in cuml */
 class Fixture : public ::benchmark::Fixture {
  public:
-  Fixture(const std::string& name, std::shared_ptr<raft::mr::device::allocator> _alloc)
-    : ::benchmark::Fixture(), d_alloc(_alloc)
-  {
-    SetName(name.c_str());
-  }
+  Fixture(const std::string& name) : ::benchmark::Fixture() { SetName(name.c_str()); }
   Fixture() = delete;
 
   void SetUp(const ::benchmark::State& state) override
@@ -163,19 +159,20 @@ class Fixture : public ::benchmark::Fixture {
   template <typename T>
   void alloc(T*& ptr, size_t len, bool init = false)
   {
-    auto nBytes = len * sizeof(T);
-    ptr         = (T*)d_alloc->allocate(nBytes, stream);
+    auto nBytes  = len * sizeof(T);
+    auto d_alloc = rmm::mr::get_current_device_resource();
+    ptr          = (T*)d_alloc->allocate(nBytes, stream);
     if (init) { CUDA_CHECK(cudaMemsetAsync(ptr, 0, nBytes, stream)); }
   }
 
   template <typename T>
   void dealloc(T* ptr, size_t len)
   {
+    auto d_alloc = rmm::mr::get_current_device_resource();
     d_alloc->deallocate(ptr, len * sizeof(T), stream);
   }
 
-  std::shared_ptr<raft::mr::device::allocator> d_alloc;
-  cudaStream_t stream;
+  cudaStream_t stream = 0;
   int l2CacheSize;
   char* scratchBuffer;
 };  // class Fixture

diff --git a/cpp/bench/prims/add.cu b/cpp/bench/prims/add.cu
@@ -16,7 +16,6 @@
 
 #include <common/ml_benchmark.hpp>
 #include <raft/linalg/add.cuh>
-#include <raft/mr/device/allocator.hpp>
 
 namespace MLCommon {
 namespace Bench {
@@ -28,13 +27,7 @@ struct AddParams {
 
 template <typename T>
 struct AddBench : public Fixture {
-  AddBench(const std::string& name, const AddParams& p)
-    : Fixture(
-        name,
-        std::shared_ptr<raft::mr::device::allocator>(new raft::mr::device::default_allocator)),
-      params(p)
-  {
-  }
+  AddBench(const std::string& name, const AddParams& p) : Fixture(name), params(p) {}
 
  protected:
   void allocateBuffers(const ::benchmark::State& state) override

diff --git a/cpp/bench/prims/distance_common.cuh b/cpp/bench/prims/distance_common.cuh
@@ -17,7 +17,6 @@
 #include <raft/cudart_utils.h>
 #include <common/ml_benchmark.hpp>
 #include <raft/distance/distance.cuh>
-#include <raft/mr/device/allocator.hpp>
 
 namespace MLCommon {
 namespace Bench {
@@ -31,42 +30,34 @@ struct Params {
 template <typename T, raft::distance::DistanceType DType>
 struct Distance : public Fixture {
   Distance(const std::string& name, const Params& p)
-    : Fixture(
-        name,
-        std::shared_ptr<raft::mr::device::allocator>(new raft::mr::device::default_allocator)),
-      params(p)
+    : Fixture(name), params(p), x(0, stream), y(0, stream), out(0, stream), workspace(0, stream)
   {
   }
 
  protected:
   void allocateBuffers(const ::benchmark::State& state) override
   {
-    alloc(x, params.m * params.k, true);
-    alloc(y, params.n * params.k, true);
-    alloc(out, params.m * params.n, true);
-    workspace = nullptr;
-    worksize = raft::distance::getWorkspaceSize<DType, T, T, T>(x, y, params.m, params.n, params.k);
-    if (worksize != 0) { alloc(workspace, worksize, false); }
-  }
-
-  void deallocateBuffers(const ::benchmark::State& state) override
-  {
-    dealloc(x, params.m * params.k);
-    dealloc(y, params.n * params.k);
-    dealloc(out, params.m * params.n);
-    dealloc(workspace, worksize);
+    x.resize(params.m * params.k, stream);
+    y.resize(params.n * params.k, stream);
+    out.resize(params.m * params.n, stream);
+    CUDA_CHECK(cudaMemsetAsync(x.data(), 0, x.size() * sizeof(T), stream));
+    CUDA_CHECK(cudaMemsetAsync(y.data(), 0, y.size() * sizeof(T), stream));
+    CUDA_CHECK(cudaMemsetAsync(out.data(), 0, out.size() * sizeof(T), stream));
+    worksize = raft::distance::getWorkspaceSize<DType, T, T, T>(
+      x.data(), y.data(), params.m, params.n, params.k);
+    workspace.resize(worksize, stream);
   }
 
   void runBenchmark(::benchmark::State& state) override
   {
     loopOnState(state, [this]() {
-      raft::distance::distance<DType, T, T, T>(x,
-                                               y,
-                                               out,
+      raft::distance::distance<DType, T, T, T>(x.data(),
+                                               y.data(),
+                                               out.data(),
                                                params.m,
                                                params.n,
                                                params.k,
-                                               (void*)workspace,
+                                               (void*)workspace.data(),
                                                worksize,
                                                stream,
                                                params.isRowMajor);
@@ -75,8 +66,8 @@ struct Distance : public Fixture {
 
  private:
   Params params;
-  T *x, *y, *out;
-  char* workspace;
+  rmm::device_uvector<T> x, y, out;
+  rmm::device_uvector<char> workspace;
   size_t worksize;
 };  // struct Distance
 

diff --git a/cpp/bench/prims/fused_l2_nn.cu b/cpp/bench/prims/fused_l2_nn.cu
@@ -19,7 +19,6 @@
 #include <limits>
 #include <raft/distance/fused_l2_nn.cuh>
 #include <raft/linalg/norm.cuh>
-#include <raft/mr/device/allocator.hpp>
 #include <raft/random/rng.cuh>
 
 namespace MLCommon {
@@ -32,13 +31,7 @@ struct FLNParams {
 
 template <typename T>
 struct FusedL2NN : public Fixture {
-  FusedL2NN(const std::string& name, const FLNParams& p)
-    : Fixture(
-        name,
-        std::shared_ptr<raft::mr::device::allocator>(new raft::mr::device::default_allocator)),
-      params(p)
-  {
-  }
+  FusedL2NN(const std::string& name, const FLNParams& p) : Fixture(name), params(p) {}
 
  protected:
   void allocateBuffers(const ::benchmark::State& state) override

diff --git a/cpp/bench/prims/gram_matrix.cu b/cpp/bench/prims/gram_matrix.cu
@@ -15,11 +15,11 @@
  */
 
 #include <cuml/matrix/kernelparams.h>
+#include <raft/linalg/cublas_wrappers.h>
 #include <common/ml_benchmark.hpp>
 #include <matrix/grammatrix.cuh>
 #include <matrix/kernelfactory.cuh>
 #include <memory>
-#include <raft/mr/device/allocator.hpp>
 #include <raft/random/rng.cuh>
 #include <sstream>
 #include <string>
@@ -42,10 +42,7 @@ struct GramTestParams {
 template <typename T>
 struct GramMatrix : public Fixture {
   GramMatrix(const std::string& name, const GramTestParams& p)
-    : Fixture(
-        name,
-        std::shared_ptr<raft::mr::device::allocator>(new raft::mr::device::default_allocator)),
-      params(p)
+    : Fixture(name), params(p), A(0, stream), B(0, stream), C(0, stream)
   {
     std::vector<std::string> kernel_names{"linear", "poly", "rbf", "tanh"};
     std::ostringstream oss;
@@ -63,31 +60,24 @@ struct GramMatrix : public Fixture {
  protected:
   void allocateBuffers(const ::benchmark::State& state) override
   {
-    alloc(A, params.m * params.k);
-    alloc(B, params.k * params.n);
-    alloc(C, params.m * params.n);
+    A.resize(params.m * params.k, stream);
+    B.resize(params.k * params.n, stream);
+    C.resize(params.m * params.n, stream);
     raft::random::Rng r(123456ULL);
-    r.uniform(A, params.m * params.k, T(-1.0), T(1.0), stream);
-    r.uniform(B, params.k * params.n, T(-1.0), T(1.0), stream);
-  }
-
-  void deallocateBuffers(const ::benchmark::State& state) override
-  {
-    dealloc(A, params.m * params.k);
-    dealloc(B, params.k * params.n);
-    dealloc(C, params.m * params.n);
+    r.uniform(A.data(), params.m * params.k, T(-1.0), T(1.0), stream);
+    r.uniform(B.data(), params.k * params.n, T(-1.0), T(1.0), stream);
   }
 
   void runBenchmark(::benchmark::State& state) override
   {
     if (!this->kernel) { state.SkipWithError("Kernel matrix is not initialized"); }
     loopOnState(state, [this]() {
-      (*this->kernel)(this->A,
+      (*this->kernel)(A.data(),
                       this->params.m,
                       this->params.k,
-                      this->B,
+                      B.data(),
                       this->params.n,
-                      this->C,
+                      C.data(),
                       this->params.is_row_major,
                       this->stream);
     });
@@ -98,9 +88,9 @@ struct GramMatrix : public Fixture {
   std::unique_ptr<GramMatrixBase<T>> kernel;
   GramTestParams params;
 
-  T* A;  // input matrix A, size [m * k]
-  T* B;  // input matrix B, size [n * k]
-  T* C;  // output matrix C, size [m*n]
+  rmm::device_uvector<T> A;  // input matrix A, size [m * k]
+  rmm::device_uvector<T> B;  // input matrix B, size [n * k]
+  rmm::device_uvector<T> C;  // output matrix C, size [m*n]
 };
 
 static std::vector<GramTestParams> getInputs()

diff --git a/cpp/bench/prims/make_blobs.cu b/cpp/bench/prims/make_blobs.cu
@@ -15,7 +15,6 @@
  */
 
 #include <common/ml_benchmark.hpp>
-#include <raft/mr/device/allocator.hpp>
 #include <random/make_blobs.cuh>
 
 namespace MLCommon {
@@ -30,44 +29,34 @@ struct Params {
 template <typename T>
 struct MakeBlobs : public Fixture {
   MakeBlobs(const std::string& name, const Params& p)
-    : Fixture(
-        name,
-        std::shared_ptr<raft::mr::device::allocator>(new raft::mr::device::default_allocator)),
-      params(p)
+    : Fixture(name), params(p), data(0, stream), labels(0, stream)
   {
   }
 
  protected:
   void allocateBuffers(const ::benchmark::State& state) override
   {
-    alloc(data, params.rows * params.cols);
-    alloc(labels, params.rows);
-  }
-
-  void deallocateBuffers(const ::benchmark::State& state) override
-  {
-    dealloc(data, params.rows * params.cols);
-    dealloc(labels, params.rows);
+    data.resize(params.rows * params.cols, stream);
+    labels.resize(params.rows, stream);
   }
 
   void runBenchmark(::benchmark::State& state) override
   {
     loopOnState(state, [this]() {
-      MLCommon::Random::make_blobs(data,
-                                   labels,
+      MLCommon::Random::make_blobs(data.data(),
+                                   labels.data(),
                                    params.rows,
                                    params.cols,
                                    params.clusters,
-                                   this->d_alloc,
                                    this->stream,
                                    params.row_major);
     });
   }
 
  private:
   Params params;
-  T* data;
-  int* labels;
+  rmm::device_uvector<T> data;
+  rmm::device_uvector<int> labels;
 };  // struct MakeBlobs
 
 static std::vector<Params> getInputs()

diff --git a/cpp/bench/prims/map_then_reduce.cu b/cpp/bench/prims/map_then_reduce.cu
@@ -16,7 +16,6 @@
 
 #include <common/ml_benchmark.hpp>
 #include <raft/linalg/map_then_reduce.cuh>
-#include <raft/mr/device/allocator.hpp>
 
 namespace MLCommon {
 namespace Bench {
@@ -33,13 +32,7 @@ struct Identity {
 
 template <typename T>
 struct MapThenReduce : public Fixture {
-  MapThenReduce(const std::string& name, const Params& p)
-    : Fixture(
-        name,
-        std::shared_ptr<raft::mr::device::allocator>(new raft::mr::device::default_allocator)),
-      params(p)
-  {
-  }
+  MapThenReduce(const std::string& name, const Params& p) : Fixture(name), params(p) {}
 
  protected:
   void allocateBuffers(const ::benchmark::State& state) override

diff --git a/cpp/bench/prims/matrix_vector_op.cu b/cpp/bench/prims/matrix_vector_op.cu
@@ -16,7 +16,6 @@
 
 #include <common/ml_benchmark.hpp>
 #include <raft/linalg/matrix_vector_op.cuh>
-#include <raft/mr/device/allocator.hpp>
 
 namespace MLCommon {
 namespace Bench {
@@ -29,13 +28,7 @@ struct Params {
 
 template <typename T>
 struct MatVecOp : public Fixture {
-  MatVecOp(const std::string& name, const Params& p)
-    : Fixture(
-        name,
-        std::shared_ptr<raft::mr::device::allocator>(new raft::mr::device::default_allocator)),
-      params(p)
-  {
-  }
+  MatVecOp(const std::string& name, const Params& p) : Fixture(name), params(p) {}
 
  protected:
   void allocateBuffers(const ::benchmark::State& state) override

diff --git a/cpp/bench/prims/permute.cu b/cpp/bench/prims/permute.cu
@@ -31,13 +31,7 @@ struct Params {
 
 template <typename T>
 struct Permute : public Fixture {
-  Permute(const std::string& name, const Params& p)
-    : Fixture(
-        name,
-        std::shared_ptr<raft::mr::device::allocator>(new raft::mr::device::default_allocator)),
-      params(p)
-  {
-  }
+  Permute(const std::string& name, const Params& p) : Fixture(name), params(p) {}
 
  protected:
   void allocateBuffers(const ::benchmark::State& state) override

diff --git a/cpp/bench/prims/reduce.cu b/cpp/bench/prims/reduce.cu
@@ -16,7 +16,6 @@
 
 #include <common/ml_benchmark.hpp>
 #include <raft/linalg/reduce.cuh>
-#include <raft/mr/device/allocator.hpp>
 
 namespace MLCommon {
 namespace Bench {
@@ -29,13 +28,7 @@ struct Params {
 
 template <typename T>
 struct Reduce : public Fixture {
-  Reduce(const std::string& name, const Params& p)
-    : Fixture(
-        name,
-        std::shared_ptr<raft::mr::device::allocator>(new raft::mr::device::default_allocator)),
-      params(p)
-  {
-  }
+  Reduce(const std::string& name, const Params& p) : Fixture(name), params(p) {}
 
  protected:
   void allocateBuffers(const ::benchmark::State& state) override