Skip to content

Commit

Permalink
Apply modifications to account for RAFT changes (rapidsai#4077)
Browse files Browse the repository at this point in the history
This PR applies modifications to the cuML codebase to account for changes in RAFT and RMM:
- rapidsai/raft#283
- rapidsai/raft#285
- rapidsai/raft#286
- rapidsai/rmm#816

Authors:
  - Victor Lafargue (https://github.com/viclafargue)
  - Dante Gama Dessavre (https://github.com/dantegd)

Approvers:
  - William Hicks (https://github.com/wphicks)
  - Micka (https://github.com/lowener)
  - Dante Gama Dessavre (https://github.com/dantegd)
  - Divye Gala (https://github.com/divyegala)

URL: rapidsai#4077
  • Loading branch information
viclafargue authored Aug 30, 2021
1 parent 8d1ecb4 commit 269b7b1
Show file tree
Hide file tree
Showing 260 changed files with 3,714 additions and 4,409 deletions.
17 changes: 7 additions & 10 deletions cpp/bench/common/ml_benchmark.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,19 +80,15 @@ struct CudaEventTimer {

private:
::benchmark::State* state;
cudaStream_t stream;
cudaStream_t stream = 0;
cudaEvent_t start;
cudaEvent_t stop;
}; // end struct CudaEventTimer

/** Main fixture to be inherited and used by all other c++ benchmarks in cuml */
class Fixture : public ::benchmark::Fixture {
public:
Fixture(const std::string& name, std::shared_ptr<raft::mr::device::allocator> _alloc)
: ::benchmark::Fixture(), d_alloc(_alloc)
{
SetName(name.c_str());
}
Fixture(const std::string& name) : ::benchmark::Fixture() { SetName(name.c_str()); }
Fixture() = delete;

void SetUp(const ::benchmark::State& state) override
Expand Down Expand Up @@ -163,19 +159,20 @@ class Fixture : public ::benchmark::Fixture {
/**
 * Allocate room for `len` elements of type T and store the address in `ptr`.
 *
 * The request is `len * sizeof(T)` bytes, issued through `d_alloc->allocate`
 * on the fixture's `stream`. When `init` is true the buffer is zero-filled
 * with cudaMemsetAsync on that same stream.
 *
 * NOTE(review): this span is a rendered diff hunk without +/- markers, so the
 * pre-change and post-change statements appear back to back (hence `nBytes`
 * and the `allocate` call showing up twice). The first pair looks like the
 * removed version that used the `d_alloc` member; the following three lines
 * look like the replacement that fetches the resource locally -- confirm
 * against the commit.
 *
 * @tparam T    element type of the buffer
 * @param ptr   [out] receives the address returned by `allocate`
 * @param len   number of elements (not bytes)
 * @param init  zero-fill the buffer when true (default: false)
 */
template <typename T>
void alloc(T*& ptr, size_t len, bool init = false)
{
auto nBytes = len * sizeof(T);
ptr = (T*)d_alloc->allocate(nBytes, stream);
auto nBytes = len * sizeof(T);
// Resource is looked up at call time via rmm::mr::get_current_device_resource().
auto d_alloc = rmm::mr::get_current_device_resource();
ptr = (T*)d_alloc->allocate(nBytes, stream);
if (init) { CUDA_CHECK(cudaMemsetAsync(ptr, 0, nBytes, stream)); }
}

/**
 * Release a buffer of `len` elements of type T.
 *
 * Calls `deallocate` on the resource returned by
 * rmm::mr::get_current_device_resource(), passing `len * sizeof(T)` as the
 * byte count and the fixture's `stream`.
 *
 * @tparam T   element type of the buffer
 * @param ptr  address to release
 * @param len  number of elements (not bytes); presumably must match the
 *             length used when the buffer was obtained -- confirm with callers
 */
template <typename T>
void dealloc(T* ptr, size_t len)
{
auto d_alloc = rmm::mr::get_current_device_resource();
d_alloc->deallocate(ptr, len * sizeof(T), stream);
}

std::shared_ptr<raft::mr::device::allocator> d_alloc;
cudaStream_t stream;
cudaStream_t stream = 0;
int l2CacheSize;
char* scratchBuffer;
}; // class Fixture
Expand Down
9 changes: 1 addition & 8 deletions cpp/bench/prims/add.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

#include <common/ml_benchmark.hpp>
#include <raft/linalg/add.cuh>
#include <raft/mr/device/allocator.hpp>

namespace MLCommon {
namespace Bench {
Expand All @@ -28,13 +27,7 @@ struct AddParams {

template <typename T>
struct AddBench : public Fixture {
AddBench(const std::string& name, const AddParams& p)
: Fixture(
name,
std::shared_ptr<raft::mr::device::allocator>(new raft::mr::device::default_allocator)),
params(p)
{
}
AddBench(const std::string& name, const AddParams& p) : Fixture(name), params(p) {}

protected:
void allocateBuffers(const ::benchmark::State& state) override
Expand Down
41 changes: 16 additions & 25 deletions cpp/bench/prims/distance_common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
#include <raft/cudart_utils.h>
#include <common/ml_benchmark.hpp>
#include <raft/distance/distance.cuh>
#include <raft/mr/device/allocator.hpp>

namespace MLCommon {
namespace Bench {
Expand All @@ -31,42 +30,34 @@ struct Params {
template <typename T, raft::distance::DistanceType DType>
struct Distance : public Fixture {
Distance(const std::string& name, const Params& p)
: Fixture(
name,
std::shared_ptr<raft::mr::device::allocator>(new raft::mr::device::default_allocator)),
params(p)
: Fixture(name), params(p), x(0, stream), y(0, stream), out(0, stream), workspace(0, stream)
{
}

protected:
void allocateBuffers(const ::benchmark::State& state) override
{
alloc(x, params.m * params.k, true);
alloc(y, params.n * params.k, true);
alloc(out, params.m * params.n, true);
workspace = nullptr;
worksize = raft::distance::getWorkspaceSize<DType, T, T, T>(x, y, params.m, params.n, params.k);
if (worksize != 0) { alloc(workspace, worksize, false); }
}

void deallocateBuffers(const ::benchmark::State& state) override
{
dealloc(x, params.m * params.k);
dealloc(y, params.n * params.k);
dealloc(out, params.m * params.n);
dealloc(workspace, worksize);
x.resize(params.m * params.k, stream);
y.resize(params.n * params.k, stream);
out.resize(params.m * params.n, stream);
CUDA_CHECK(cudaMemsetAsync(x.data(), 0, x.size() * sizeof(T), stream));
CUDA_CHECK(cudaMemsetAsync(y.data(), 0, y.size() * sizeof(T), stream));
CUDA_CHECK(cudaMemsetAsync(out.data(), 0, out.size() * sizeof(T), stream));
worksize = raft::distance::getWorkspaceSize<DType, T, T, T>(
x.data(), y.data(), params.m, params.n, params.k);
workspace.resize(worksize, stream);
}

void runBenchmark(::benchmark::State& state) override
{
loopOnState(state, [this]() {
raft::distance::distance<DType, T, T, T>(x,
y,
out,
raft::distance::distance<DType, T, T, T>(x.data(),
y.data(),
out.data(),
params.m,
params.n,
params.k,
(void*)workspace,
(void*)workspace.data(),
worksize,
stream,
params.isRowMajor);
Expand All @@ -75,8 +66,8 @@ struct Distance : public Fixture {

private:
Params params;
T *x, *y, *out;
char* workspace;
rmm::device_uvector<T> x, y, out;
rmm::device_uvector<char> workspace;
size_t worksize;
}; // struct Distance

Expand Down
9 changes: 1 addition & 8 deletions cpp/bench/prims/fused_l2_nn.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
#include <limits>
#include <raft/distance/fused_l2_nn.cuh>
#include <raft/linalg/norm.cuh>
#include <raft/mr/device/allocator.hpp>
#include <raft/random/rng.cuh>

namespace MLCommon {
Expand All @@ -32,13 +31,7 @@ struct FLNParams {

template <typename T>
struct FusedL2NN : public Fixture {
FusedL2NN(const std::string& name, const FLNParams& p)
: Fixture(
name,
std::shared_ptr<raft::mr::device::allocator>(new raft::mr::device::default_allocator)),
params(p)
{
}
FusedL2NN(const std::string& name, const FLNParams& p) : Fixture(name), params(p) {}

protected:
void allocateBuffers(const ::benchmark::State& state) override
Expand Down
36 changes: 13 additions & 23 deletions cpp/bench/prims/gram_matrix.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
*/

#include <cuml/matrix/kernelparams.h>
#include <raft/linalg/cublas_wrappers.h>
#include <common/ml_benchmark.hpp>
#include <matrix/grammatrix.cuh>
#include <matrix/kernelfactory.cuh>
#include <memory>
#include <raft/mr/device/allocator.hpp>
#include <raft/random/rng.cuh>
#include <sstream>
#include <string>
Expand All @@ -42,10 +42,7 @@ struct GramTestParams {
template <typename T>
struct GramMatrix : public Fixture {
GramMatrix(const std::string& name, const GramTestParams& p)
: Fixture(
name,
std::shared_ptr<raft::mr::device::allocator>(new raft::mr::device::default_allocator)),
params(p)
: Fixture(name), params(p), A(0, stream), B(0, stream), C(0, stream)
{
std::vector<std::string> kernel_names{"linear", "poly", "rbf", "tanh"};
std::ostringstream oss;
Expand All @@ -63,31 +60,24 @@ struct GramMatrix : public Fixture {
protected:
void allocateBuffers(const ::benchmark::State& state) override
{
alloc(A, params.m * params.k);
alloc(B, params.k * params.n);
alloc(C, params.m * params.n);
A.resize(params.m * params.k, stream);
B.resize(params.k * params.n, stream);
C.resize(params.m * params.n, stream);
raft::random::Rng r(123456ULL);
r.uniform(A, params.m * params.k, T(-1.0), T(1.0), stream);
r.uniform(B, params.k * params.n, T(-1.0), T(1.0), stream);
}

void deallocateBuffers(const ::benchmark::State& state) override
{
dealloc(A, params.m * params.k);
dealloc(B, params.k * params.n);
dealloc(C, params.m * params.n);
r.uniform(A.data(), params.m * params.k, T(-1.0), T(1.0), stream);
r.uniform(B.data(), params.k * params.n, T(-1.0), T(1.0), stream);
}

void runBenchmark(::benchmark::State& state) override
{
if (!this->kernel) { state.SkipWithError("Kernel matrix is not initialized"); }
loopOnState(state, [this]() {
(*this->kernel)(this->A,
(*this->kernel)(A.data(),
this->params.m,
this->params.k,
this->B,
B.data(),
this->params.n,
this->C,
C.data(),
this->params.is_row_major,
this->stream);
});
Expand All @@ -98,9 +88,9 @@ struct GramMatrix : public Fixture {
std::unique_ptr<GramMatrixBase<T>> kernel;
GramTestParams params;

T* A; // input matrix A, size [m * k]
T* B; // input matrix B, size [n * k]
T* C; // output matrix C, size [m*n]
rmm::device_uvector<T> A; // input matrix A, size [m * k]
rmm::device_uvector<T> B; // input matrix B, size [n * k]
rmm::device_uvector<T> C; // output matrix C, size [m*n]
};

static std::vector<GramTestParams> getInputs()
Expand Down
25 changes: 7 additions & 18 deletions cpp/bench/prims/make_blobs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
*/

#include <common/ml_benchmark.hpp>
#include <raft/mr/device/allocator.hpp>
#include <random/make_blobs.cuh>

namespace MLCommon {
Expand All @@ -30,44 +29,34 @@ struct Params {
/**
 * Benchmark fixture that calls MLCommon::Random::make_blobs from inside
 * loopOnState, writing into a `data` buffer of `params.rows * params.cols`
 * elements of type T and a `labels` buffer of `params.rows` ints.
 *
 * NOTE(review): this span is a rendered diff hunk without +/- markers, so lines
 * from before and after the change are interleaved (two initializer lists,
 * duplicate `data` / `labels` declarations, two openings of the make_blobs
 * call). The raft::mr::device::allocator, alloc()/dealloc() and raw-pointer
 * lines look like the removed version; the rmm::device_uvector, resize() and
 * .data() lines look like the added one -- confirm against the commit.
 */
template <typename T>
struct MakeBlobs : public Fixture {
MakeBlobs(const std::string& name, const Params& p)
: Fixture(
name,
std::shared_ptr<raft::mr::device::allocator>(new raft::mr::device::default_allocator)),
params(p)
// `data` and `labels` are constructed with size 0 on the fixture's `stream`.
: Fixture(name), params(p), data(0, stream), labels(0, stream)
{
}

protected:
void allocateBuffers(const ::benchmark::State& state) override
{
alloc(data, params.rows * params.cols);
alloc(labels, params.rows);
}

void deallocateBuffers(const ::benchmark::State& state) override
{
dealloc(data, params.rows * params.cols);
dealloc(labels, params.rows);
// Resize the buffers to rows*cols values and rows labels on `stream`.
// NOTE(review): these two lines presumably belong to allocateBuffers in the
// post-change file (the header above appears to be a removed line) -- confirm.
data.resize(params.rows * params.cols, stream);
labels.resize(params.rows, stream);
}

void runBenchmark(::benchmark::State& state) override
{
loopOnState(state, [this]() {
MLCommon::Random::make_blobs(data,
labels,
MLCommon::Random::make_blobs(data.data(),
labels.data(),
params.rows,
params.cols,
params.clusters,
// NOTE(review): presumably dropped along with Fixture::d_alloc -- confirm.
this->d_alloc,
this->stream,
params.row_major);
});
}

private:
Params params;  // benchmark configuration; rows, cols, clusters and row_major are read above
T* data;
int* labels;
rmm::device_uvector<T> data;      // resized to params.rows * params.cols elements
rmm::device_uvector<int> labels;  // resized to params.rows elements
}; // struct MakeBlobs

static std::vector<Params> getInputs()
Expand Down
9 changes: 1 addition & 8 deletions cpp/bench/prims/map_then_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

#include <common/ml_benchmark.hpp>
#include <raft/linalg/map_then_reduce.cuh>
#include <raft/mr/device/allocator.hpp>

namespace MLCommon {
namespace Bench {
Expand All @@ -33,13 +32,7 @@ struct Identity {

template <typename T>
struct MapThenReduce : public Fixture {
MapThenReduce(const std::string& name, const Params& p)
: Fixture(
name,
std::shared_ptr<raft::mr::device::allocator>(new raft::mr::device::default_allocator)),
params(p)
{
}
MapThenReduce(const std::string& name, const Params& p) : Fixture(name), params(p) {}

protected:
void allocateBuffers(const ::benchmark::State& state) override
Expand Down
9 changes: 1 addition & 8 deletions cpp/bench/prims/matrix_vector_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

#include <common/ml_benchmark.hpp>
#include <raft/linalg/matrix_vector_op.cuh>
#include <raft/mr/device/allocator.hpp>

namespace MLCommon {
namespace Bench {
Expand All @@ -29,13 +28,7 @@ struct Params {

template <typename T>
struct MatVecOp : public Fixture {
MatVecOp(const std::string& name, const Params& p)
: Fixture(
name,
std::shared_ptr<raft::mr::device::allocator>(new raft::mr::device::default_allocator)),
params(p)
{
}
MatVecOp(const std::string& name, const Params& p) : Fixture(name), params(p) {}

protected:
void allocateBuffers(const ::benchmark::State& state) override
Expand Down
8 changes: 1 addition & 7 deletions cpp/bench/prims/permute.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,7 @@ struct Params {

template <typename T>
struct Permute : public Fixture {
Permute(const std::string& name, const Params& p)
: Fixture(
name,
std::shared_ptr<raft::mr::device::allocator>(new raft::mr::device::default_allocator)),
params(p)
{
}
Permute(const std::string& name, const Params& p) : Fixture(name), params(p) {}

protected:
void allocateBuffers(const ::benchmark::State& state) override
Expand Down
9 changes: 1 addition & 8 deletions cpp/bench/prims/reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

#include <common/ml_benchmark.hpp>
#include <raft/linalg/reduce.cuh>
#include <raft/mr/device/allocator.hpp>

namespace MLCommon {
namespace Bench {
Expand All @@ -29,13 +28,7 @@ struct Params {

template <typename T>
struct Reduce : public Fixture {
Reduce(const std::string& name, const Params& p)
: Fixture(
name,
std::shared_ptr<raft::mr::device::allocator>(new raft::mr::device::default_allocator)),
params(p)
{
}
Reduce(const std::string& name, const Params& p) : Fixture(name), params(p) {}

protected:
void allocateBuffers(const ::benchmark::State& state) override
Expand Down
Loading

0 comments on commit 269b7b1

Please sign in to comment.