Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Lang] Add element access for sparse matrix on CUDA #6250

Merged
merged 16 commits into from
Nov 7, 2022
Merged
21 changes: 9 additions & 12 deletions misc/test_gpu_sm_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,29 +22,26 @@
cols = N - 1
Hanke98 marked this conversation as resolved.
Show resolved Hide resolved

S1 = random(rows, cols, density=0.5, random_state=rng,
data_rvs=rvs).astype(np_dtype)
data_rvs=rvs).astype(np_dtype).tocoo()
S2 = random(rows, cols, density=0.5, random_state=rng,
data_rvs=rvs).astype(np_dtype)
data_rvs=rvs).astype(np_dtype).tocoo()

nnz_A = len(S1.data)
nnz_B = len(S2.data)

coo_S1 = scipy.sparse.coo_matrix(S1)
coo_S2 = scipy.sparse.coo_matrix(S2)

row_coo_A = ti.ndarray(shape=nnz_A, dtype=idx_dt)
col_coo_A = ti.ndarray(shape=nnz_A, dtype=idx_dt)
value_coo_A = ti.ndarray(shape=nnz_A, dtype=val_dt)
row_coo_A.from_numpy(coo_S1.row)
col_coo_A.from_numpy(coo_S1.col)
value_coo_A.from_numpy(coo_S1.data)
row_coo_A.from_numpy(S1.row)
col_coo_A.from_numpy(S1.col)
value_coo_A.from_numpy(S1.data)

row_coo_B = ti.ndarray(shape=nnz_B, dtype=idx_dt)
col_coo_B = ti.ndarray(shape=nnz_B, dtype=idx_dt)
value_coo_B = ti.ndarray(shape=nnz_B, dtype=val_dt)
row_coo_B.from_numpy(coo_S2.row)
col_coo_B.from_numpy(coo_S2.col)
value_coo_B.from_numpy(coo_S2.data)
row_coo_B.from_numpy(S2.row)
col_coo_B.from_numpy(S2.col)
value_coo_B.from_numpy(S2.data)

A = ti.linalg.SparseMatrix(n=rows, m=cols, dtype=ti.f32)
B = ti.linalg.SparseMatrix(n=rows, m=cols, dtype=ti.f32)
Expand All @@ -71,7 +68,7 @@
E = A * 2.5
print(E)
print('>>>> verify:')
print((2.5 * S1).A)
print((S1 * 2.5).A)
print('>>>> A.T:')
F = A.transpose()
print(F)
Expand Down
75 changes: 64 additions & 11 deletions taichi/program/sparse_matrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,12 @@ struct key_hash {
};

template <typename T, typename T1, typename T2>
void print_triplet_from_csr(int64_t n_rows,
int n_cols,
T *row,
T1 *col,
T2 *value,
std::ostringstream &ostr) {
void print_triplets_from_csr(int64_t n_rows,
int n_cols,
T *row,
T1 *col,
T2 *value,
std::ostringstream &ostr) {
using Triplets = Eigen::Triplet<T2>;
std::vector<Triplets> trips;
for (int64_t i = 1; i <= n_rows; ++i) {
Expand All @@ -57,6 +57,20 @@ void print_triplet_from_csr(int64_t n_rows,
ostr << Eigen::MatrixXf(m.cast<float>()).format(clean_fmt);
}

template <typename T, typename T1, typename T2>
T2 get_element_from_csr(int row,
int col,
T *row_data,
T1 *col_data,
T2 *value) {
for (T i = row_data[row]; i < row_data[row + 1]; ++i) {
strongoier marked this conversation as resolved.
Show resolved Hide resolved
if (col == col_data[i])
return value[i];
}
// zero entry
return 0;
}

} // namespace

namespace taichi::lang {
Expand Down Expand Up @@ -433,7 +447,7 @@ std::unique_ptr<SparseMatrix> CuSparseMatrix::addition(
std::unique_ptr<SparseMatrix> CuSparseMatrix::matmul(
const CuSparseMatrix &other) const {
#if defined(TI_WITH_CUDA)
return gemm(other, 1.0f, 1.0f);
return gemm(other, 1.0f, 0.0f);
#else
TI_NOT_IMPLEMENTED;
return std::unique_ptr<SparseMatrix>();
Expand All @@ -446,7 +460,7 @@ std::unique_ptr<SparseMatrix> CuSparseMatrix::gemm(const CuSparseMatrix &other,
const float alpha,
const float beta) const {
#if defined(TI_WITH_CUDA)
cusparseHandle_t handle;
cusparseHandle_t handle = nullptr;
CUSPARSEDriver::get_instance().cpCreate(&handle);
cusparseOperation_t op_A = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseOperation_t op_B = CUSPARSE_OPERATION_NON_TRANSPOSE;
Expand All @@ -468,7 +482,7 @@ std::unique_ptr<SparseMatrix> CuSparseMatrix::gemm(const CuSparseMatrix &other,
CUSPARSEDriver::get_instance().cpCreateSpGEMM(&spgemm_desc);

// 3. ask buffer_size1 bytes for external memory
void *d_buffer1;
void *d_buffer1 = nullptr;
size_t buffer_size1 = 0;
CUSPARSEDriver::get_instance().cpSpGEMM_workEstimation(
handle, op_A, op_B, &alpha, this->matrix_, other.matrix_, &beta, mat_C,
Expand All @@ -486,7 +500,7 @@ std::unique_ptr<SparseMatrix> CuSparseMatrix::gemm(const CuSparseMatrix &other,
CUSPARSEDriver::get_instance().cpSpGEMM_compute(
handle, op_A, op_B, &alpha, mat_A, mat_B, &beta, mat_C, CUDA_R_32F,
CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size2, nullptr);
void *d_buffer2;
void *d_buffer2 = nullptr;
CUDADriver::get_instance().malloc((void **)&d_buffer2, buffer_size2);

// 6. compute the intermediate product of A * B
Expand Down Expand Up @@ -647,9 +661,48 @@ const std::string CuSparseMatrix::to_string() const {
CUDADriver::get_instance().memcpy_device_to_host((void *)hV, (void *)dV,
(nnz) * sizeof(float));

print_triplet_from_csr<int, int, float>(rows, cols, hR, hC, hV, ostr);
print_triplets_from_csr<int, int, float>(rows, cols, hR, hC, hV, ostr);
delete[] hR;
delete[] hC;
delete[] hV;
#endif
return ostr.str();
}

float CuSparseMatrix::get_element(int row, int col) const {
float res = 0.0f;
#ifdef TI_WITH_CUDA
size_t rows, cols, nnz;
float *dR;
int *dC, *dV;
cusparseIndexType_t row_type, column_type;
cusparseIndexBase_t idx_base;
cudaDataType value_type;
CUSPARSEDriver::get_instance().cpCsrGet(
matrix_, &rows, &cols, &nnz, (void **)&dR, (void **)&dC, (void **)&dV,
&row_type, &column_type, &idx_base, &value_type);

TI_ASSERT(row < rows);
TI_ASSERT(col < cols);

auto *hR = new int[rows + 1];
auto *hC = new int[nnz];
auto *hV = new float[nnz];

CUDADriver::get_instance().memcpy_device_to_host((void *)hR, (void *)dR,
(rows + 1) * sizeof(int));
CUDADriver::get_instance().memcpy_device_to_host((void *)hC, (void *)dC,
(nnz) * sizeof(int));
CUDADriver::get_instance().memcpy_device_to_host((void *)hV, (void *)dV,
(nnz) * sizeof(float));

res = get_element_from_csr<int, int, float>(row, col, hR, hC, hV);

delete[] hR;
delete[] hC;
delete[] hV;
#endif // TI_WITH_CUDA
return res;
}

} // namespace taichi::lang
2 changes: 2 additions & 0 deletions taichi/program/sparse_matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,8 @@ class CuSparseMatrix : public SparseMatrix {
return &matrix_;
};

float get_element(int row, int col) const;

const std::string to_string() const override;

void *get_row_ptr() const {
Expand Down
1 change: 1 addition & 0 deletions taichi/python/export_lang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1262,6 +1262,7 @@ void export_lang(py::module &m) {
.def(float32() * py::self)
.def("matmul", &CuSparseMatrix::matmul)
.def("transpose", &CuSparseMatrix::transpose)
.def("get_element", &CuSparseMatrix::get_element)
.def("to_string", &CuSparseMatrix::to_string);

py::class_<SparseSolver>(m, "SparseSolver")
Expand Down
80 changes: 79 additions & 1 deletion tests/python/test_sparse_matrix.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import numpy as np
import pytest

import taichi as ti
Expand Down Expand Up @@ -379,6 +378,7 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(),

@test_utils.test(arch=ti.cuda)
def test_gpu_sparse_matrix():
import numpy as np
h_coo_row = np.asarray([1, 0, 0, 0, 2, 2, 2, 3, 3], dtype=np.int32)
h_coo_col = np.asarray([1, 0, 2, 3, 0, 2, 3, 1, 3], dtype=np.int32)
h_coo_val = np.asarray([4.0, 1.0, 2.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0],
Expand Down Expand Up @@ -414,3 +414,81 @@ def test_gpu_sparse_matrix():
A.spmv(X, Y)
for i in range(4):
assert Y[i] == h_Y[i]


@pytest.mark.parametrize('N', [5])
@test_utils.test(arch=ti.cuda)
def test_gpu_sparse_matrix_ops(N):
import numpy as np
from numpy.random import default_rng
from scipy import stats
from scipy.sparse import coo_matrix, random

seed = 2
np.random.seed(seed)
rng = default_rng(seed)
rvs = stats.poisson(3, loc=1).rvs
np_dtype = np.float32
idx_dt = ti.int32
val_dt = ti.float32

n_rows = N
n_cols = N - 1

S1 = random(n_rows, n_cols, density=0.5, random_state=rng,
data_rvs=rvs).astype(np_dtype).tocoo()
S2 = random(n_rows, n_cols, density=0.5, random_state=rng,
data_rvs=rvs).astype(np_dtype).tocoo()

nnz_A = len(S1.data)
nnz_B = len(S2.data)

row_coo_A = ti.ndarray(shape=nnz_A, dtype=idx_dt)
col_coo_A = ti.ndarray(shape=nnz_A, dtype=idx_dt)
value_coo_A = ti.ndarray(shape=nnz_A, dtype=val_dt)
row_coo_B = ti.ndarray(shape=nnz_B, dtype=idx_dt)
col_coo_B = ti.ndarray(shape=nnz_B, dtype=idx_dt)
value_coo_B = ti.ndarray(shape=nnz_B, dtype=val_dt)

row_coo_A.from_numpy(S1.row)
col_coo_A.from_numpy(S1.col)
value_coo_A.from_numpy(S1.data)

row_coo_B.from_numpy(S2.row)
col_coo_B.from_numpy(S2.col)
value_coo_B.from_numpy(S2.data)

A = ti.linalg.SparseMatrix(n=n_rows, m=n_cols, dtype=ti.f32)
B = ti.linalg.SparseMatrix(n=n_rows, m=n_cols, dtype=ti.f32)
A.build_coo(row_coo_A, col_coo_A, value_coo_A)
B.build_coo(row_coo_B, col_coo_B, value_coo_B)

def verify(scipy_spm, taichi_spm):
scipy_spm = scipy_spm.tocoo()
for i, j, v in zip(scipy_spm.row, scipy_spm.col, scipy_spm.data):
# assert v == taichi_spm[i, j]
Hanke98 marked this conversation as resolved.
Show resolved Hide resolved
assert v == test_utils.approx(taichi_spm[i, j], rel=1e-5)

C = A + B
S3 = S1 + S2
verify(S3, C)

D = C - A
S4 = S3 - S1
verify(S4, D)

E = A * 2.5
S5 = S1 * 2.5
verify(S5, E)
Hanke98 marked this conversation as resolved.
Show resolved Hide resolved

F = A * 2.5
S6 = S1 * 2.5
verify(S6, F)

G = A.transpose()
S7 = S1.T
verify(S7, G)

H = A @ B.transpose()
S8 = S1 @ S2.T
verify(S8, H)