Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[lang] Reorder sparse matrix before solving #6886

Merged
merged 7 commits into from
Dec 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion python/taichi/examples/simulation/stable_fluid.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
help='The arch (backend) to run this example on')
args, unknowns = parser.parse_known_args()

res = 256
res = 512
dt = 0.03
p_jacobi_iters = 500 # 40 for a quicker but less accurate result
f_strength = 10000.0
Expand Down
153 changes: 135 additions & 18 deletions taichi/program/sparse_solver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,9 +199,11 @@ void CuSparseSolver::analyze_pattern(const SparseMatrix &sm) {
SparseMatrix *sm_no_cv = const_cast<SparseMatrix *>(&sm);
CuSparseMatrix *A = dynamic_cast<CuSparseMatrix *>(sm_no_cv);
size_t rowsA = A->num_rows();
size_t colsA = A->num_cols();
size_t nnzA = A->get_nnz();
void *d_csrRowPtrA = A->get_row_ptr();
void *d_csrColIndA = A->get_col_ind();
void *d_csrValA = A->get_val_ptr();
CUSOLVERDriver::get_instance().csSpCreate(&cusolver_handle_);
CUSPARSEDriver::get_instance().cpCreate(&cusparse_handel_);
CUSPARSEDriver::get_instance().cpCreateMatDescr(&descr_);
Expand All @@ -210,12 +212,68 @@ void CuSparseSolver::analyze_pattern(const SparseMatrix &sm) {
CUSPARSEDriver::get_instance().cpSetMatIndexBase(descr_,
CUSPARSE_INDEX_BASE_ZERO);

// step 1: create opaque info structure
// step 1: reorder the sparse matrix
float *h_csrValA = nullptr;
h_Q = (int *)malloc(sizeof(int) * colsA);
h_csrRowPtrB = (int *)malloc(sizeof(int) * (rowsA + 1));
h_csrColIndB = (int *)malloc(sizeof(int) * nnzA);
h_csrValB = (float *)malloc(sizeof(float) * nnzA);
h_csrValA = (float *)malloc(sizeof(float) * nnzA);
h_mapBfromA = (int *)malloc(sizeof(int) * nnzA);
assert(nullptr != h_Q);
assert(nullptr != h_csrRowPtrB);
assert(nullptr != h_csrColIndB);
assert(nullptr != h_csrValB);
assert(nullptr != h_mapBfromA);

CUDADriver::get_instance().memcpy_device_to_host(h_csrRowPtrB, d_csrRowPtrA,
sizeof(int) * (rowsA + 1));
CUDADriver::get_instance().memcpy_device_to_host(h_csrColIndB, d_csrColIndA,
sizeof(int) * nnzA);
CUDADriver::get_instance().memcpy_device_to_host(h_csrValA, d_csrValA,
sizeof(float) * nnzA);

// compoute h_Q
CUSOLVERDriver::get_instance().csSpXcsrsymamdHost(
cusolver_handle_, rowsA, nnzA, descr_, h_csrRowPtrB, h_csrColIndB, h_Q);
CUDADriver::get_instance().malloc((void **)&d_Q, sizeof(int) * colsA);
CUDADriver::get_instance().memcpy_host_to_device((void *)d_Q, (void *)h_Q,
sizeof(int) * (colsA));
size_t size_perm = 0;
CUSOLVERDriver::get_instance().csSpXcsrperm_bufferSizeHost(
cusolver_handle_, rowsA, colsA, nnzA, descr_, h_csrRowPtrB, h_csrColIndB,
h_Q, h_Q, &size_perm);
void *buffer_cpu = (void *)malloc(sizeof(char) * size_perm);
assert(nullptr != buffer_cpu);
for (int j = 0; j < nnzA; j++) {
h_mapBfromA[j] = j;
}
CUSOLVERDriver::get_instance().csSpXcsrpermHost(
cusolver_handle_, rowsA, colsA, nnzA, descr_, h_csrRowPtrB, h_csrColIndB,
h_Q, h_Q, h_mapBfromA, buffer_cpu);
// B = A( mapBfromA )
for (int j = 0; j < nnzA; j++) {
h_csrValB[j] = h_csrValA[h_mapBfromA[j]];
}
CUDADriver::get_instance().malloc((void **)&d_csrRowPtrB,
sizeof(int) * (rowsA + 1));
CUDADriver::get_instance().malloc((void **)&d_csrColIndB, sizeof(int) * nnzA);
CUDADriver::get_instance().malloc((void **)&d_csrValB, sizeof(float) * nnzA);
CUDADriver::get_instance().memcpy_host_to_device(
(void *)d_csrRowPtrB, (void *)h_csrRowPtrB, sizeof(int) * (rowsA + 1));
CUDADriver::get_instance().memcpy_host_to_device(
(void *)d_csrColIndB, (void *)h_csrColIndB, sizeof(int) * nnzA);
CUDADriver::get_instance().memcpy_host_to_device(
(void *)d_csrValB, (void *)h_csrValB, sizeof(float) * nnzA);
free(h_csrValA);
free(buffer_cpu);

// step 2: create opaque info structure
CUSOLVERDriver::get_instance().csSpCreateCsrcholInfo(&info_);

// step 2: analyze chol(A) to know structure of L
// step 3: analyze chol(A) to know structure of L
CUSOLVERDriver::get_instance().csSpXcsrcholAnalysis(
cusolver_handle_, rowsA, nnzA, descr_, d_csrRowPtrA, d_csrColIndA, info_);
cusolver_handle_, rowsA, nnzA, descr_, d_csrRowPtrB, d_csrColIndB, info_);
is_analyzed_ = true;
#else
TI_NOT_IMPLEMENTED
Expand All @@ -229,26 +287,23 @@ void CuSparseSolver::factorize(const SparseMatrix &sm) {
CuSparseMatrix *A = dynamic_cast<CuSparseMatrix *>(sm_no_cv);
size_t rowsA = A->num_rows();
size_t nnzA = A->get_nnz();
void *d_csrRowPtrA = A->get_row_ptr();
void *d_csrColIndA = A->get_col_ind();
void *d_csrValA = A->get_val_ptr();

size_t size_internal = 0;
size_t size_chol = 0; // size of working space for csrlu
// step 1: workspace for chol(A)
CUSOLVERDriver::get_instance().csSpScsrcholBufferInfo(
cusolver_handle_, rowsA, nnzA, descr_, d_csrValA, d_csrRowPtrA,
d_csrColIndA, info_, &size_internal, &size_chol);
cusolver_handle_, rowsA, nnzA, descr_, d_csrValB, d_csrRowPtrB,
d_csrColIndB, info_, &size_internal, &size_chol);

if (size_chol > 0)
CUDADriver::get_instance().malloc(&gpu_buffer_, sizeof(char) * size_chol);

// step 2: compute A = L*L^T
CUSOLVERDriver::get_instance().csSpScsrcholFactor(
cusolver_handle_, rowsA, nnzA, descr_, d_csrValA, d_csrRowPtrA,
d_csrColIndA, info_, gpu_buffer_);
cusolver_handle_, rowsA, nnzA, descr_, d_csrValB, d_csrRowPtrB,
d_csrColIndB, info_, gpu_buffer_);
// step 3: check if the matrix is singular
const double tol = 1.e-14;
const float tol = 1.e-14;
int singularity = 0;
CUSOLVERDriver::get_instance().csSpScsrcholZeroPivot(cusolver_handle_, info_,
tol, &singularity);
Expand Down Expand Up @@ -439,16 +494,46 @@ void CuSparseSolver::solve_rf(Program *prog,
SparseMatrix *sm_no_cv = const_cast<SparseMatrix *>(&sm);
CuSparseMatrix *A = dynamic_cast<CuSparseMatrix *>(sm_no_cv);
size_t rowsA = A->num_rows();
size_t colsA = A->num_cols();
size_t d_b = prog->get_ndarray_data_ptr_as_int(&b);
size_t d_x = prog->get_ndarray_data_ptr_as_int(&x);
CUSOLVERDriver::get_instance().csSpScsrcholSolve(
cusolver_handle_, rowsA, (void *)d_b, (void *)d_x, info_, gpu_buffer_);

// TODO: free allocated memory and handles
// CUDADriver::get_instance().mem_free(gpu_buffer_);
// CUSOLVERDriver::get_instance().csSpDestory(cusolver_handle_);
// CUSPARSEDriver::get_instance().cpDestroy(cusparse_handel_);
// CUSPARSEDriver::get_instance().cpDestroyMatDescr(descrA);
// step 1: d_Qb = Q * b
void *d_Qb = nullptr;
CUDADriver::get_instance().malloc(&d_Qb, sizeof(float) * rowsA);
cusparseDnVecDescr_t vec_b;
cusparseSpVecDescr_t vec_Qb;
CUSPARSEDriver::get_instance().cpCreateDnVec(&vec_b, (int)rowsA, (void *)d_b,
CUDA_R_32F);
CUSPARSEDriver::get_instance().cpCreateSpVec(
&vec_Qb, (int)rowsA, (int)rowsA, d_Q, d_Qb, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
CUSPARSEDriver::get_instance().cpGather(cusparse_handel_, vec_b, vec_Qb);

// step 2: solve B*z = Q*b using cholesky solver
void *d_z = nullptr;
CUDADriver::get_instance().malloc(&d_z, sizeof(float) * colsA);
CUSOLVERDriver::get_instance().csSpScsrcholSolve(
cusolver_handle_, rowsA, (void *)d_Qb, (void *)d_z, info_, gpu_buffer_);

// step 3: Q*x = z
cusparseSpVecDescr_t vecX;
cusparseDnVecDescr_t vecY;
CUSPARSEDriver::get_instance().cpCreateSpVec(
&vecX, (int)colsA, (int)colsA, d_Q, d_z, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
CUSPARSEDriver::get_instance().cpCreateDnVec(&vecY, (int)colsA, (void *)d_x,
CUDA_R_32F);
CUSPARSEDriver::get_instance().cpScatter(cusparse_handel_, vecX, vecY);

if (d_Qb != nullptr)
CUDADriver::get_instance().mem_free(d_Qb);
if (d_z != nullptr)
CUDADriver::get_instance().mem_free(d_z);
CUSPARSEDriver::get_instance().cpDestroySpVec(vec_Qb);
CUSPARSEDriver::get_instance().cpDestroyDnVec(vec_b);
CUSPARSEDriver::get_instance().cpDestroySpVec(vecX);
CUSPARSEDriver::get_instance().cpDestroyDnVec(vecY);
#else
TI_NOT_IMPLEMENTED
#endif
Expand Down Expand Up @@ -492,6 +577,38 @@ std::unique_ptr<SparseSolver> make_sparse_solver(DataType dt,
TI_ERROR("Not supported sparse solver type: {}", solver_type);
}

CuSparseSolver::~CuSparseSolver() {
#if defined(TI_WITH_CUDA)
if (h_Q != nullptr)
free(h_Q);
if (h_csrRowPtrB != nullptr)
free(h_csrRowPtrB);
if (h_csrColIndB != nullptr)
free(h_csrColIndB);
if (h_csrValB != nullptr)
free(h_csrValB);
if (h_mapBfromA != nullptr)
free(h_mapBfromA);
if (info_ != nullptr)
CUSOLVERDriver::get_instance().csSpDestroyCsrcholInfo(info_);
if (cusolver_handle_ != nullptr)
CUSOLVERDriver::get_instance().csSpDestory(cusolver_handle_);
if (cusparse_handel_ != nullptr)
CUSPARSEDriver::get_instance().cpDestroy(cusparse_handel_);
if (descr_ != nullptr)
CUSPARSEDriver::get_instance().cpDestroyMatDescr(descr_);
if (gpu_buffer_ != nullptr)
CUDADriver::get_instance().mem_free(gpu_buffer_);
if (d_Q != nullptr)
CUDADriver::get_instance().mem_free(d_Q);
if (d_csrRowPtrB != nullptr)
CUDADriver::get_instance().mem_free(d_csrRowPtrB);
if (d_csrColIndB != nullptr)
CUDADriver::get_instance().mem_free(d_csrColIndB);
if (d_csrValB != nullptr)
CUDADriver::get_instance().mem_free(d_csrValB);
#endif
}
std::unique_ptr<SparseSolver> make_cusparse_solver(
DataType dt,
const std::string &solver_type,
Expand Down
11 changes: 10 additions & 1 deletion taichi/program/sparse_solver.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,18 @@ class CuSparseSolver : public SparseSolver {
bool is_analyzed_{false};
bool is_factorized_{false};

int *h_Q{nullptr}; /* <int> n, B = Q*A*Q' or B = A(Q,Q) by MATLAB notation */
int *d_Q{nullptr};
int *h_csrRowPtrB{nullptr}; /* <int> n+1 */
int *h_csrColIndB{nullptr}; /* <int> nnzA */
float *h_csrValB{nullptr}; /* <float> nnzA */
int *h_mapBfromA{nullptr}; /* <int> nnzA */
int *d_csrRowPtrB{nullptr}; /* <int> n+1 */
int *d_csrColIndB{nullptr}; /* <int> nnzA */
float *d_csrValB{nullptr}; /* <float> nnzA */
public:
CuSparseSolver();
~CuSparseSolver() override = default;
~CuSparseSolver() override;
bool compute(const SparseMatrix &sm) override {
TI_NOT_IMPLEMENTED;
};
Expand Down
2 changes: 2 additions & 0 deletions taichi/rhi/cuda/cusolver_functions.inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@ PER_CUSOLVER_FUNCTION(csSpCreate, cusolverSpCreate, cusolverSpHandle_t * );
PER_CUSOLVER_FUNCTION(csSpDestory, cusolverSpDestroy, cusolverSpHandle_t );
PER_CUSOLVER_FUNCTION(csSpXcsrissymHost, cusolverSpXcsrissymHost, cusolverSpHandle_t,int ,int ,const cusparseMatDescr_t ,const void *,const void *,const void *,void *);
PER_CUSOLVER_FUNCTION(csSpXcsrsymrcmHost, cusolverSpXcsrsymrcmHost, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,const void *,const void *,void *);
PER_CUSOLVER_FUNCTION(csSpXcsrsymamdHost, cusolverSpXcsrsymamdHost, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,const void *,const void *,void *);
PER_CUSOLVER_FUNCTION(csSpXcsrperm_bufferSizeHost, cusolverSpXcsrperm_bufferSizeHost, cusolverSpHandle_t ,int ,int ,int ,const cusparseMatDescr_t ,void *,void *,const void *,const void *,size_t *);
PER_CUSOLVER_FUNCTION(csSpXcsrpermHost, cusolverSpXcsrpermHost, cusolverSpHandle_t ,int ,int ,int ,const cusparseMatDescr_t ,void *,void *,const void *,const void *,void *,void *);
PER_CUSOLVER_FUNCTION(csSpScsrlsvcholHost, cusolverSpScsrlsvcholHost, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,const void *,const void *,const void *,const void *,float ,int ,void *,void *);

// cusolver preview API
PER_CUSOLVER_FUNCTION(csSpCreateCsrcholInfo, cusolverSpCreateCsrcholInfo, csrcholInfo_t*);
PER_CUSOLVER_FUNCTION(csSpDestroyCsrcholInfo, cusolverSpDestroyCsrcholInfo, csrcholInfo_t);
PER_CUSOLVER_FUNCTION(csSpXcsrcholAnalysis, cusolverSpXcsrcholAnalysis, cusolverSpHandle_t,int ,int ,const cusparseMatDescr_t , void *, void *,csrcholInfo_t );
PER_CUSOLVER_FUNCTION(csSpScsrcholBufferInfo, cusolverSpScsrcholBufferInfo, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,void *,void *,void *,csrcholInfo_t ,size_t *,size_t *);
PER_CUSOLVER_FUNCTION(csSpScsrcholFactor, cusolverSpScsrcholFactor, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,void *,void *,void *,csrcholInfo_t ,void *);
Expand Down
2 changes: 2 additions & 0 deletions taichi/rhi/cuda/cusparse_functions.inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ PER_CUSPARSE_FUNCTION(cpCreateIdentityPermutation, cusparseCreateIdentityPermuta
PER_CUSPARSE_FUNCTION(cpXcoosort_bufferSizeExt, cusparseXcoosort_bufferSizeExt, cusparseHandle_t,int ,int,int, void* ,void* ,void*);
PER_CUSPARSE_FUNCTION(cpXcoosortByRow, cusparseXcoosortByRow, cusparseHandle_t,int,int,int,void* ,void* ,void* ,void*);
PER_CUSPARSE_FUNCTION(cpGather, cusparseGather, cusparseHandle_t, cusparseDnVecDescr_t, cusparseSpVecDescr_t);
PER_CUSPARSE_FUNCTION(cpScatter, cusparseScatter, cusparseHandle_t, cusparseSpVecDescr_t, cusparseDnVecDescr_t);
PER_CUSPARSE_FUNCTION(cpSsctr, cusparseSsctr, cusparseHandle_t, int, void*, void*, void*, cusparseIndexBase_t);
PER_CUSPARSE_FUNCTION(cpSetPointerMode, cusparseSetPointerMode, cusparseHandle_t, cusparsePointerMode_t);
PER_CUSPARSE_FUNCTION(cpCsrSetPointers, cusparseCsrSetPointers, cusparseSpMatDescr_t,void*, void*, void*);

Expand Down