Skip to content

Commit

Permalink
Free Cublas GPU memory
Browse files Browse the repository at this point in the history
I have corrected PR ggerganov#5576, which caused a crash, and streamlined the code.
Unfortunately, this does not yet free all of the occupied GPU memory (only about 15% of it). We still need to identify the remaining objects that are not released when the GPU memory is freed.
  • Loading branch information
zsogitbe committed Mar 6, 2024
1 parent 8ced9f7 commit 5a790a3
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 4 deletions.
21 changes: 17 additions & 4 deletions ggml-cuda.cu
Expand Up @@ -8751,10 +8751,11 @@ GGML_CALL bool ggml_cublas_loaded(void) {
return g_cublas_loaded;
}

static bool g_cublas_initialized = false;

GGML_CALL void ggml_init_cublas() {
static bool initialized = false;

if (!initialized) {
if (!g_cublas_initialized) {

#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
Expand All @@ -8764,7 +8765,7 @@ GGML_CALL void ggml_init_cublas() {
#endif

if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
initialized = true;
g_cublas_initialized = true;
g_cublas_loaded = false;
fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
return;
Expand Down Expand Up @@ -8835,7 +8836,7 @@ GGML_CALL void ggml_init_cublas() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));

initialized = true;
g_cublas_initialized = true;
g_cublas_loaded = true;
}
}
Expand Down Expand Up @@ -12490,3 +12491,15 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
}
return device_count;
}

extern "C" GGML_CALL void ggml_free_cublas(void);

// Release the cuBLAS handles created by ggml_init_cublas so their GPU-side
// workspace memory is returned to the driver. Safe to call multiple times
// and safe to call if initialization never happened (or failed): it is a
// no-op unless a successful init is pending teardown. After this call,
// ggml_init_cublas() may be invoked again to re-initialize.
GGML_CALL void ggml_free_cublas(void) {
#ifdef GGML_USE_CUBLAS
    // Guard: if init never ran (or bailed out early), g_device_count and the
    // handle array are not meaningful — destroying them would crash or raise
    // CUBLAS_STATUS_NOT_INITIALIZED. This also makes double-free harmless.
    if (!g_cublas_initialized) {
        return;
    }
    for (int id = 0; id < g_device_count; ++id) {
        // Skip slots that were never populated (partial init) or were
        // already destroyed; cublasDestroy on a null handle is an error.
        if (g_cublas_handles[id] != nullptr) {
            CUBLAS_CHECK(cublasDestroy(g_cublas_handles[id]));
            g_cublas_handles[id] = nullptr;
        }
    }
    // Allow a subsequent ggml_init_cublas() to run its one-time setup again.
    g_cublas_initialized = false;
#endif
}
3 changes: 3 additions & 0 deletions ggml-cuda.h
Expand Up @@ -17,6 +17,9 @@ extern "C" {

#define GGML_CUDA_MAX_DEVICES 16

// Release CUDA resources
GGML_API GGML_CALL void ggml_free_cublas(void);

// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
GGML_API GGML_CALL void ggml_init_cublas(void);

Expand Down

0 comments on commit 5a790a3

Please sign in to comment.