ggml-qnn: bug free and update comments according to the refined ggml-backend-subsystem (#217)
zhouwg committed May 31, 2024
1 parent bee4a4b commit 0d05e7e
Showing 4 changed files with 24 additions and 50 deletions.
4 changes: 2 additions & 2 deletions cdeosplayer/kantv/build.gradle
@@ -14,7 +14,7 @@ android {
     externalNativeBuild {
         cmake {
             //modify to -DCMAKE_BUILD_TYPE=Release before prepare release apk
-            arguments += "-DCMAKE_BUILD_TYPE=Release"
+            arguments += "-DCMAKE_BUILD_TYPE=Debug"
             //weiguo:2024-05-28, added for fix issue in this PR:https://github.com/zhouwg/kantv/pull/204
             arguments += "-DCMAKE_ANDROID_STL_TYPE=c++_shared"
             arguments += "-DANDROID_STL=c++_shared"
@@ -69,7 +69,7 @@ android {
     }
 
     productFlavors {
-        all32 { minSdkVersion project.ext.minSdkVersion }
+        //all32 { minSdkVersion project.ext.minSdkVersion }
        all64 { minSdkVersion project.ext.minSdkVersion }
     }
     sourceSets {
13 changes: 2 additions & 11 deletions core/ggml/jni/ggml-jni-impl-external.cpp
@@ -1348,21 +1348,12 @@ int llama_inference(const char * model_path, const char * prompt, int bench_typ
 
     ggml_jni_llama_print_timings(ctx);
     if (ctx_guidance) {
-        LOGGD("here");
         llama_free(ctx_guidance);
     }
-    LOGGD("here");
-    //TODO:crash here on Xiaomi 14 and memory leak after comment it
-    //llama_free(ctx); //TODO:
-    LOGGD("here");
-    //TODO:crash here on Xiaomi 14 and memory leak after comment it
-    //llama_free_model(model);
-    LOGGD("here");
-
+    llama_free(ctx);
+    llama_free_model(model);
     llama_sampling_free(ctx_sampling);
-    LOGGD("here");
     llama_backend_free();
-    LOGGD("here");
 
     return 0;
 }
11 changes: 2 additions & 9 deletions core/ggml/jni/llm-inference.cpp
@@ -989,15 +989,8 @@ int llama_inference_main(int argc, char ** argv, int backend) {
     write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
     if (ctx_guidance) { llama_free(ctx_guidance); }
-
-    LOGGD("here");
-    //TODO:crash here on Xiaomi 14 and memory leak after comment it
-    //llama_free(ctx);
-    LOGGD("here");
-    //TODO:crash here on Xiaomi 14 and memory leak after comment it
-    //llama_free_model(model);
-    LOGGD("here");
-
+    llama_free(ctx);
+    llama_free_model(model);
     llama_sampling_free(ctx_sampling);
     llama_backend_free();
 
46 changes: 18 additions & 28 deletions core/ggml/llamacpp/ggml-qnn.cpp
Expand Up @@ -124,7 +124,7 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const
#define BUF_CONTROL_BASE 0xEE000000

#define GGML_QNN_DEBUG 1 //for troubleshooting QNN backend, should be changed to 0 in product envs
#define NOT_IN_PR 1 //for update PR(https://github.com/ggerganov/llama.cpp/pull/6869) in upstream easily and quickly
#define NOT_IN_PR 0 //for update PR(https://github.com/ggerganov/llama.cpp/pull/6869) in upstream easily and quickly


#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
@@ -892,7 +892,7 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) {
         *scales = (float *)malloc(scaleSize);
         memscpy(*scales, scaleSize, srcQParam.bwAxisScaleOffsetEncoding.scales, scaleSize);
 
-        // Only copy offsets if present, nullptr implies all offsets are 0
+        // only copy offsets if present, nullptr implies all offsets are 0
         if (bwAxisScaleOffset.offsets != nullptr) {
             size_t offsetSize = bwAxisScaleOffset.numElements * sizeof(int32_t);
             *offsets = (int32_t *)malloc(offsetSize);
@@ -903,7 +903,7 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) {
         QNN_TENSOR_SET_QUANT_PARAMS(dst, srcQParam);
     }
 
-    // need to allocate and copy memory for all the pointer members
+    // allocate and copy memory for all the pointer members
     uint32_t rank = QNN_TENSOR_GET_RANK(src);
     QNN_TENSOR_SET_RANK(dst, rank);
     size_t dim_size = rank * sizeof(uint32_t);
@@ -922,20 +922,8 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) {
 static int free_qnn_tensor(Qnn_Tensor_t & tensor) {
     int err = 0;
     VALIDATE_TENSOR_VERSION(tensor, err);
-
-    if (nullptr == QNN_TENSOR_GET_NAME(tensor)) {
-        QNN_LOG_INFO("it should not happen, pls check");
-    } else {
-        //QNN_LOG_DEBUG("QNN tensor name %s", QNN_TENSOR_GET_NAME(tensor));
-        free((void *) QNN_TENSOR_GET_NAME(tensor));
-    }
-    if (nullptr == QNN_TENSOR_GET_DIMENSIONS(tensor)) {
-        QNN_LOG_INFO("it should not happen, pls check");
-    } else {
-        //TODO:why crash in here? why pointer changed with mul_mat?
-        //memory leak after comment below line
-        //free(QNN_TENSOR_GET_DIMENSIONS(tensor));
-    }
+    free((void *) QNN_TENSOR_GET_NAME(tensor));
+    free(QNN_TENSOR_GET_DIMENSIONS(tensor));
 
     return err;
 }
@@ -3521,6 +3509,7 @@ struct ggml_backend_qnn_buffer_context {
         if (buffer) {
             free(buffer);
         }
+
         for (auto * sub_buffer : sub_buffers) {
             free(sub_buffer);
         }
@@ -3530,16 +3519,6 @@ struct ggml_backend_qnn_buffer_context {
             free(qnn_tensor);
         }
 
-        std::map<std::string, std::tuple<Qnn_GraphHandle_t, Qnn_Tensor_t *, Qnn_Tensor_t *, Qnn_Tensor_t *>>::iterator graph_it;
-        struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *) g_qnn_backend->context;
-        QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->instance->get_qnn_raw_interface();
-        for (graph_it = backend_ctx->instance->_qnn_graph_map.begin(); graph_it != backend_ctx->instance->_qnn_graph_map.end(); graph_it++) {
-            auto & graph_item = graph_it->second;
-            Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item);
-            QNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str());
-        }
-        backend_ctx->instance->_qnn_graph_map.clear();
-
         sub_buffers.clear();
         qnn_tensors.clear();
     }
@@ -3771,6 +3750,15 @@ static void ggml_backend_qnn_free(ggml_backend_t backend) {
 
     qnn_instance * instance = (qnn_instance*)g_qnn_mgr[ctx->device].instance;
     if (instance != nullptr) {
+        std::map<std::string, std::tuple<Qnn_GraphHandle_t, Qnn_Tensor_t *, Qnn_Tensor_t *, Qnn_Tensor_t *>>::iterator graph_it;
+        QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->instance->get_qnn_raw_interface();
+        for (graph_it = instance->_qnn_graph_map.begin(); graph_it != instance->_qnn_graph_map.end(); graph_it++) {
+            auto & graph_item = graph_it->second;
+            Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item);
+            QNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str());
+        }
+        instance->_qnn_graph_map.clear();
+
         instance->qnn_finalize();
         delete instance;
         g_qnn_mgr[ctx->device].instance = nullptr;
@@ -3832,7 +3820,9 @@ static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tens
 //note: this function be used in new/proposal/refined ggml backend subsystem:
 // https://github.com/zhouwg/kantv/pull/216 in this project
 // https://github.com/ggerganov/llama.cpp/pull/7641 in upstream
-// new ggml backend can following this style for mixed inference between CPU&GPU / CPU&NPU very easily
+//
+// new ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true)
+// can following this style for mixed inference between CPU&GPU / CPU&NPU very easily
 // and the complex/complicated "Backend Sched" feature in ggml backend subsystem could be not used along this new approach
 static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) {
     GGML_UNUSED(backend);

1 comment on commit 0d05e7e

@zhouwg (Owner, Author) commented on 0d05e7e, May 31, 2024


the existing "Backend Sched" feature is used for other scenarios which the backend need to use/operate device(CPU/GPU) memory directly, for example, the Intel SYCL backend.

Any existing or new backend that only needs to use/operate on system memory can follow the style in ggml-qnn.cpp along with the proposed/refined ggml backend subsystem (although that PR was not accepted by the maintainer of the ggml backend subsystem).
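
For illustration only, here is a minimal sketch of that style. It is not code from ggml-qnn.cpp: the backend name "xxx", the chosen set of ops, and the hook bodies are assumptions; only the hook signatures mirror the ones visible in the diff above (a buffer_is_host hook that returns true, plus an offload_op hook).

```cpp
#include "ggml.h"
#include "ggml-backend.h"

// hypothetical host-memory-only backend "xxx": its buffers live in ordinary
// system memory, so the CPU backend can read/write them directly and no
// cross-device tensor copies are needed.
static bool ggml_backend_xxx_buffer_is_host(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);
    return true;
}

// claim only the ops this backend accelerates; anything it declines simply
// runs on the default CPU path, giving mixed CPU&GPU / CPU&NPU inference
// without the full "Backend Sched" machinery.
static bool ggml_backend_xxx_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) {
    GGML_UNUSED(backend);
    switch (tensor->op) {
        case GGML_OP_ADD:
        case GGML_OP_MUL:
        case GGML_OP_MUL_MAT:
            return true;   // offload these to the accelerator
        default:
            return false;  // everything else stays on the CPU backend
    }
}
```

The real decision logic in ggml-qnn.cpp is presumably more involved; the point is only that when the buffer is host memory, falling back to the CPU backend for an unsupported op requires no data movement.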

In fact, the existing "Backend Sched" feature is heavily used in llama.cpp for various complex scenarios.
