ggml-qnn: bug free and update comments according to the refined ggml-backend-subsystem (#217)
zhouwg committed May 31, 2024
1 parent bee4a4b commit 0d05e7e
Showing 4 changed files with 24 additions and 50 deletions.
4 changes: 2 additions & 2 deletions cdeosplayer/kantv/build.gradle
@@ -14,7 +14,7 @@ android {
     externalNativeBuild {
         cmake {
             //modify to -DCMAKE_BUILD_TYPE=Release before prepare release apk
-            arguments += "-DCMAKE_BUILD_TYPE=Release"
+            arguments += "-DCMAKE_BUILD_TYPE=Debug"
             //weiguo:2024-05-28, added for fix issue in this PR:https://github.com/zhouwg/kantv/pull/204
             arguments += "-DCMAKE_ANDROID_STL_TYPE=c++_shared"
             arguments += "-DANDROID_STL=c++_shared"
@@ -69,7 +69,7 @@ android {
     }
 
     productFlavors {
-        all32 { minSdkVersion project.ext.minSdkVersion }
+        //all32 { minSdkVersion project.ext.minSdkVersion }
        all64 { minSdkVersion project.ext.minSdkVersion }
     }
     sourceSets {
13 changes: 2 additions & 11 deletions core/ggml/jni/ggml-jni-impl-external.cpp
@@ -1348,21 +1348,12 @@ int llama_inference(const char * model_path, const char * prompt, int bench_typ
 
     ggml_jni_llama_print_timings(ctx);
     if (ctx_guidance) {
-        LOGGD("here");
         llama_free(ctx_guidance);
     }
-    LOGGD("here");
-    //TODO:crash here on Xiaomi 14 and memory leak after comment it
-    //llama_free(ctx); //TODO:
-    LOGGD("here");
-    //TODO:crash here on Xiaomi 14 and memory leak after comment it
-    //llama_free_model(model);
-    LOGGD("here");
-
+    llama_free(ctx);
+    llama_free_model(model);
     llama_sampling_free(ctx_sampling);
-    LOGGD("here");
     llama_backend_free();
-    LOGGD("here");
 
     return 0;
 }
11 changes: 2 additions & 9 deletions core/ggml/jni/llm-inference.cpp
@@ -989,15 +989,8 @@ int llama_inference_main(int argc, char ** argv, int backend) {
     write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
     if (ctx_guidance) { llama_free(ctx_guidance); }
-
-    LOGGD("here");
-    //TODO:crash here on Xiaomi 14 and memory leak after comment it
-    //llama_free(ctx);
-    LOGGD("here");
-    //TODO:crash here on Xiaomi 14 and memory leak after comment it
-    //llama_free_model(model);
-    LOGGD("here");
-
+    llama_free(ctx);
+    llama_free_model(model);
     llama_sampling_free(ctx_sampling);
     llama_backend_free();
 
46 changes: 18 additions & 28 deletions core/ggml/llamacpp/ggml-qnn.cpp
Expand Up @@ -124,7 +124,7 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const
#define BUF_CONTROL_BASE 0xEE000000

#define GGML_QNN_DEBUG 1 //for troubleshooting QNN backend, should be changed to 0 in product envs
#define NOT_IN_PR 1 //for update PR(https://github.com/ggerganov/llama.cpp/pull/6869) in upstream easily and quickly
#define NOT_IN_PR 0 //for update PR(https://github.com/ggerganov/llama.cpp/pull/6869) in upstream easily and quickly


#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
@@ -892,7 +892,7 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) {
         *scales = (float *)malloc(scaleSize);
         memscpy(*scales, scaleSize, srcQParam.bwAxisScaleOffsetEncoding.scales, scaleSize);
 
-        // Only copy offsets if present, nullptr implies all offsets are 0
+        // only copy offsets if present, nullptr implies all offsets are 0
         if (bwAxisScaleOffset.offsets != nullptr) {
             size_t offsetSize = bwAxisScaleOffset.numElements * sizeof(int32_t);
             *offsets = (int32_t *)malloc(offsetSize);
@@ -903,7 +903,7 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) {
         QNN_TENSOR_SET_QUANT_PARAMS(dst, srcQParam);
     }
 
-    // need to allocate and copy memory for all the pointer members
+    // allocate and copy memory for all the pointer members
     uint32_t rank = QNN_TENSOR_GET_RANK(src);
     QNN_TENSOR_SET_RANK(dst, rank);
     size_t dim_size = rank * sizeof(uint32_t);
@@ -922,20 +922,8 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) {
 static int free_qnn_tensor(Qnn_Tensor_t & tensor) {
     int err = 0;
     VALIDATE_TENSOR_VERSION(tensor, err);
-
-    if (nullptr == QNN_TENSOR_GET_NAME(tensor)) {
-        QNN_LOG_INFO("it should not happen, pls check");
-    } else {
-        //QNN_LOG_DEBUG("QNN tensor name %s", QNN_TENSOR_GET_NAME(tensor));
-        free((void *) QNN_TENSOR_GET_NAME(tensor));
-    }
-    if (nullptr == QNN_TENSOR_GET_DIMENSIONS(tensor)) {
-        QNN_LOG_INFO("it should not happen, pls check");
-    } else {
-        //TODO:why crash in here? why pointer changed with mul_mat?
-        //memory leak after comment below line
-        //free(QNN_TENSOR_GET_DIMENSIONS(tensor));
-    }
+    free((void *) QNN_TENSOR_GET_NAME(tensor));
+    free(QNN_TENSOR_GET_DIMENSIONS(tensor));
 
     return err;
 }
@@ -3521,6 +3509,7 @@ struct ggml_backend_qnn_buffer_context {
         if (buffer) {
             free(buffer);
         }
+
         for (auto * sub_buffer : sub_buffers) {
             free(sub_buffer);
         }
@@ -3530,16 +3519,6 @@ struct ggml_backend_qnn_buffer_context {
             free(qnn_tensor);
         }
 
-        std::map<std::string, std::tuple<Qnn_GraphHandle_t, Qnn_Tensor_t *, Qnn_Tensor_t *, Qnn_Tensor_t *>>::iterator graph_it;
-        struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *) g_qnn_backend->context;
-        QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->instance->get_qnn_raw_interface();
-        for (graph_it = backend_ctx->instance->_qnn_graph_map.begin(); graph_it != backend_ctx->instance->_qnn_graph_map.end(); graph_it++) {
-            auto & graph_item = graph_it->second;
-            Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item);
-            QNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str());
-        }
-        backend_ctx->instance->_qnn_graph_map.clear();
-
         sub_buffers.clear();
         qnn_tensors.clear();
     }
@@ -3771,6 +3750,15 @@ static void ggml_backend_qnn_free(ggml_backend_t backend) {
 
     qnn_instance * instance = (qnn_instance*)g_qnn_mgr[ctx->device].instance;
     if (instance != nullptr) {
+        std::map<std::string, std::tuple<Qnn_GraphHandle_t, Qnn_Tensor_t *, Qnn_Tensor_t *, Qnn_Tensor_t *>>::iterator graph_it;
+        QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->instance->get_qnn_raw_interface();
+        for (graph_it = instance->_qnn_graph_map.begin(); graph_it != instance->_qnn_graph_map.end(); graph_it++) {
+            auto & graph_item = graph_it->second;
+            Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item);
+            QNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str());
+        }
+        instance->_qnn_graph_map.clear();
+
         instance->qnn_finalize();
         delete instance;
         g_qnn_mgr[ctx->device].instance = nullptr;
@@ -3832,7 +3820,9 @@ static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tens
 //note: this function be used in new/proposal/refined ggml backend subsystem:
 // https://github.com/zhouwg/kantv/pull/216 in this project
 // https://github.com/ggerganov/llama.cpp/pull/7641 in upstream
-// new ggml backend can following this style for mixed inference between CPU&GPU / CPU&NPU very easily
+//
+// new ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true)
+// can following this style for mixed inference between CPU&GPU / CPU&NPU very easily
 // and the complex/complicated "Backend Sched" feature in ggml backend subsystem could be not used along this new approach
 static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) {
     GGML_UNUSED(backend);

1 comment on commit 0d05e7e

@zhouwg (Owner, Author) commented on 0d05e7e, May 31, 2024


the existing "Backend Sched" feature is used for other scenarios which the backend need to use/operate device(CPU/GPU) memory directly, for example, the Intel SYCL backend.

Any existing or new backend that only needs to use/operate on system memory can follow the style in ggml-qnn.cpp along with the proposed/refined ggml backend subsystem (although that PR was not accepted by the maintainer of the ggml backend subsystem).
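
For illustration only, here is a minimal sketch of that style. It is not code from ggml-qnn.cpp: the backend name "xxx", the chosen set of ops, and the hook bodies are assumptions; only the hook signatures mirror the ones visible in the diff above (a buffer_is_host hook that returns true, plus an offload_op hook).

```cpp
#include "ggml.h"
#include "ggml-backend.h"

// hypothetical host-memory-only backend "xxx": its buffers live in ordinary
// system memory, so the CPU backend can read/write them directly and no
// cross-device tensor copies are needed.
static bool ggml_backend_xxx_buffer_is_host(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);
    return true;
}

// claim only the ops this backend accelerates; anything it declines simply
// runs on the default CPU path, giving mixed CPU&GPU / CPU&NPU inference
// without the full "Backend Sched" machinery.
static bool ggml_backend_xxx_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) {
    GGML_UNUSED(backend);
    switch (tensor->op) {
        case GGML_OP_ADD:
        case GGML_OP_MUL:
        case GGML_OP_MUL_MAT:
            return true;   // offload these to the accelerator
        default:
            return false;  // everything else stays on the CPU backend
    }
}
```

The real decision logic in ggml-qnn.cpp is presumably more involved; the point is only that when the buffer is host memory, falling back to the CPU backend for an unsupported op requires no data movement.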

In fact, the existing "Backend Sched" feature is heavily used in llama.cpp for various complex scenarios.
