3 changes: 3 additions & 0 deletions README.md
@@ -619,6 +619,9 @@ cd ../../
#Compile for posix with build-posix as a target folder

cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix
#To enable perf stats, use the command below
cmake -B build-posix -DCMAKE_BUILD_TYPE=Debug -DGGML_TSAVORITE=ON -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF"

cmake --build build-posix --config Release

#Compile for fpga with build-fpga as a target folder
33 changes: 32 additions & 1 deletion ggml/include/ggml.h
@@ -317,6 +317,14 @@ extern "C" {
GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);

#ifdef GGML_PERF
enum ggml_compute_backend_type {
GGML_COMPUTE_BACKEND_CPU=0,
GGML_COMPUTE_BACKEND_TSAVORITE,
GGML_COMPUTE_BACKEND_COUNT
};
#endif /* GGML_PERF */

enum ggml_status {
GGML_STATUS_ALLOC_FAILED = -2,
GGML_STATUS_FAILED = -1,
@@ -603,8 +611,14 @@ extern "C" {
char name[GGML_MAX_NAME];

void * extra; // extra things e.g. for ggml-cuda.cu

#ifdef GGML_PERF
int64_t perf_runs;
int64_t perf_time_us;
enum ggml_compute_backend_type ggml_compute_backend;
char padding[4];
#else
char padding[8];
#endif /* GGML_PERF */
};

static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -2197,6 +2211,23 @@ extern "C" {
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);

#ifdef GGML_PERF
// internal perf accumulation struct
struct ggml_perf_totals {
int op_count;
int64_t total_us;
int64_t runs;
const char * op_name;
};

FILE * ggml_perf_log_open(const char *filename);
void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp);

// capture perf into totals
void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph);

#endif /* GGML_PERF */

#ifdef __cplusplus
}
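A minimal sketch of how these hooks are intended to fit together, mirroring the llama-context.cpp changes further down in this PR; the perf_capture() wrapper and its static file handle are illustrative only, not part of the API:

#include <stdio.h>
#include "ggml.h"

#ifdef GGML_PERF
// Illustrative wrapper (hypothetical): fold one computed graph into per-op
// totals and append its per-node timings to a log file.
static void perf_capture(struct ggml_cgraph * gf) {
    static struct ggml_perf_totals totals[GGML_OP_COUNT] = {0};
    static FILE * fp = NULL;

    if (!fp) {
        fp = ggml_perf_log_open("ggml_perf-all-shape.log");
    }
    if (fp) {
        ggml_perf_write_detailed_csv(gf, fp); // per-node timings for this graph
    }
    ggml_perf_accumulate(totals, gf);         // per-op running totals
}
#endif /* GGML_PERF */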
13 changes: 13 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu.c
@@ -2844,8 +2844,21 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
struct ggml_tensor * node = cgraph->nodes[node_n];

#ifdef GGML_PERF
int64_t t_start = ggml_time_us();
#endif
ggml_compute_forward(&params, node);

#ifdef GGML_PERF
int64_t t_end = ggml_time_us();
node->perf_runs++;
if (t_end >= t_start) {
node->perf_time_us += (t_end - t_start);
} else {
// Handle wraparound by assuming timer rolls over at max int64_t value
node->perf_time_us += (INT64_MAX - t_start + t_end + 1);
}
#endif
if (state->ith == 0 && cplan->abort_callback &&
cplan->abort_callback(cplan->abort_callback_data)) {
atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
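The same wraparound-safe accumulation is repeated in the tsavorite backend below; a small helper along these lines (hypothetical, not part of this change) would capture the shared pattern:

#include <stdint.h>
#include "ggml.h"

#ifdef GGML_PERF
// Hypothetical helper: wraparound-safe accumulation of elapsed microseconds
// into a node's perf counters, as done in both ggml-cpu.c and ggml-tsavorite.cpp.
static inline void ggml_perf_node_add_elapsed(struct ggml_tensor * node,
                                              int64_t t_start, int64_t t_end) {
    node->perf_runs++;
    if (t_end >= t_start) {
        node->perf_time_us += t_end - t_start;
    } else {
        // the timer is assumed to roll over at INT64_MAX
        node->perf_time_us += INT64_MAX - t_start + t_end + 1;
    }
}
#endif /* GGML_PERF */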
14 changes: 14 additions & 0 deletions ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
@@ -813,6 +813,9 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
tensor_log log_data;

for (int i = 0; i < cgraph->n_nodes; i++) {
#ifdef GGML_PERF
int64_t t_start = ggml_time_us();
#endif
node = cgraph->nodes[i];
src0 = node->src[0];
src1 = node->src[1];
@@ -1122,6 +1125,17 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
device->stats.op_run_count[kernel_type].max_num_of_elem < max_num_of_elem)
device->stats.op_run_count[kernel_type].max_num_of_elem = max_num_of_elem;
}
#ifdef GGML_PERF
int64_t t_end = ggml_time_us();
node->perf_runs++;
node->ggml_compute_backend = GGML_COMPUTE_BACKEND_TSAVORITE;
if (t_end >= t_start) {
node->perf_time_us += (t_end - t_start);
} else {
// Handle wraparound by assuming timer rolls over at max int64_t value
node->perf_time_us += (INT64_MAX - t_start + t_end + 1);
}
#endif
}

// This needs to be implemented correctly when we have a mixture of CPU and accelerator operations
83 changes: 82 additions & 1 deletion ggml/src/ggml.c
@@ -250,7 +250,7 @@ void ggml_log_internal(enum ggml_log_level level, const char * format, ...) {
va_list args;
va_start(args, format);
if (level == GGML_LOG_LEVEL_TSAVORITE)
ggml_log_internal_v(level, format, args);
ggml_log_internal_v(level, format, args);
va_end(args);
}

@@ -985,6 +985,13 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};

#ifdef GGML_PERF
static const char * GGML_BACKEND_TYPE[GGML_COMPUTE_BACKEND_COUNT] = {
"CPU",
"TSAVORITE"
};
#endif /* GGML_PERF */

static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");

static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1200,6 +1207,12 @@ const char * ggml_op_name(enum ggml_op op) {
return GGML_OP_NAME[op];
}

#ifdef GGML_PERF
static const char * ggml_backend_type(enum ggml_compute_backend_type backend) {
return GGML_BACKEND_TYPE[backend];
}
#endif /* GGML_PERF */

const char * ggml_op_symbol(enum ggml_op op) {
return GGML_OP_SYMBOL[op];
}
@@ -1617,6 +1630,11 @@ static struct ggml_tensor * ggml_new_tensor_impl(
/*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
/*.name =*/ { 0 },
/*.extra =*/ NULL,
#ifdef GGML_PERF
/*.perf_runs =*/ 0,
/*.perf_time_us =*/ 0,
/*.ggml_compute_backend =*/ GGML_COMPUTE_BACKEND_CPU,
#endif /* GGML_PERF */
/*.padding =*/ { 0 },
};

@@ -6549,3 +6567,66 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons
if (p0->strict_cpu != p1->strict_cpu ) return false;
return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
}

#ifdef GGML_PERF
void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph) {
for (int i = 0; i < cgraph->n_nodes; ++i) {
struct ggml_tensor * node = cgraph->nodes[i];
enum ggml_op op = node->op;

if (op >= GGML_OP_COUNT) continue;

totals[op].op_name = ggml_op_name(op);
totals[op].total_us += node->perf_time_us;
totals[op].runs += node->perf_runs;
totals[op].op_count++;
}
}

FILE * ggml_perf_log_open(const char *filename) {
// Try to delete existing file, ignore error if it doesn't exist
remove(filename);

// Create a new file in write mode
FILE *fp = fopen(filename, "w");
if (!fp) {
fprintf(stderr, "Error: Could not create file %s\n", filename);
return NULL;
}

return fp;
}

void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp) {
if (!fp) return;

int64_t total_time_us = 0;
for (int i = 0; i < cgraph->n_nodes; ++i) {
if (cgraph->nodes[i]->perf_runs > 0) {
total_time_us += cgraph->nodes[i]->perf_time_us;
}
}

fprintf(fp, "ggml_graph_compute_perf: total compute time: %.3f ms\n", total_time_us / 1000.0);

for (int i = 0; i < cgraph->n_nodes; ++i) {
struct ggml_tensor * node = cgraph->nodes[i];
if (node->perf_runs == 0) continue;

double t_ms = node->perf_time_us / 1000.0;
double avg_ms = t_ms / node->perf_runs;

fprintf(fp,
" - BACKEND:%s OP:%s: total %.3f ms over %lld runs (avg %.3f ms) [shape=%lld,%lld,%lld]\n",
ggml_backend_type(node->ggml_compute_backend),
ggml_op_name(node->op),
t_ms,
(long long) node->perf_runs,
avg_ms,
(long long) node->ne[0], (long long) node->ne[1], (long long) node->ne[2]);
}

fprintf(fp, "--------------------------------------------------\n\n");
}

#endif /* GGML_PERF */
26 changes: 26 additions & 0 deletions src/llama-context.cpp
@@ -932,6 +932,9 @@ int llama_context::decode(llama_batch & inp_batch) {
kv_self_update();

int64_t n_outputs_prev = 0;
#ifdef GGML_PERF
FILE *perf_all_shape_fp = ggml_perf_log_open("ggml_perf-all-shape.log");
#endif /* GGML_PERF */

while (sbatch.n_tokens > 0) {
llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled);
@@ -971,6 +974,12 @@
res->set_inputs(&ubatch);

const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1);
#ifdef GGML_PERF
if (perf_all_shape_fp) {
ggml_perf_write_detailed_csv(gf, perf_all_shape_fp);
}
ggml_perf_accumulate(perf_totals, gf);
#endif /* GGML_PERF */
if (compute_status != GGML_STATUS_SUCCESS) {
switch (compute_status) {
case GGML_STATUS_ABORTED:
@@ -2611,6 +2620,19 @@ llama_perf_context_data llama_perf_context(const llama_context * ctx) {
return data;
}

void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
for (int i = 0; i < GGML_OP_COUNT; ++i) {
if (totals[i].runs > 0) {
LLAMA_LOG_TSAVORITE(" %-16s: %5lld runs, %8lld us total, avg %.2f us\n",
totals[i].op_name ? totals[i].op_name : "UNKNOWN",
(long long) totals[i].runs,
(long long) totals[i].total_us,
(double) totals[i].total_us / totals[i].runs);
}
}
}

void llama_perf_context_print(const llama_context * ctx) {
const auto data = llama_perf_context(ctx);

@@ -2622,12 +2644,16 @@ void llama_perf_context_print(const llama_context * ctx) {
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));

#ifdef GGML_PERF
LLAMA_LOG_TSAVORITE("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
LLAMA_LOG_TSAVORITE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
LLAMA_LOG_TSAVORITE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
LLAMA_LOG_TSAVORITE("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));

ggml_perf_print_totals(const_cast<ggml_perf_totals *>(ctx->perf_totals));
#endif /* GGML_PERF */
}

void llama_perf_context_reset(llama_context * ctx) {
4 changes: 4 additions & 0 deletions src/llama-context.h
@@ -8,6 +8,7 @@

#include "ggml-cpp.h"
#include "ggml-opt.h"
#include "ggml.h"

#include <map>
#include <vector>
@@ -183,6 +184,9 @@
ggml_status graph_compute(
ggml_cgraph * gf,
bool batched);
#ifdef GGML_PERF
struct ggml_perf_totals perf_totals[GGML_OP_COUNT] = {}; // per-op perf totals for this context, filled during decode()
#endif

private:
llm_graph_result_ptr graph_build(
2 changes: 2 additions & 0 deletions tools/main/main.cpp
@@ -126,7 +126,9 @@ int main(int argc, char ** argv) {
LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
}

#ifdef GGML_PERF
llama_log_set(my_logger, nullptr);
#endif /* GGML_PERF */
LOG_INF("%s: llama backend init\n", __func__);

llama_backend_init();