From fa46e7ef579941fbc52f9e215a21aa47ea0c1773 Mon Sep 17 00:00:00 2001
From: Anoop Kapoor
Date: Mon, 30 Jun 2025 16:10:33 -0700
Subject: [PATCH] @FIR-781 - llama.cpp ggml stats: Adding Backend and Unary OP Detail

---
 ggml/include/ggml.h   | 19 +++++++++++++++++--
 ggml/src/ggml.c       | 42 ++++++++++++++++++++++++++++++++++--------
 src/llama-context.cpp | 39 ++++++++++++++++++++++++++++++++++-----
 3 files changed, 85 insertions(+), 15 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index d739f4c44e0..52780b0aea9 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -612,9 +612,9 @@ extern "C" {
         void * extra; // extra things e.g. for ggml-cuda.cu
 #ifdef GGML_PERF
-        int64_t perf_runs;
+        int64_t perf_runs;
         int64_t perf_time_us;
-        enum ggml_compute_backend_type ggml_compute_backend;
+        enum ggml_compute_backend_type ggml_compute_backend;
         char padding[4];
 #else
         char padding[8];
 #endif
@@ -2213,12 +2213,26 @@ extern "C" {
     GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);

 #ifdef GGML_PERF
+struct ggml_perf_backend_subtotals {
+    int64_t total_us;
+    int64_t runs;
+};
+
+struct ggml_perf_unary_subtotals {
+    int64_t total_us;
+    int64_t runs;
+};
 // internal perf accumulation struct
 struct ggml_perf_totals {
     int op_count;
     int64_t total_us;
     int64_t runs;
     const char * op_name;
+
+    // Only used for UNARY
+    struct ggml_perf_unary_subtotals unary_subtotals[GGML_UNARY_OP_COUNT];
+    // run count per backend
+    struct ggml_perf_backend_subtotals backend_subtotals[GGML_COMPUTE_BACKEND_COUNT];
 };

 FILE * ggml_perf_log_open(const char *filename);
@@ -2226,6 +2240,7 @@ void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp);

 // capture perf into totals
 void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph);
+const char * ggml_backend_type(enum ggml_compute_backend_type backend);

 #endif /* GGML_PERF */

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index cddeca90672..a90f144352c 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1208,7 +1208,7 @@ const char * ggml_op_name(enum ggml_op op) {
 }

 #ifdef GGML_PERF
-static const char * ggml_backend_type(enum ggml_compute_backend_type backend) {
+const char * ggml_backend_type(enum ggml_compute_backend_type backend) {
     return GGML_BACKEND_TYPE[backend];
 }
 #endif /* GGML_PERF */
@@ -6580,6 +6580,19 @@ void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct
         totals[op].total_us += node->perf_time_us;
         totals[op].runs += node->perf_runs;
         totals[op].op_count++;
+
+        // Count backend runs
+        enum ggml_compute_backend_type be = node->ggml_compute_backend;
+        if (be >= GGML_COMPUTE_BACKEND_CPU && be < GGML_COMPUTE_BACKEND_COUNT) {
+            totals[op].backend_subtotals[be].total_us += node->perf_time_us;
+            totals[op].backend_subtotals[be].runs += node->perf_runs;
+        }
+
+        if (op == GGML_OP_UNARY) {
+            enum ggml_unary_op subop = ggml_get_unary_op(node);
+            totals[op].unary_subtotals[subop].total_us += node->perf_time_us;
+            totals[op].unary_subtotals[subop].runs += node->perf_runs;
+        }
     }
 }

@@ -6607,7 +6620,10 @@ void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp) {
         }
     }

-    fprintf(fp, "ggml_graph_compute_perf: total compute time: %.3f ms\n", total_time_us / 1000.0);
+    fprintf(fp, "\n=== GGML Detailed Op Perf (%.3f ms total) ===\n", total_time_us / 1000.0);
+    fprintf(fp,
+        "%-12s %-20s %10s %12s %10s %10s %10s %10s %10s\n",
+        "Backend", "Op", "Runs", "Total ms", "Avg ms", "ne[0]", "ne[1]", "ne[2]", "ne[3]");

     for (int i = 0; i < cgraph->n_nodes; ++i) {
         struct ggml_tensor * node = cgraph->nodes[i];
@@ -6616,17 +6632,27 @@ void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp) {
         double t_ms = node->perf_time_us / 1000.0;
         double avg_ms = t_ms / node->perf_runs;

+        const char * op_name = ggml_op_name(node->op);
+        char full_op[64];
+        if (node->op == GGML_OP_UNARY) {
+            enum ggml_unary_op subop = ggml_get_unary_op(node);
+            snprintf(full_op, sizeof(full_op), "UNARY(%s)", ggml_unary_op_name(subop));
+            op_name = full_op;
+        }
+
         fprintf(fp,
-            " - BACKEND:%s OP:%s: total %.3f ms over %d runs (avg %.3f ms) [shape=%d,%d,%d]\n",
+            "%-12s %-20s %10d %12.3f %10.3f %10ld %10ld %10ld %10ld\n",
             ggml_backend_type(node->ggml_compute_backend),
-            ggml_op_name(node->op),
-            t_ms,
+            op_name,
             node->perf_runs,
+            t_ms,
             avg_ms,
-            node->ne[0], node->ne[1], node->ne[2]);
+            node->ne[0],
+            node->ne[1],
+            node->ne[2],
+            node->ne[3]);
     }

-    fprintf(fp, "--------------------------------------------------\n\n");
+    fprintf(fp, "--------------------------------------------------------------------------------------------------------\n\n");
 }
-
 #endif /* GGML_PERF */

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 4d8b6fdf5a7..b62f82f8185 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2620,18 +2620,47 @@ llama_perf_context_data llama_perf_context(const llama_context * ctx) {
     return data;
 }

+#ifdef GGML_PERF
 void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
     LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
+    LLAMA_LOG_TSAVORITE("  %-16s: %7s %14s %16s\n", "Op", "Runs", "Total us", "Avg us");
+
     for (int i = 0; i < GGML_OP_COUNT; ++i) {
         if (totals[i].runs > 0) {
-            LLAMA_LOG_TSAVORITE("  %-16s: %5ld runs, %8ld us total, avg %.2f us\n",
-                totals[i].op_name ? totals[i].op_name : "UNKNOWN",
-                totals[i].runs,
-                totals[i].total_us,
-                (double)totals[i].total_us / totals[i].runs);
+            // Main op row
+            LLAMA_LOG_TSAVORITE("  %-16s: %7ld %14ld %16.2f\n",
+                totals[i].op_name ? totals[i].op_name : "UNKNOWN",
+                totals[i].runs,
+                totals[i].total_us,
+                (double)totals[i].total_us / totals[i].runs);
+
+            // Backend subtotals
+            for (int b = 0; b < GGML_COMPUTE_BACKEND_COUNT; ++b) {
+                if (totals[i].backend_subtotals[b].runs > 0) {
+                    LLAMA_LOG_TSAVORITE("    [%-10s]  : %7ld %14ld %16.2f\n",
+                        ggml_backend_type((enum ggml_compute_backend_type) b),
+                        totals[i].backend_subtotals[b].runs,
+                        totals[i].backend_subtotals[b].total_us,
+                        (double)totals[i].backend_subtotals[b].total_us / totals[i].backend_subtotals[b].runs);
+                }
+            }
+
+            // Unary sub-op breakdown
+            if (i == GGML_OP_UNARY) {
+                for (int j = 0; j < GGML_UNARY_OP_COUNT; ++j) {
+                    if (totals[i].unary_subtotals[j].runs > 0) {
+                        LLAMA_LOG_TSAVORITE("    -> %-12s: %7ld %14ld %16.2f\n",
+                            ggml_unary_op_name((enum ggml_unary_op) j),
+                            totals[i].unary_subtotals[j].runs,
+                            totals[i].unary_subtotals[j].total_us,
+                            (double)totals[i].unary_subtotals[j].total_us / totals[i].unary_subtotals[j].runs);
+                    }
+                }
+            }
         }
     }
 }
+#endif /* GGML_PERF */

 void llama_perf_context_print(const llama_context * ctx) {
     const auto data = llama_perf_context(ctx);