tsisw · akapoor3518 · Jul 1, 2025 · Jun 30, 2025
@@ -612,9 +612,9 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 #ifdef GGML_PERF
-	int64_t perf_runs;
+        int64_t perf_runs;
         int64_t perf_time_us;
-	enum ggml_compute_backend_type ggml_compute_backend;
+        enum ggml_compute_backend_type ggml_compute_backend;
         char padding[4];
 #else
         char padding[8];
@@ -2213,19 +2213,34 @@ extern "C" {
     GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
 
 #ifdef GGML_PERF
+struct ggml_perf_backend_subtotals {
+    int64_t total_us;
+    int64_t runs;
+};
+
+struct ggml_perf_unary_subtotals {
+    int64_t total_us;
+    int64_t runs;
+};
 // internal perf accumulation struct
 struct ggml_perf_totals {
     int op_count;
     int64_t total_us;
     int64_t runs;
     const char * op_name;
+
+    // Only used for UNARY
+    struct ggml_perf_unary_subtotals unary_subtotals[GGML_UNARY_OP_COUNT];
+    // run count per backend
+    struct ggml_perf_backend_subtotals backend_subtotals[GGML_COMPUTE_BACKEND_COUNT];
 };
 
 FILE * ggml_perf_log_open(const char *filename);
 void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp);
 
 // capture perf into totals
 void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph);
+const char * ggml_backend_type(enum ggml_compute_backend_type backend);
 
 #endif /* GGML_PERF */
 

@@ -1208,7 +1208,7 @@ const char * ggml_op_name(enum ggml_op op) {
 }
 
 #ifdef GGML_PERF
-static const char * ggml_backend_type(enum ggml_compute_backend_type backend) {
+const char * ggml_backend_type(enum ggml_compute_backend_type backend) {
     return GGML_BACKEND_TYPE[backend];
 }
 #endif /* GGML_PERF */
@@ -6580,6 +6580,19 @@ void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct
         totals[op].total_us += node->perf_time_us;
         totals[op].runs     += node->perf_runs;
         totals[op].op_count++;
+
+	// Count backend runs
+        enum ggml_compute_backend_type be = node->ggml_compute_backend;
+        if (be >= GGML_COMPUTE_BACKEND_CPU && be < GGML_COMPUTE_BACKEND_COUNT) {
+            totals[op].backend_subtotals[be].total_us += node->perf_time_us;
+	    totals[op].backend_subtotals[be].runs     += node->perf_runs;
+        }
+
+        if (op == GGML_OP_UNARY) {
+            enum ggml_unary_op subop = ggml_get_unary_op(node);
+            totals[op].unary_subtotals[subop].total_us += node->perf_time_us;
+            totals[op].unary_subtotals[subop].runs     += node->perf_runs;
+        }
     }
 }
 
@@ -6607,7 +6620,10 @@ void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp) {
         }
     }
 
-    fprintf(fp, "ggml_graph_compute_perf: total compute time: %.3f ms\n", total_time_us / 1000.0);
+    fprintf(fp, "\n=== GGML Detailed Op Perf (%.3f ms total) ===\n", total_time_us / 1000.0);
+    fprintf(fp,
+        "%-12s %-20s %10s %12s %10s %10s %10s %10s %10s\n",
+        "Backend", "Op", "Runs", "Total ms", "Avg ms", "ne[0]", "ne[1]", "ne[2]", "ne[3]");
 
     for (int i = 0; i < cgraph->n_nodes; ++i) {
         struct ggml_tensor * node = cgraph->nodes[i];
@@ -6616,17 +6632,27 @@ void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp) {
         double t_ms   = node->perf_time_us / 1000.0;
         double avg_ms = t_ms / node->perf_runs;
 
+        const char * op_name = ggml_op_name(node->op);
+        char full_op[64];
+        if (node->op == GGML_OP_UNARY) {
+            enum ggml_unary_op subop = ggml_get_unary_op(node);
+            snprintf(full_op, sizeof(full_op), "UNARY(%s)", ggml_unary_op_name(subop));
+            op_name = full_op;
+        }
+
         fprintf(fp,
-            " - BACKEND:%s OP:%s: total %.3f ms over %d runs (avg %.3f ms) [shape=%d,%d,%d]\n",
+            "%-12s %-20s %10d %12.3f %10.3f %10ld %10ld %10ld %10ld\n",
             ggml_backend_type(node->ggml_compute_backend),
-            ggml_op_name(node->op),
-            t_ms,
+            op_name,
             node->perf_runs,
+            t_ms,
             avg_ms,
-            node->ne[0], node->ne[1], node->ne[2]);
+            node->ne[0],
+            node->ne[1],
+            node->ne[2],
+            node->ne[3]);
     }
 
-    fprintf(fp, "--------------------------------------------------\n\n");
+    fprintf(fp, "--------------------------------------------------------------------------------------------------------\n\n");
 }
-
 #endif /* GGML_PERF */
@@ -2620,18 +2620,47 @@ llama_perf_context_data llama_perf_context(const llama_context * ctx) {
     return data;
 }
 
+#ifdef GGML_PERF
 void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
     LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
+    LLAMA_LOG_TSAVORITE("  %-16s: %7s  %14s  %16s\n", "Op", "Runs", "Total us", "Avg us");
+
     for (int i = 0; i < GGML_OP_COUNT; ++i) {
         if (totals[i].runs > 0) {
-            LLAMA_LOG_TSAVORITE("  %-16s: %5ld runs, %8ld us total, avg %.2f us\n",
-                   totals[i].op_name ? totals[i].op_name : "UNKNOWN",
-                   totals[i].runs,
-                   totals[i].total_us,
-                   (double)totals[i].total_us / totals[i].runs);
+            // Main op row
+            LLAMA_LOG_TSAVORITE("  %-16s: %7ld  %14ld  %16.2f\n",
+                totals[i].op_name ? totals[i].op_name : "UNKNOWN",
+                totals[i].runs,
+                totals[i].total_us,
+                (double)totals[i].total_us / totals[i].runs);
+
+            // Backend subtotals
+            for (int b = 0; b < GGML_COMPUTE_BACKEND_COUNT; ++b) {
+                if (totals[i].backend_subtotals[b].runs > 0) {
+                    LLAMA_LOG_TSAVORITE("    [%-10s] : %7ld  %14ld  %16.2f\n",
+                        ggml_backend_type((enum ggml_compute_backend_type) b),
+                        totals[i].backend_subtotals[b].runs,
+                        totals[i].backend_subtotals[b].total_us,
+                        (double)totals[i].backend_subtotals[b].total_us / totals[i].backend_subtotals[b].runs);
+                }
+            }
+
+            // Unary sub-op breakdown
+            if (i == GGML_OP_UNARY) {
+                for (int j = 0; j < GGML_UNARY_OP_COUNT; ++j) {
+                    if (totals[i].unary_subtotals[j].runs > 0) {
+                        LLAMA_LOG_TSAVORITE("    -> %-12s: %7ld  %14ld  %16.2f\n",
+                            ggml_unary_op_name((enum ggml_unary_op) j),
+                            totals[i].unary_subtotals[j].runs,
+                            totals[i].unary_subtotals[j].total_us,
+                            (double)totals[i].unary_subtotals[j].total_us / totals[i].unary_subtotals[j].runs);
+                    }
+                }
+            }
         }
     }
 }
+#endif /* GGML_PERF */
 
 void llama_perf_context_print(const llama_context * ctx) {
     const auto data = llama_perf_context(ctx);