From fa46e7ef579941fbc52f9e215a21aa47ea0c1773 Mon Sep 17 00:00:00 2001
From: Anoop Kapoor
Date: Mon, 30 Jun 2025 16:10:33 -0700
Subject: [PATCH] @FIR-781 - llama.cpp ggml stats: Adding Backend and Unary OP Detail

---
 ggml/include/ggml.h   | 19 +++++++++++++++++--
 ggml/src/ggml.c       | 42 ++++++++++++++++++++++++++++++++++--------
 src/llama-context.cpp | 39 ++++++++++++++++++++++++++++++++++-----
 3 files changed, 85 insertions(+), 15 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index d739f4c44e0..52780b0aea9 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -612,9 +612,9 @@ extern "C" {
         void * extra; // extra things e.g. for ggml-cuda.cu
 #ifdef GGML_PERF
-        int64_t perf_runs;
+        int64_t perf_runs;
         int64_t perf_time_us;
-        enum ggml_compute_backend_type ggml_compute_backend;
+        enum ggml_compute_backend_type ggml_compute_backend;
         char padding[4];
 #else
         char padding[8];
 #endif
@@ -2213,12 +2213,26 @@ extern "C" {
     GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);

 #ifdef GGML_PERF
+struct ggml_perf_backend_subtotals {
+    int64_t total_us;
+    int64_t runs;
+};
+
+struct ggml_perf_unary_subtotals {
+    int64_t total_us;
+    int64_t runs;
+};
 // internal perf accumulation struct
 struct ggml_perf_totals {
     int op_count;
     int64_t total_us;
     int64_t runs;
     const char * op_name;
+
+    // Only used for UNARY
+    struct ggml_perf_unary_subtotals unary_subtotals[GGML_UNARY_OP_COUNT];
+    // run count per backend
+    struct ggml_perf_backend_subtotals backend_subtotals[GGML_COMPUTE_BACKEND_COUNT];
 };

 FILE * ggml_perf_log_open(const char *filename);
@@ -2226,6 +2240,7 @@ void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp);

 // capture perf into totals
 void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph);
+const char * ggml_backend_type(enum ggml_compute_backend_type backend);

 #endif /* GGML_PERF */

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index cddeca90672..a90f144352c 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1208,7 +1208,7 @@ const char * ggml_op_name(enum ggml_op op) {
 }

 #ifdef GGML_PERF
-static const char * ggml_backend_type(enum ggml_compute_backend_type backend) {
+const char * ggml_backend_type(enum ggml_compute_backend_type backend) {
     return GGML_BACKEND_TYPE[backend];
 }
 #endif /* GGML_PERF */
@@ -6580,6 +6580,19 @@ void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct
         totals[op].total_us += node->perf_time_us;
         totals[op].runs += node->perf_runs;
         totals[op].op_count++;
+
+        // Count backend runs
+        enum ggml_compute_backend_type be = node->ggml_compute_backend;
+        if (be >= GGML_COMPUTE_BACKEND_CPU && be < GGML_COMPUTE_BACKEND_COUNT) {
+            totals[op].backend_subtotals[be].total_us += node->perf_time_us;
+            totals[op].backend_subtotals[be].runs += node->perf_runs;
+        }
+
+        if (op == GGML_OP_UNARY) {
+            enum ggml_unary_op subop = ggml_get_unary_op(node);
+            totals[op].unary_subtotals[subop].total_us += node->perf_time_us;
+            totals[op].unary_subtotals[subop].runs += node->perf_runs;
+        }
     }
 }

@@ -6607,7 +6620,10 @@ void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp) {
         }
     }

-    fprintf(fp, "ggml_graph_compute_perf: total compute time: %.3f ms\n", total_time_us / 1000.0);
+    fprintf(fp, "\n=== GGML Detailed Op Perf (%.3f ms total) ===\n", total_time_us / 1000.0);
+    fprintf(fp,
+        "%-12s %-20s %10s %12s %10s %10s %10s %10s %10s\n",
+        "Backend", "Op", "Runs", "Total ms", "Avg ms", "ne[0]", "ne[1]", "ne[2]", "ne[3]");

     for (int i = 0; i < cgraph->n_nodes; ++i) {
         struct ggml_tensor * node = cgraph->nodes[i];
@@ -6616,17 +6632,27 @@ void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp) {
         double t_ms = node->perf_time_us / 1000.0;
         double avg_ms = t_ms / node->perf_runs;

+        const char * op_name = ggml_op_name(node->op);
+        char full_op[64];
+        if (node->op == GGML_OP_UNARY) {
+            enum ggml_unary_op subop = ggml_get_unary_op(node);
+            snprintf(full_op, sizeof(full_op), "UNARY(%s)", ggml_unary_op_name(subop));
+            op_name = full_op;
+        }
+
         fprintf(fp,
-            " - BACKEND:%s OP:%s: total %.3f ms over %d runs (avg %.3f ms) [shape=%d,%d,%d]\n",
+            "%-12s %-20s %10d %12.3f %10.3f %10ld %10ld %10ld %10ld\n",
             ggml_backend_type(node->ggml_compute_backend),
-            ggml_op_name(node->op),
-            t_ms,
+            op_name,
             node->perf_runs,
+            t_ms,
             avg_ms,
-            node->ne[0], node->ne[1], node->ne[2]);
+            node->ne[0],
+            node->ne[1],
+            node->ne[2],
+            node->ne[3]);
     }

-    fprintf(fp, "--------------------------------------------------\n\n");
+    fprintf(fp, "--------------------------------------------------------------------------------------------------------\n\n");
 }
-
 #endif /* GGML_PERF */

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 4d8b6fdf5a7..b62f82f8185 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2620,18 +2620,47 @@ llama_perf_context_data llama_perf_context(const llama_context * ctx) {
     return data;
 }

+#ifdef GGML_PERF
 void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
     LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
+    LLAMA_LOG_TSAVORITE("  %-16s: %7s %14s %16s\n", "Op", "Runs", "Total us", "Avg us");
+
     for (int i = 0; i < GGML_OP_COUNT; ++i) {
         if (totals[i].runs > 0) {
-            LLAMA_LOG_TSAVORITE("  %-16s: %5ld runs, %8ld us total, avg %.2f us\n",
-                totals[i].op_name ? totals[i].op_name : "UNKNOWN",
-                totals[i].runs,
-                totals[i].total_us,
-                (double)totals[i].total_us / totals[i].runs);
+            // Main op row
+            LLAMA_LOG_TSAVORITE("  %-16s: %7ld %14ld %16.2f\n",
+                totals[i].op_name ? totals[i].op_name : "UNKNOWN",
+                totals[i].runs,
+                totals[i].total_us,
+                (double)totals[i].total_us / totals[i].runs);
+
+            // Backend subtotals
+            for (int b = 0; b < GGML_COMPUTE_BACKEND_COUNT; ++b) {
+                if (totals[i].backend_subtotals[b].runs > 0) {
+                    LLAMA_LOG_TSAVORITE("    [%-10s]  : %7ld %14ld %16.2f\n",
+                        ggml_backend_type((enum ggml_compute_backend_type) b),
+                        totals[i].backend_subtotals[b].runs,
+                        totals[i].backend_subtotals[b].total_us,
+                        (double)totals[i].backend_subtotals[b].total_us / totals[i].backend_subtotals[b].runs);
+                }
+            }
+
+            // Unary sub-op breakdown
+            if (i == GGML_OP_UNARY) {
+                for (int j = 0; j < GGML_UNARY_OP_COUNT; ++j) {
+                    if (totals[i].unary_subtotals[j].runs > 0) {
+                        LLAMA_LOG_TSAVORITE("    -> %-12s: %7ld %14ld %16.2f\n",
+                            ggml_unary_op_name((enum ggml_unary_op) j),
+                            totals[i].unary_subtotals[j].runs,
+                            totals[i].unary_subtotals[j].total_us,
+                            (double)totals[i].unary_subtotals[j].total_us / totals[i].unary_subtotals[j].runs);
+                    }
+                }
+            }
         }
     }
 }
+#endif /* GGML_PERF */

 void llama_perf_context_print(const llama_context * ctx) {
     const auto data = llama_perf_context(ctx);