Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -612,9 +612,9 @@ extern "C" {

void * extra; // extra things e.g. for ggml-cuda.cu
#ifdef GGML_PERF
int64_t perf_runs;
int64_t perf_runs;
int64_t perf_time_us;
enum ggml_compute_backend_type ggml_compute_backend;
enum ggml_compute_backend_type ggml_compute_backend;
char padding[4];
#else
char padding[8];
Expand Down Expand Up @@ -2213,19 +2213,34 @@ extern "C" {
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);

#ifdef GGML_PERF
struct ggml_perf_backend_subtotals {
int64_t total_us;
int64_t runs;
};

struct ggml_perf_unary_subtotals {
int64_t total_us;
int64_t runs;
};
// internal perf accumulation struct
struct ggml_perf_totals {
int op_count;
int64_t total_us;
int64_t runs;
const char * op_name;

// Only used for UNARY
struct ggml_perf_unary_subtotals unary_subtotals[GGML_UNARY_OP_COUNT];
// run count per backend
struct ggml_perf_backend_subtotals backend_subtotals[GGML_COMPUTE_BACKEND_COUNT];
};

FILE * ggml_perf_log_open(const char *filename);
void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp);

// capture perf into totals
void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph);
const char * ggml_backend_type(enum ggml_compute_backend_type backend);

#endif /* GGML_PERF */

Expand Down
42 changes: 34 additions & 8 deletions ggml/src/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -1208,7 +1208,7 @@ const char * ggml_op_name(enum ggml_op op) {
}

#ifdef GGML_PERF
static const char * ggml_backend_type(enum ggml_compute_backend_type backend) {
const char * ggml_backend_type(enum ggml_compute_backend_type backend) {
return GGML_BACKEND_TYPE[backend];
}
#endif /* GGML_PERF */
Expand Down Expand Up @@ -6580,6 +6580,19 @@ void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct
totals[op].total_us += node->perf_time_us;
totals[op].runs += node->perf_runs;
totals[op].op_count++;

// Count backend runs
enum ggml_compute_backend_type be = node->ggml_compute_backend;
if (be >= GGML_COMPUTE_BACKEND_CPU && be < GGML_COMPUTE_BACKEND_COUNT) {
totals[op].backend_subtotals[be].total_us += node->perf_time_us;
totals[op].backend_subtotals[be].runs += node->perf_runs;
}

if (op == GGML_OP_UNARY) {
enum ggml_unary_op subop = ggml_get_unary_op(node);
totals[op].unary_subtotals[subop].total_us += node->perf_time_us;
totals[op].unary_subtotals[subop].runs += node->perf_runs;
}
}
}

Expand Down Expand Up @@ -6607,7 +6620,10 @@ void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp) {
}
}

fprintf(fp, "ggml_graph_compute_perf: total compute time: %.3f ms\n", total_time_us / 1000.0);
fprintf(fp, "\n=== GGML Detailed Op Perf (%.3f ms total) ===\n", total_time_us / 1000.0);
fprintf(fp,
"%-12s %-20s %10s %12s %10s %10s %10s %10s %10s\n",
"Backend", "Op", "Runs", "Total ms", "Avg ms", "ne[0]", "ne[1]", "ne[2]", "ne[3]");

for (int i = 0; i < cgraph->n_nodes; ++i) {
struct ggml_tensor * node = cgraph->nodes[i];
Expand All @@ -6616,17 +6632,27 @@ void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp) {
double t_ms = node->perf_time_us / 1000.0;
double avg_ms = t_ms / node->perf_runs;

const char * op_name = ggml_op_name(node->op);
char full_op[64];
if (node->op == GGML_OP_UNARY) {
enum ggml_unary_op subop = ggml_get_unary_op(node);
snprintf(full_op, sizeof(full_op), "UNARY(%s)", ggml_unary_op_name(subop));
op_name = full_op;
}

fprintf(fp,
" - BACKEND:%s OP:%s: total %.3f ms over %d runs (avg %.3f ms) [shape=%d,%d,%d]\n",
"%-12s %-20s %10d %12.3f %10.3f %10ld %10ld %10ld %10ld\n",
ggml_backend_type(node->ggml_compute_backend),
ggml_op_name(node->op),
t_ms,
op_name,
node->perf_runs,
t_ms,
avg_ms,
node->ne[0], node->ne[1], node->ne[2]);
node->ne[0],
node->ne[1],
node->ne[2],
node->ne[3]);
}

fprintf(fp, "--------------------------------------------------\n\n");
fprintf(fp, "--------------------------------------------------------------------------------------------------------\n\n");
}

#endif /* GGML_PERF */
39 changes: 34 additions & 5 deletions src/llama-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2620,18 +2620,47 @@ llama_perf_context_data llama_perf_context(const llama_context * ctx) {
return data;
}

#ifdef GGML_PERF
void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
LLAMA_LOG_TSAVORITE(" %-16s: %7s %14s %16s\n", "Op", "Runs", "Total us", "Avg us");

for (int i = 0; i < GGML_OP_COUNT; ++i) {
if (totals[i].runs > 0) {
LLAMA_LOG_TSAVORITE(" %-16s: %5ld runs, %8ld us total, avg %.2f us\n",
totals[i].op_name ? totals[i].op_name : "UNKNOWN",
totals[i].runs,
totals[i].total_us,
(double)totals[i].total_us / totals[i].runs);
// Main op row
LLAMA_LOG_TSAVORITE(" %-16s: %7ld %14ld %16.2f\n",
totals[i].op_name ? totals[i].op_name : "UNKNOWN",
totals[i].runs,
totals[i].total_us,
(double)totals[i].total_us / totals[i].runs);

// Backend subtotals
for (int b = 0; b < GGML_COMPUTE_BACKEND_COUNT; ++b) {
if (totals[i].backend_subtotals[b].runs > 0) {
LLAMA_LOG_TSAVORITE(" [%-10s] : %7ld %14ld %16.2f\n",
ggml_backend_type((enum ggml_compute_backend_type) b),
totals[i].backend_subtotals[b].runs,
totals[i].backend_subtotals[b].total_us,
(double)totals[i].backend_subtotals[b].total_us / totals[i].backend_subtotals[b].runs);
}
}

// Unary sub-op breakdown
if (i == GGML_OP_UNARY) {
for (int j = 0; j < GGML_UNARY_OP_COUNT; ++j) {
if (totals[i].unary_subtotals[j].runs > 0) {
LLAMA_LOG_TSAVORITE(" -> %-12s: %7ld %14ld %16.2f\n",
ggml_unary_op_name((enum ggml_unary_op) j),
totals[i].unary_subtotals[j].runs,
totals[i].unary_subtotals[j].total_us,
(double)totals[i].unary_subtotals[j].total_us / totals[i].unary_subtotals[j].runs);
}
}
}
}
}
}
#endif /* GGML_PERF */

void llama_perf_context_print(const llama_context * ctx) {
const auto data = llama_perf_context(ctx);
Expand Down