4 changes: 2 additions & 2 deletions ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
@@ -522,7 +522,7 @@ static struct ggml_backend_tsavorite_context *ggml_tsavorite_init(ggml_backend_d
if (tsi_log_setup() == false)
return NULL;

std::string mainProfilerName = "GGML Tsavorite ";
std::string mainProfilerName = "OPU ";
tsirt::utils::TSIProfiler::initialize();
tsirt::utils::TSIScopedProfiler mainProfiler(mainProfilerName);

@@ -630,7 +630,7 @@ static void ggml_tsavorite_free(struct ggml_backend_tsavorite_context *ctx) {
tsi_finalize();
GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
tsirt::utils::TSIProfiler::finalize();
std::cout << "\nGGML Tsavorite Profiling Results:" << std::endl;
std::cout << "\nOPU Profiling Results:" << std::endl;
std::cout << tsirt::utils::TSIProfiler::getFormattedResults(
/*truncateFuncNames*/ true)
<< std::endl;
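The hunks above only rename the profiler scope and its report header; the tsirt::utils::TSIProfiler / TSIScopedProfiler internals are not part of this diff. For reference, here is a minimal, self-contained sketch of the same RAII scoped-profiler pattern; ScopedTimer, g_total_us and report() are hypothetical stand-ins, not the real TSI API.

// Minimal stand-in for the scoped-profiler pattern used above.
// ScopedTimer, g_total_us and report() are illustrative names only.
#include <chrono>
#include <cstdio>
#include <map>
#include <string>

static std::map<std::string, long long> g_total_us; // accumulated microseconds per scope name

struct ScopedTimer {
    std::string name;
    std::chrono::steady_clock::time_point start;
    explicit ScopedTimer(std::string n) : name(std::move(n)), start(std::chrono::steady_clock::now()) {}
    ~ScopedTimer() {
        auto end = std::chrono::steady_clock::now();
        g_total_us[name] += std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
    }
};

static void report() {
    std::printf("\nOPU Profiling Results:\n");
    for (const auto & kv : g_total_us) {
        std::printf("  %-24s %10lld us\n", kv.first.c_str(), kv.second);
    }
}

int main() {
    {
        ScopedTimer t("OPU "); // analogous to mainProfiler in ggml_tsavorite_init
        // ... work measured while 't' is in scope ...
    }
    report(); // analogous to printing TSIProfiler::getFormattedResults() in ggml_tsavorite_free
    return 0;
}

The destructor fires when the scope closes, which is why the diff only needs to construct mainProfiler in ggml_tsavorite_init and print the accumulated results in ggml_tsavorite_free.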
2 changes: 1 addition & 1 deletion ggml/src/ggml.c
@@ -988,7 +988,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
#ifdef GGML_PERF
static const char * GGML_BACKEND_TYPE[GGML_COMPUTE_BACKEND_COUNT] = {
"CPU",
"TSAVORITE"
"OPU"
};
#endif /* GGML_PERF */

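For context, the renamed table above is indexed by the compute-backend enum, and the ggml_backend_type() helper used later in this PR presumably returns the matching string. A hedged sketch of that lookup, with illustrative enum member names rather than the real ggml definitions:

#include <cstdio>

// Illustrative enum; the real ggml_compute_backend_type member names are not shown in this diff.
enum compute_backend_type {
    COMPUTE_BACKEND_CPU = 0,
    COMPUTE_BACKEND_OPU,
    COMPUTE_BACKEND_COUNT, // stand-in for GGML_COMPUTE_BACKEND_COUNT
};

static const char * BACKEND_TYPE[COMPUTE_BACKEND_COUNT] = {
    "CPU",
    "OPU",
};

static const char * backend_name(enum compute_backend_type b) {
    return (b >= 0 && b < COMPUTE_BACKEND_COUNT) ? BACKEND_TYPE[b] : "UNKNOWN";
}

int main() {
    std::printf("%s\n", backend_name(COMPUTE_BACKEND_OPU)); // prints "OPU"
    return 0;
}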
54 changes: 47 additions & 7 deletions src/llama-context.cpp
@@ -2620,25 +2620,51 @@ llama_perf_context_data llama_perf_context(const llama_context * ctx) {
return data;
}


#ifdef GGML_PERF
void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
LLAMA_LOG_TSAVORITE(" %-16s: %7s %14s %16s\n", "Op", "Runs", "Total us", "Avg us");
LLAMA_LOG_TSAVORITE(" %-16s %7s %14s %16s\n", "Op", "Runs", "Total us", "Avg us");

for (int i = 0; i < GGML_OP_COUNT; ++i) {
if (totals[i].runs > 0) {
// Main op row
LLAMA_LOG_TSAVORITE(" %-16s: %7ld %14ld %16.2f\n",
LLAMA_LOG_TSAVORITE(" %-16s %7ld %14ld %16.2f\n",
totals[i].op_name ? totals[i].op_name : "UNKNOWN",
totals[i].runs,
totals[i].total_us,
(double)totals[i].total_us / totals[i].runs);
}

// Unary sub-op breakdown
if (i == GGML_OP_UNARY) {
for (int j = 0; j < GGML_UNARY_OP_COUNT; ++j) {
if (totals[i].unary_subtotals[j].runs > 0) {
LLAMA_LOG_TSAVORITE(" -> %-11s %7ld %14ld %16.2f\n",
ggml_unary_op_name((enum ggml_unary_op) j),
totals[i].unary_subtotals[j].runs,
totals[i].unary_subtotals[j].total_us,
(double)totals[i].unary_subtotals[j].total_us / totals[i].unary_subtotals[j].runs);
}
}
}
}
}
#else
void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
LLAMA_LOG_TSAVORITE(" %-16s %-8s %7s %14s %16s\n", "Op", "Target", "Runs", "Total us", "Avg us");

// Backend subtotals
for (int i = 0; i < GGML_OP_COUNT; ++i) {
if (totals[i].runs > 0) {
for (int b = 0; b < GGML_COMPUTE_BACKEND_COUNT; ++b) {
if (totals[i].backend_subtotals[b].runs > 0) {
LLAMA_LOG_TSAVORITE(" [%-10s] : %7ld %14ld %16.2f\n",
ggml_backend_type((enum ggml_compute_backend_type) b),
const char *backend_name = ggml_backend_type((enum ggml_compute_backend_type) b);
char padded_backend[7] = {0}; // 6 chars + null terminator
snprintf(padded_backend, sizeof(padded_backend), "%-6s", backend_name);

LLAMA_LOG_TSAVORITE(" %-16s %-8s %7ld %14ld %16.2f\n",
totals[i].op_name ? totals[i].op_name : "UNKNOWN",
padded_backend,
totals[i].backend_subtotals[b].runs,
totals[i].backend_subtotals[b].total_us,
(double)totals[i].backend_subtotals[b].total_us / totals[i].backend_subtotals[b].runs);
@@ -2649,8 +2675,21 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
if (i == GGML_OP_UNARY) {
for (int j = 0; j < GGML_UNARY_OP_COUNT; ++j) {
if (totals[i].unary_subtotals[j].runs > 0) {
LLAMA_LOG_TSAVORITE(" -> %-12s: %7ld %14ld %16.2f\n",
// Find backend for unary op (assumes same as parent op)
const char *backend_name = NULL;
for (int b = 0; b < GGML_COMPUTE_BACKEND_COUNT; ++b) {
if (totals[i].backend_subtotals[b].runs > 0) {
backend_name = ggml_backend_type((enum ggml_compute_backend_type) b);
break;
}
}

char padded_backend[7] = {0};
snprintf(padded_backend, sizeof(padded_backend), "%-6s", backend_name ? backend_name : "UNK");

LLAMA_LOG_TSAVORITE(" -> %-11s %-8s %7ld %14ld %16.2f\n",
ggml_unary_op_name((enum ggml_unary_op) j),
padded_backend,
totals[i].unary_subtotals[j].runs,
totals[i].unary_subtotals[j].total_us,
(double)totals[i].unary_subtotals[j].total_us / totals[i].unary_subtotals[j].runs);
@@ -2662,6 +2701,7 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
}
#endif /* GGML_PERF */


void llama_perf_context_print(const llama_context * ctx) {
const auto data = llama_perf_context(ctx);

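The new table layout is easier to see outside the diff. Below is a self-contained sketch of the row formatting introduced above, using a simplified stand-in struct rather than the real ggml_perf_totals (only the fields the diff actually touches: op_name, runs, total_us, plus a backend string); the sample values in main() are arbitrary placeholders.

// Sketch of the fixed-width perf rows: a backend column padded via snprintf
// and a per-run average computed as total_us / runs, as in the hunks above.
#include <cstdio>

struct perf_row {
    const char * op_name;
    const char * backend;  // e.g. "CPU" or "OPU"
    long         runs;
    long         total_us;
};

static void print_rows(const perf_row * rows, int n) {
    std::printf(" %-16s %-8s %7s %14s %16s\n", "Op", "Target", "Runs", "Total us", "Avg us");
    for (int i = 0; i < n; ++i) {
        if (rows[i].runs <= 0) continue;              // skip ops that never ran
        char padded_backend[7] = {0};                 // 6 chars + null terminator
        std::snprintf(padded_backend, sizeof(padded_backend), "%-6s", rows[i].backend);
        std::printf(" %-16s %-8s %7ld %14ld %16.2f\n",
                    rows[i].op_name ? rows[i].op_name : "UNKNOWN",
                    padded_backend,
                    rows[i].runs,
                    rows[i].total_us,
                    (double) rows[i].total_us / rows[i].runs);
    }
}

int main() {
    const perf_row rows[] = {
        { "MUL_MAT", "OPU", 12, 48000 },  // placeholder values, not measured results
        { "ADD",     "CPU", 30,  1500 },
    };
    print_rows(rows, 2);
    return 0;
}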
11 changes: 8 additions & 3 deletions tools/main/main.cpp
@@ -971,16 +971,21 @@ int main(int argc, char ** argv) {
LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
}
// Ensure all prompt response output is fully completed before printing profiling data.
// Free threadpools first to guarantee all worker threads have finished processing and output.
// This prevents interleaving of prompt response and profiling output, ensuring correct order.

Review comment on the lines above: spacing is not aligned
// fflush(stdout) is called to flush any remaining output before printing profiling stats.

ggml_threadpool_free_fn(threadpool);
ggml_threadpool_free_fn(threadpool_batch);

LOG("\n\n");
fflush(stdout);
common_perf_print(ctx, smpl);

common_sampler_free(smpl);

llama_backend_free();

ggml_threadpool_free_fn(threadpool);
ggml_threadpool_free_fn(threadpool_batch);

return 0;
}
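The comments in the hunk above explain the reordering: worker threads are torn down before the profiling summary is printed so generation output and perf output cannot interleave. A minimal sketch of that teardown ordering, using std::thread as a stand-in for the ggml thread pools and a plain fprintf for common_perf_print:

// Stop workers first, flush buffered generation output, then print the summary.
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    std::vector<std::thread> workers;
    workers.emplace_back([] { std::printf("generated token stream ...\n"); });

    for (auto & w : workers) w.join();   // analogous to ggml_threadpool_free_fn(...)
    std::fflush(stdout);                 // ensure generation output is fully written
    std::fprintf(stderr, "=== GGML Perf Summary ===\n");  // analogous to common_perf_print(ctx, smpl)
    return 0;
}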