diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
index a3cc538794f9f..ae36728a8d3ce 100644
--- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
+++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
@@ -522,7 +522,7 @@ static struct ggml_backend_tsavorite_context *ggml_tsavorite_init(ggml_backend_d
     if (tsi_log_setup() == false)
         return NULL;
 
-    std::string mainProfilerName = "GGML Tsavorite ";
+    std::string mainProfilerName = "OPU ";
     tsirt::utils::TSIProfiler::initialize();
     tsirt::utils::TSIScopedProfiler mainProfiler(mainProfilerName);
@@ -630,7 +630,7 @@ static void ggml_tsavorite_free(struct ggml_backend_tsavorite_context *ctx) {
     tsi_finalize();
     GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
     tsirt::utils::TSIProfiler::finalize();
-    std::cout << "\nGGML Tsavorite Profiling Results:" << std::endl;
+    std::cout << "\nOPU Profiling Results:" << std::endl;
     std::cout << tsirt::utils::TSIProfiler::getFormattedResults(
                      /*truncateFuncNames*/ true)
               << std::endl;
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index ac670388a6f54..39bb43ae2c744 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -988,7 +988,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
 
 #ifdef GGML_PERF
 static const char * GGML_BACKEND_TYPE[GGML_COMPUTE_BACKEND_COUNT] = {
     "CPU",
-    "TSAVORITE"
+    "OPU"
 };
 #endif /* GGML_PERF */
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 6cc08b71a9244..b5daa2179ca62 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2620,25 +2620,51 @@ llama_perf_context_data llama_perf_context(const llama_context * ctx) {
     return data;
 }
 
+
 #ifdef GGML_PERF
 void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
     LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
-    LLAMA_LOG_TSAVORITE(" %-16s: %7s %14s %16s\n", "Op", "Runs", "Total us", "Avg us");
+    LLAMA_LOG_TSAVORITE(" %-16s %7s %14s %16s\n", "Op", "Runs", "Total us", "Avg us");
 
     for (int i = 0; i < GGML_OP_COUNT; ++i) {
         if (totals[i].runs > 0) {
-            // Main op row
-            LLAMA_LOG_TSAVORITE(" %-16s: %7ld %14ld %16.2f\n",
+            LLAMA_LOG_TSAVORITE(" %-16s %7ld %14ld %16.2f\n",
                 totals[i].op_name ? totals[i].op_name : "UNKNOWN",
                 totals[i].runs,
                 totals[i].total_us,
                 (double)totals[i].total_us / totals[i].runs);
+        }
+
+        // Unary sub-op breakdown
+        if (i == GGML_OP_UNARY) {
+            for (int j = 0; j < GGML_UNARY_OP_COUNT; ++j) {
+                if (totals[i].unary_subtotals[j].runs > 0) {
+                    LLAMA_LOG_TSAVORITE(" -> %-11s %7ld %14ld %16.2f\n",
+                        ggml_unary_op_name((enum ggml_unary_op) j),
+                        totals[i].unary_subtotals[j].runs,
+                        totals[i].unary_subtotals[j].total_us,
+                        (double)totals[i].unary_subtotals[j].total_us / totals[i].unary_subtotals[j].runs);
+                }
+            }
+        }
+    }
+}
+#else
+void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
+    LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
+    LLAMA_LOG_TSAVORITE(" %-16s %-8s %7s %14s %16s\n", "Op", "Target", "Runs", "Total us", "Avg us");
 
-            // Backend subtotals
+    for (int i = 0; i < GGML_OP_COUNT; ++i) {
+        if (totals[i].runs > 0) {
             for (int b = 0; b < GGML_COMPUTE_BACKEND_COUNT; ++b) {
                 if (totals[i].backend_subtotals[b].runs > 0) {
-                    LLAMA_LOG_TSAVORITE(" [%-10s] : %7ld %14ld %16.2f\n",
-                        ggml_backend_type((enum ggml_compute_backend_type) b),
+                    const char *backend_name = ggml_backend_type((enum ggml_compute_backend_type) b);
+                    char padded_backend[7] = {0}; // 6 chars + null terminator
+                    snprintf(padded_backend, sizeof(padded_backend), "%-6s", backend_name);
+
+                    LLAMA_LOG_TSAVORITE(" %-16s %-8s %7ld %14ld %16.2f\n",
+                        totals[i].op_name ? totals[i].op_name : "UNKNOWN",
+                        padded_backend,
                         totals[i].backend_subtotals[b].runs,
                         totals[i].backend_subtotals[b].total_us,
                         (double)totals[i].backend_subtotals[b].total_us / totals[i].backend_subtotals[b].runs);
@@ -2649,8 +2675,21 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
         if (i == GGML_OP_UNARY) {
             for (int j = 0; j < GGML_UNARY_OP_COUNT; ++j) {
                 if (totals[i].unary_subtotals[j].runs > 0) {
-                    LLAMA_LOG_TSAVORITE(" -> %-12s: %7ld %14ld %16.2f\n",
+                    // Find backend for unary op (assumes same as parent op)
+                    const char *backend_name = NULL;
+                    for (int b = 0; b < GGML_COMPUTE_BACKEND_COUNT; ++b) {
+                        if (totals[i].backend_subtotals[b].runs > 0) {
+                            backend_name = ggml_backend_type((enum ggml_compute_backend_type) b);
+                            break;
+                        }
+                    }
+
+                    char padded_backend[7] = {0};
+                    snprintf(padded_backend, sizeof(padded_backend), "%-6s", backend_name ? backend_name : "UNK");
+
+                    LLAMA_LOG_TSAVORITE(" -> %-11s %-8s %7ld %14ld %16.2f\n",
                         ggml_unary_op_name((enum ggml_unary_op) j),
+                        padded_backend,
                         totals[i].unary_subtotals[j].runs,
                         totals[i].unary_subtotals[j].total_us,
                         (double)totals[i].unary_subtotals[j].total_us / totals[i].unary_subtotals[j].runs);
@@ -2662,6 +2701,7 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
 }
 #endif /* GGML_PERF */
 
+
 void llama_perf_context_print(const llama_context * ctx) {
     const auto data = llama_perf_context(ctx);
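
A note on the fixed-width "Target" column introduced in src/llama-context.cpp above: snprintf with a "%-6s" format left-justifies the backend name into a six-character field, and the seven-byte buffer (six characters plus the null terminator) truncates anything longer, so the rows stay aligned regardless of what ggml_backend_type() returns. A minimal standalone sketch of that padding-and-truncation behaviour (illustrative names only, not code from this patch):

#include <cstdio>

int main() {
    // "SOMELONGNAME" stands in for a backend name wider than the column.
    const char *names[] = { "CPU", "OPU", "SOMELONGNAME" };
    for (const char *name : names) {
        char padded[7] = {0};  // 6 chars + null terminator, as in the patch
        std::snprintf(padded, sizeof(padded), "%-6s", name);
        // Prints "[CPU   ]", "[OPU   ]", "[SOMELO]": short names are padded,
        // long names are truncated to the 6-character field.
        std::printf("[%s]\n", padded);
    }
    return 0;
}
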
diff --git a/tools/main/main.cpp b/tools/main/main.cpp
index d736ef9e515c6..eb0b49bf9d00d 100644
--- a/tools/main/main.cpp
+++ b/tools/main/main.cpp
@@ -971,16 +971,21 @@ int main(int argc, char ** argv) {
         LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
         llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }
+    // Ensure the prompt response output is fully emitted before any profiling data
+    // is printed. Freeing the threadpools first guarantees that all worker threads
+    // have finished processing and writing output, so the profiling stats cannot
+    // interleave with the response; fflush(stdout) then flushes anything still buffered.
+
+    ggml_threadpool_free_fn(threadpool);
+    ggml_threadpool_free_fn(threadpool_batch);
 
     LOG("\n\n");
+    fflush(stdout);
     common_perf_print(ctx, smpl);
 
     common_sampler_free(smpl);
 
     llama_backend_free();
 
-    ggml_threadpool_free_fn(threadpool);
-    ggml_threadpool_free_fn(threadpool_batch);
-
     return 0;
 }
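
The reordering in tools/main/main.cpp above rests on one rule: every producer of stdout output must be stopped, and the stream flushed, before the profiling summary is printed. A minimal standalone sketch of that shutdown pattern, with plain std::thread workers standing in for the llama.cpp threadpools (illustrative code, not the patch itself):

#include <cstdio>
#include <thread>
#include <vector>

int main() {
    // Workers stand in for the threadpools that may still be emitting tokens.
    std::vector<std::thread> workers;
    for (int i = 0; i < 4; ++i) {
        workers.emplace_back([i] { std::printf("token%d ", i); });
    }

    // 1) Stop the producers first (the patch frees the threadpools here),
    //    so nothing can write to stdout past this point.
    for (auto &w : workers) {
        w.join();
    }

    // 2) Flush anything the producers left buffered.
    std::fflush(stdout);

    // 3) Only now print the summary; it can no longer interleave with
    //    the generated output.
    std::printf("\n=== Perf Summary ===\n");
    return 0;
}
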