From a6918d847dd867fb88760489107d4494b9ccd506 Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Thu, 26 Jun 2025 22:43:37 -0700 Subject: [PATCH 1/5] @FIR-770 - LLama.cpp: Adding Perf Status for all GGML Operation --- ggml/include/ggml.h | 35 +++++++- ggml/src/ggml-cpu/ggml-cpu.c | 9 ++ ggml/src/ggml-tsavorite/ggml-tsavorite.cpp | 9 ++ ggml/src/ggml.c | 96 +++++++++++++++++++++- src/llama-context.cpp | 13 +++ src/llama-context.h | 4 + tools/main/main.cpp | 2 +- 7 files changed, 165 insertions(+), 3 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index e6830b63ba8e1..0b354acc89f23 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -317,6 +317,14 @@ extern "C" { GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4) GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...); +#ifdef GGML_PERF + enum ggml_compute_backend_type { + GGML_COMPUTE_BACKEND_CPU=0, + GGML_COMPUTE_BACKEND_TSAVORITE, + GGML_COMPUTE_BACKEND_COUNT + }; +#endif /* GGML_PERF */ + enum ggml_status { GGML_STATUS_ALLOC_FAILED = -2, GGML_STATUS_FAILED = -1, @@ -603,8 +611,14 @@ extern "C" { char name[GGML_MAX_NAME]; void * extra; // extra things e.g. for ggml-cuda.cu - +#ifdef GGML_PERF + int64_t perf_runs; + int64_t perf_time_us; + enum ggml_compute_backend_type ggml_compute_backend; + char padding[8+12]; +#else char padding[8]; +#endif /* GGML_PERF */ }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -2197,6 +2211,25 @@ extern "C" { GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); + +#ifdef GGML_PERF +// internal perf accumulation struct +struct ggml_perf_totals { + int op_count; + int64_t total_us; + int64_t runs; + const char * op_name; +}; + +FILE * ggml_perf_log_open(const char *filename); +void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp); + +// capture perf into totals +void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph); + +// print final stats +void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]); +#endif /* GGML_PERF */ #ifdef __cplusplus } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 46f75ad97cd61..4354106692b48 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2844,8 +2844,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { struct ggml_tensor * node = cgraph->nodes[node_n]; +#ifdef GGML_PERF + int64_t t_start = ggml_time_us(); +#endif ggml_compute_forward(¶ms, node); +#ifdef GGML_PERF + int64_t t_end = ggml_time_us(); + node->perf_runs++; + node->perf_time_us += (t_end - t_start); +#endif + if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed); diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp index c49d02375921f..d2cfa93b8bc97 100644 --- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp +++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp @@ -813,6 +813,9 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend, tensor_log 
log_data; for (int i = 0; i < cgraph->n_nodes; i++) { +#ifdef GGML_PERF + int64_t t_start = ggml_time_us(); +#endif node = cgraph->nodes[i]; src0 = node->src[0]; src1 = node->src[1]; @@ -1122,6 +1125,12 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend, device->stats.op_run_count[kernel_type].max_num_of_elem < max_num_of_elem) device->stats.op_run_count[kernel_type].max_num_of_elem = max_num_of_elem; } +#ifdef GGML_PERF + int64_t t_end = ggml_time_us(); + node->perf_runs++; + node->perf_time_us += (t_end - t_start); + node->ggml_compute_backend = GGML_COMPUTE_BACKEND_TSAVORITE; +#endif } // This this need to implement correctly when we have mixture of CPU and accelerator operation diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 134b7420de746..839446d7a17d9 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -249,7 +249,7 @@ static void ggml_log_internal_v(enum ggml_log_level level, const char * format, void ggml_log_internal(enum ggml_log_level level, const char * format, ...) { va_list args; va_start(args, format); - if (level == GGML_LOG_LEVEL_TSAVORITE) + //if (level == GGML_LOG_LEVEL_TSAVORITE) ggml_log_internal_v(level, format, args); va_end(args); } @@ -985,6 +985,13 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "OPT_STEP_ADAMW", }; +#ifdef GGML_PERF +static const char * GGML_BACKEND_TYPE[GGML_COMPUTE_BACKEND_COUNT] = { + "CPU", + "TSAVORITE" +}; +#endif /* GGML_PERF */ + static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { @@ -1200,6 +1207,12 @@ const char * ggml_op_name(enum ggml_op op) { return GGML_OP_NAME[op]; } +#ifdef GGML_PERF +static const char * ggml_backend_type(enum ggml_compute_backend_type backend) { + return GGML_BACKEND_TYPE[backend]; +} +#endif /* GGML_PERF */ + const char * ggml_op_symbol(enum ggml_op op) { return GGML_OP_SYMBOL[op]; } @@ -1617,6 +1630,11 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data, /*.name =*/ { 0 }, /*.extra =*/ NULL, +#ifdef GGML_PERF + /*.perf_runs =*/ 0, + /*.perf_time_us =*/ 0, + /*.ggml_compute_backend =*/ GGML_COMPUTE_BACKEND_CPU, +#endif /* GGML_PERF */ /*.padding =*/ { 0 }, }; @@ -6549,3 +6567,79 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons if (p0->strict_cpu != p1->strict_cpu ) return false; return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; } + +#ifdef GGML_PERF +void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph) { + for (int i = 0; i < cgraph->n_nodes; ++i) { + struct ggml_tensor * node = cgraph->nodes[i]; + enum ggml_op op = node->op; + + if (op >= GGML_OP_COUNT) continue; + + totals[op].op_name = ggml_op_name(op); + totals[op].total_us += node->perf_time_us; + totals[op].runs += node->perf_runs; + totals[op].op_count++; + } +} + +void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) { + printf("\n=== GGML Perf Summary ===\n"); + for (int i = 0; i < GGML_OP_COUNT; ++i) { + if (totals[i].runs > 0) { + printf(" %-16s: %5ld runs, %8ld us total, avg %.2f us\n", + totals[i].op_name ? 
totals[i].op_name : "UNKNOWN", + totals[i].runs, + totals[i].total_us, + (double)totals[i].total_us / totals[i].runs); + } + } +} + +FILE * ggml_perf_log_open(const char *filename) { + // Try to delete existing file, ignore error if it doesn't exist + remove(filename); + + // Create a new file in write mode + FILE *fp = fopen(filename, "w"); + if (!fp) { + fprintf(stderr, "Error: Could not create file %s\n", filename); + return NULL; + } + + return fp; +} + +void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp) { + if (!fp) return; + + int64_t total_time_us = 0; + for (int i = 0; i < cgraph->n_nodes; ++i) { + if (cgraph->nodes[i]->perf_runs > 0) { + total_time_us += cgraph->nodes[i]->perf_time_us; + } + } + + fprintf(fp, "ggml_graph_compute_perf: total compute time: %.3f ms\n", total_time_us / 1000.0); + + for (int i = 0; i < cgraph->n_nodes; ++i) { + struct ggml_tensor * node = cgraph->nodes[i]; + if (node->perf_runs == 0) continue; + + double t_ms = node->perf_time_us / 1000.0; + double avg_ms = t_ms / node->perf_runs; + + fprintf(fp, + " - BACKEND:%s OP:%s: total %.3f ms over %d runs (avg %.3f ms) [shape=%d,%d,%d]\n", + ggml_backend_type(node->ggml_compute_backend), + ggml_op_name(node->op), + t_ms, + node->perf_runs, + avg_ms, + node->ne[0], node->ne[1], node->ne[2]); + } + + fprintf(fp, "--------------------------------------------------\n\n"); +} + +#endif /* GGML_PERF */ diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 984dbf14d14ae..3c4b86bc08e1d 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -932,6 +932,9 @@ int llama_context::decode(llama_batch & inp_batch) { kv_self_update(); int64_t n_outputs_prev = 0; +#ifdef GGML_PERF + FILE *perf_fp = ggml_perf_log_open("ggml_perf.log"); +#endif /* GGML_PERF */ while (sbatch.n_tokens > 0) { llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled); @@ -971,6 +974,12 @@ int llama_context::decode(llama_batch & inp_batch) { res->set_inputs(&ubatch); const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); +#ifdef GGML_PERF + if (perf_fp) { + ggml_perf_write_detailed_csv(gf, perf_fp); + } + ggml_perf_accumulate(perf_totals, gf); +#endif /* GGML_PERF */ if (compute_status != GGML_STATUS_SUCCESS) { switch (compute_status) { case GGML_STATUS_ABORTED: @@ -1139,6 +1148,10 @@ int llama_context::decode(llama_batch & inp_batch) { // overlap with device computation. 
ggml_backend_sched_reset(sched.get()); +#ifdef GGML_PERF + ggml_perf_print_totals(perf_totals); +#endif /* GGML_PERF */ + return 0; } diff --git a/src/llama-context.h b/src/llama-context.h index c0ceacb10ce6f..4ff8fe2cfeaff 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -8,6 +8,7 @@ #include "ggml-cpp.h" #include "ggml-opt.h" +#include "ggml.h" #include #include @@ -273,4 +274,7 @@ struct llama_context { mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) mutable int32_t n_eval = 0; // number of eval calls +#ifdef GGML_PERF + struct ggml_perf_totals perf_totals[GGML_OP_COUNT] = {}; // add this to llama_context +#endif }; diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 26842116ec6df..d5a5797f1c9a4 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -126,7 +126,7 @@ int main(int argc, char ** argv) { LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - llama_log_set(my_logger, nullptr); + //llama_log_set(my_logger, nullptr); LOG_INF("%s: llama backend init\n", __func__); llama_backend_init(); From c7e934d6273f67b354b5f82a8ccc942006fa578b Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Thu, 26 Jun 2025 22:55:38 -0700 Subject: [PATCH 2/5] Updated the README --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index f222c9a1a8ae1..97be1e621c20d 100644 --- a/README.md +++ b/README.md @@ -619,6 +619,9 @@ cd ../../ #Compile for posix with build-posix as a target folder cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix +to enable STatus use below command +cmake -B build-posix -DCMAKE_BUILD_TYPE=Debug -DGGML_TSAVORITE=ON -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF" + cmake --build build-posix --config Release #Compile for fpga with build-fpga as a target folder From d1bb3f6ba3344560377e9bf999e27c87e741c74a Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Fri, 27 Jun 2025 13:46:32 -0700 Subject: [PATCH 3/5] Addressed Ashish's PR comments --- ggml/include/ggml.h | 2 +- ggml/src/ggml-cpu/ggml-cpu.c | 5 +++++ ggml/src/ggml-tsavorite/ggml-tsavorite.cpp | 7 ++++++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 0b354acc89f23..64dc3bc984dde 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -615,7 +615,7 @@ extern "C" { int64_t perf_runs; int64_t perf_time_us; enum ggml_compute_backend_type ggml_compute_backend; - char padding[8+12]; + char padding[4]; #else char padding[8]; #endif /* GGML_PERF */ diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 4354106692b48..8d5119474d996 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2852,7 +2852,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { #ifdef GGML_PERF int64_t t_end = ggml_time_us(); node->perf_runs++; + if (t_end >= t_start) { node->perf_time_us += (t_end - t_start); + } else { + // Handle wraparound by assuming timer rolls over at max int64_t value + node->perf_time_us += (INT64_MAX - t_start + t_end + 1); + } #endif if (state->ith == 0 && cplan->abort_callback && diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp index d2cfa93b8bc97..cdb9842bf7a0f 100644 --- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp +++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp @@ -1128,8 +1128,13 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend, 
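Patch 3 also guards the elapsed-time arithmetic against a timer that wraps around; the same branch is added to the Tsavorite path in the hunk that follows. Since ggml_time_us() returns a signed 64-bit microsecond count, an overflow would take on the order of 292,000 years, so the branch is defensive. A shared helper along these lines (the name perf_elapsed_us is an assumption, not in the patch) would express the guard once:

    /* Illustrative only, not part of the patch: the wraparound guard expressed
     * as a helper both backends could share. Assumes a signed 64-bit
     * microsecond timer that, if it ever overflowed, would roll over from
     * INT64_MAX to 0. */
    #include <stdint.h>

    static inline int64_t perf_elapsed_us(int64_t t_start, int64_t t_end) {
        if (t_end >= t_start) {
            return t_end - t_start;
        }
        /* rolled over: ticks remaining up to INT64_MAX, plus t_end ticks after the wrap */
        return (INT64_MAX - t_start) + t_end + 1;
    }

Both backends could then write node->perf_time_us += perf_elapsed_us(t_start, t_end); instead of repeating the branch.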
#ifdef GGML_PERF int64_t t_end = ggml_time_us(); node->perf_runs++; - node->perf_time_us += (t_end - t_start); node->ggml_compute_backend = GGML_COMPUTE_BACKEND_TSAVORITE; + if (t_end >= t_start) { + node->perf_time_us += (t_end - t_start); + } else { + // Handle wraparound by assuming timer rolls over at max int64_t value + node->perf_time_us += (INT64_MAX - t_start + t_end + 1); + } #endif } From d5ca4f7159f23086aca65aa86cc3252c225d9b0d Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Mon, 30 Jun 2025 12:20:43 -0700 Subject: [PATCH 4/5] Since some status coming along with prompt response, move the code. now it will be printed at last --- ggml/include/ggml.h | 2 -- ggml/src/ggml.c | 15 +-------------- src/llama-context.cpp | 27 ++++++++++++++++++++------- src/llama-context.h | 6 +++--- tools/main/main.cpp | 4 +++- 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 64dc3bc984dde..d739f4c44e0c7 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2227,8 +2227,6 @@ void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp); // capture perf into totals void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph); -// print final stats -void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]); #endif /* GGML_PERF */ #ifdef __cplusplus diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 839446d7a17d9..d16d6fafc979e 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -249,7 +249,7 @@ static void ggml_log_internal_v(enum ggml_log_level level, const char * format, void ggml_log_internal(enum ggml_log_level level, const char * format, ...) { va_list args; va_start(args, format); - //if (level == GGML_LOG_LEVEL_TSAVORITE) + if (level == GGML_LOG_LEVEL_TSAVORITE) ggml_log_internal_v(level, format, args); va_end(args); } @@ -6583,19 +6583,6 @@ void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct } } -void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) { - printf("\n=== GGML Perf Summary ===\n"); - for (int i = 0; i < GGML_OP_COUNT; ++i) { - if (totals[i].runs > 0) { - printf(" %-16s: %5ld runs, %8ld us total, avg %.2f us\n", - totals[i].op_name ? totals[i].op_name : "UNKNOWN", - totals[i].runs, - totals[i].total_us, - (double)totals[i].total_us / totals[i].runs); - } - } -} - FILE * ggml_perf_log_open(const char *filename) { // Try to delete existing file, ignore error if it doesn't exist remove(filename); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 3c4b86bc08e1d..2467b621e964c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -933,7 +933,7 @@ int llama_context::decode(llama_batch & inp_batch) { int64_t n_outputs_prev = 0; #ifdef GGML_PERF - FILE *perf_fp = ggml_perf_log_open("ggml_perf.log"); + FILE *perf_all_shape_fp = ggml_perf_log_open("ggml_perf-all-shape.log"); #endif /* GGML_PERF */ while (sbatch.n_tokens > 0) { @@ -975,8 +975,8 @@ int llama_context::decode(llama_batch & inp_batch) { const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); #ifdef GGML_PERF - if (perf_fp) { - ggml_perf_write_detailed_csv(gf, perf_fp); + if (perf_all_shape_fp) { + ggml_perf_write_detailed_csv(gf, perf_all_shape_fp); } ggml_perf_accumulate(perf_totals, gf); #endif /* GGML_PERF */ @@ -1148,10 +1148,6 @@ int llama_context::decode(llama_batch & inp_batch) { // overlap with device computation. 
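After patch 4, the GGML_PERF hooks in llama_context::decode() reduce to: open one log file per decode call, write per-node detail for every computed graph, and accumulate per-op totals, with the summary print deferred to llama_perf_context_print() (the next hunks remove it from the end of decode). A compressed outline, assuming a GGML_PERF build of the patched tree (perf_wrapped_decode is an illustrative name, not a function in the patch):

    /* Outline of the GGML_PERF wiring around graph computation after patch 4
     * (illustrative fragment, not a drop-in function). */
    #include <stdio.h>
    #include "ggml.h"

    #ifdef GGML_PERF
    void perf_wrapped_decode(struct ggml_cgraph ** graphs, int n_graphs,
                             struct ggml_perf_totals totals[GGML_OP_COUNT]) {
        FILE * fp = ggml_perf_log_open("ggml_perf-all-shape.log");
        for (int i = 0; i < n_graphs; ++i) {
            /* ... graph_compute(graphs[i], ...) runs here ... */
            if (fp) {
                ggml_perf_write_detailed_csv(graphs[i], fp);  /* per-node rows for this ubatch */
            }
            ggml_perf_accumulate(totals, graphs[i]);          /* fold into per-op totals */
        }
        if (fp) {
            fclose(fp);
        }
        /* the summary itself is emitted once, at the end, by llama_perf_context_print() */
    }
    #endif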
ggml_backend_sched_reset(sched.get()); -#ifdef GGML_PERF - ggml_perf_print_totals(perf_totals); -#endif /* GGML_PERF */ - return 0; } @@ -2624,6 +2620,19 @@ llama_perf_context_data llama_perf_context(const llama_context * ctx) { return data; } +void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) { + LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n"); + for (int i = 0; i < GGML_OP_COUNT; ++i) { + if (totals[i].runs > 0) { + LLAMA_LOG_TSAVORITE(" %-16s: %5ld runs, %8ld us total, avg %.2f us\n", + totals[i].op_name ? totals[i].op_name : "UNKNOWN", + totals[i].runs, + totals[i].total_us, + (double)totals[i].total_us / totals[i].runs); + } + } +} + void llama_perf_context_print(const llama_context * ctx) { const auto data = llama_perf_context(ctx); @@ -2635,12 +2644,16 @@ void llama_perf_context_print(const llama_context * ctx) { __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); +#ifdef GGML_PERF LLAMA_LOG_TSAVORITE("%s: load time = %10.2f ms\n", __func__, data.t_load_ms); LLAMA_LOG_TSAVORITE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); LLAMA_LOG_TSAVORITE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); LLAMA_LOG_TSAVORITE("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); + + ggml_perf_print_totals(const_cast(ctx->perf_totals)); +#endif /* GGML_PERF */ } void llama_perf_context_reset(llama_context * ctx) { diff --git a/src/llama-context.h b/src/llama-context.h index 4ff8fe2cfeaff..918004be9bb8e 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -184,6 +184,9 @@ struct llama_context { ggml_status graph_compute( ggml_cgraph * gf, bool batched); +#ifdef GGML_PERF + struct ggml_perf_totals perf_totals[GGML_OP_COUNT] = {}; // add this to llama_context +#endif private: llm_graph_result_ptr graph_build( @@ -274,7 +277,4 @@ struct llama_context { mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) mutable int32_t n_eval = 0; // number of eval calls -#ifdef GGML_PERF - struct ggml_perf_totals perf_totals[GGML_OP_COUNT] = {}; // add this to llama_context -#endif }; diff --git a/tools/main/main.cpp b/tools/main/main.cpp index d5a5797f1c9a4..d736ef9e515c6 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -126,7 +126,9 @@ int main(int argc, char ** argv) { LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - //llama_log_set(my_logger, nullptr); +#ifdef GGML_PERF + llama_log_set(my_logger, nullptr); +#endif /* GGML_PERF */ LOG_INF("%s: llama backend init\n", __func__); llama_backend_init(); From 44daa2b7cd3ca7fe012dfe733794ed7d8f49d293 Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Mon, 30 Jun 2025 13:09:19 -0700 Subject: [PATCH 5/5] Addressed Indentation comments raise by Ashish --- ggml/src/ggml-cpu/ggml-cpu.c | 13 ++++++------- ggml/src/ggml.c | 2 +- src/llama-context.cpp | 4 ++-- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c 
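With the summary routed through LLAMA_LOG_TSAVORITE, the perf build re-enables the custom logger in tools/main/main.cpp. The definition of my_logger is outside this diff, so the sketch below only illustrates the callback shape that llama_log_set() accepts; the body is an assumption, not the actual implementation:

    /* Example of a callback with the signature llama_log_set() expects,
     * forwarding every message (including the perf summary) to stderr.
     * This is not the my_logger used by the patch. */
    #include <stdio.h>
    #include "llama.h"

    static void example_logger(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level;
        (void) user_data;
        fputs(text, stderr);
    }

    /* usage, mirroring tools/main/main.cpp:
     *   #ifdef GGML_PERF
     *       llama_log_set(example_logger, NULL);
     *   #endif
     */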
index 8d5119474d996..221182445ea34 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2852,14 +2852,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { #ifdef GGML_PERF int64_t t_end = ggml_time_us(); node->perf_runs++; - if (t_end >= t_start) { - node->perf_time_us += (t_end - t_start); - } else { - // Handle wraparound by assuming timer rolls over at max int64_t value - node->perf_time_us += (INT64_MAX - t_start + t_end + 1); - } + if (t_end >= t_start) { + node->perf_time_us += (t_end - t_start); + } else { + // Handle wraparound by assuming timer rolls over at max int64_t value + node->perf_time_us += (INT64_MAX - t_start + t_end + 1); + } #endif - if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index d16d6fafc979e..cddeca9067288 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -250,7 +250,7 @@ void ggml_log_internal(enum ggml_log_level level, const char * format, ...) { va_list args; va_start(args, format); if (level == GGML_LOG_LEVEL_TSAVORITE) - ggml_log_internal_v(level, format, args); + ggml_log_internal_v(level, format, args); va_end(args); } diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 2467b621e964c..4d8b6fdf5a74b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -975,9 +975,9 @@ int llama_context::decode(llama_batch & inp_batch) { const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); #ifdef GGML_PERF - if (perf_all_shape_fp) { + if (perf_all_shape_fp) { ggml_perf_write_detailed_csv(gf, perf_all_shape_fp); - } + } ggml_perf_accumulate(perf_totals, gf); #endif /* GGML_PERF */ if (compute_status != GGML_STATUS_SUCCESS) {
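One behavior worth noting for the series as a whole: node->perf_runs, node->perf_time_us and the perf_totals array only ever grow, and none of the hunks above reset them, so the printed summary covers the lifetime of the context. If per-prompt numbers are wanted, a small reset helper could be called alongside llama_perf_context_reset(); the helper below is a suggestion with an assumed name, not part of the patch:

    /* Suggested companion helper (not in the patch): clear the accumulated
     * GGML_PERF totals so a subsequent decode starts its summary from zero. */
    #include <string.h>
    #include "ggml.h"

    #ifdef GGML_PERF
    static void ggml_perf_reset_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
        memset(totals, 0, GGML_OP_COUNT * sizeof(totals[0]));
    }
    #endif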