From a6918d847dd867fb88760489107d4494b9ccd506 Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Thu, 26 Jun 2025 22:43:37 -0700 Subject: [PATCH 1/5] @FIR-770 - LLama.cpp: Adding Perf Status for all GGML Operation --- ggml/include/ggml.h | 35 +++++++- ggml/src/ggml-cpu/ggml-cpu.c | 9 ++ ggml/src/ggml-tsavorite/ggml-tsavorite.cpp | 9 ++ ggml/src/ggml.c | 96 +++++++++++++++++++++- src/llama-context.cpp | 13 +++ src/llama-context.h | 4 + tools/main/main.cpp | 2 +- 7 files changed, 165 insertions(+), 3 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index e6830b63ba8e1..0b354acc89f23 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -317,6 +317,14 @@ extern "C" { GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4) GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...); +#ifdef GGML_PERF + enum ggml_compute_backend_type { + GGML_COMPUTE_BACKEND_CPU=0, + GGML_COMPUTE_BACKEND_TSAVORITE, + GGML_COMPUTE_BACKEND_COUNT + }; +#endif /* GGML_PERF */ + enum ggml_status { GGML_STATUS_ALLOC_FAILED = -2, GGML_STATUS_FAILED = -1, @@ -603,8 +611,14 @@ extern "C" { char name[GGML_MAX_NAME]; void * extra; // extra things e.g. for ggml-cuda.cu - +#ifdef GGML_PERF + int64_t perf_runs; + int64_t perf_time_us; + enum ggml_compute_backend_type ggml_compute_backend; + char padding[8+12]; +#else char padding[8]; +#endif /* GGML_PERF */ }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -2197,6 +2211,25 @@ extern "C" { GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); + +#ifdef GGML_PERF +// internal perf accumulation struct +struct ggml_perf_totals { + int op_count; + int64_t total_us; + int64_t runs; + const char * op_name; +}; + +FILE * ggml_perf_log_open(const char *filename); +void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp); + +// capture perf into totals +void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph); + +// print final stats +void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]); +#endif /* GGML_PERF */ #ifdef __cplusplus } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 46f75ad97cd61..4354106692b48 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2844,8 +2844,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { struct ggml_tensor * node = cgraph->nodes[node_n]; +#ifdef GGML_PERF + int64_t t_start = ggml_time_us(); +#endif ggml_compute_forward(¶ms, node); +#ifdef GGML_PERF + int64_t t_end = ggml_time_us(); + node->perf_runs++; + node->perf_time_us += (t_end - t_start); +#endif + if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed); diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp index c49d02375921f..d2cfa93b8bc97 100644 --- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp +++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp @@ -813,6 +813,9 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend, tensor_log 
log_data; for (int i = 0; i < cgraph->n_nodes; i++) { +#ifdef GGML_PERF + int64_t t_start = ggml_time_us(); +#endif node = cgraph->nodes[i]; src0 = node->src[0]; src1 = node->src[1]; @@ -1122,6 +1125,12 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend, device->stats.op_run_count[kernel_type].max_num_of_elem < max_num_of_elem) device->stats.op_run_count[kernel_type].max_num_of_elem = max_num_of_elem; } +#ifdef GGML_PERF + int64_t t_end = ggml_time_us(); + node->perf_runs++; + node->perf_time_us += (t_end - t_start); + node->ggml_compute_backend = GGML_COMPUTE_BACKEND_TSAVORITE; +#endif } // This this need to implement correctly when we have mixture of CPU and accelerator operation diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 134b7420de746..839446d7a17d9 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -249,7 +249,7 @@ static void ggml_log_internal_v(enum ggml_log_level level, const char * format, void ggml_log_internal(enum ggml_log_level level, const char * format, ...) { va_list args; va_start(args, format); - if (level == GGML_LOG_LEVEL_TSAVORITE) + //if (level == GGML_LOG_LEVEL_TSAVORITE) ggml_log_internal_v(level, format, args); va_end(args); } @@ -985,6 +985,13 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "OPT_STEP_ADAMW", }; +#ifdef GGML_PERF +static const char * GGML_BACKEND_TYPE[GGML_COMPUTE_BACKEND_COUNT] = { + "CPU", + "TSAVORITE" +}; +#endif /* GGML_PERF */ + static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { @@ -1200,6 +1207,12 @@ const char * ggml_op_name(enum ggml_op op) { return GGML_OP_NAME[op]; } +#ifdef GGML_PERF +static const char * ggml_backend_type(enum ggml_compute_backend_type backend) { + return GGML_BACKEND_TYPE[backend]; +} +#endif /* GGML_PERF */ + const char * ggml_op_symbol(enum ggml_op op) { return GGML_OP_SYMBOL[op]; } @@ -1617,6 +1630,11 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data, /*.name =*/ { 0 }, /*.extra =*/ NULL, +#ifdef GGML_PERF + /*.perf_runs =*/ 0, + /*.perf_time_us =*/ 0, + /*.ggml_compute_backend =*/ GGML_COMPUTE_BACKEND_CPU, +#endif /* GGML_PERF */ /*.padding =*/ { 0 }, }; @@ -6549,3 +6567,79 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons if (p0->strict_cpu != p1->strict_cpu ) return false; return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; } + +#ifdef GGML_PERF +void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph) { + for (int i = 0; i < cgraph->n_nodes; ++i) { + struct ggml_tensor * node = cgraph->nodes[i]; + enum ggml_op op = node->op; + + if (op >= GGML_OP_COUNT) continue; + + totals[op].op_name = ggml_op_name(op); + totals[op].total_us += node->perf_time_us; + totals[op].runs += node->perf_runs; + totals[op].op_count++; + } +} + +void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) { + printf("\n=== GGML Perf Summary ===\n"); + for (int i = 0; i < GGML_OP_COUNT; ++i) { + if (totals[i].runs > 0) { + printf(" %-16s: %5ld runs, %8ld us total, avg %.2f us\n", + totals[i].op_name ? 
totals[i].op_name : "UNKNOWN", + totals[i].runs, + totals[i].total_us, + (double)totals[i].total_us / totals[i].runs); + } + } +} + +FILE * ggml_perf_log_open(const char *filename) { + // Try to delete existing file, ignore error if it doesn't exist + remove(filename); + + // Create a new file in write mode + FILE *fp = fopen(filename, "w"); + if (!fp) { + fprintf(stderr, "Error: Could not create file %s\n", filename); + return NULL; + } + + return fp; +} + +void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp) { + if (!fp) return; + + int64_t total_time_us = 0; + for (int i = 0; i < cgraph->n_nodes; ++i) { + if (cgraph->nodes[i]->perf_runs > 0) { + total_time_us += cgraph->nodes[i]->perf_time_us; + } + } + + fprintf(fp, "ggml_graph_compute_perf: total compute time: %.3f ms\n", total_time_us / 1000.0); + + for (int i = 0; i < cgraph->n_nodes; ++i) { + struct ggml_tensor * node = cgraph->nodes[i]; + if (node->perf_runs == 0) continue; + + double t_ms = node->perf_time_us / 1000.0; + double avg_ms = t_ms / node->perf_runs; + + fprintf(fp, + " - BACKEND:%s OP:%s: total %.3f ms over %d runs (avg %.3f ms) [shape=%d,%d,%d]\n", + ggml_backend_type(node->ggml_compute_backend), + ggml_op_name(node->op), + t_ms, + node->perf_runs, + avg_ms, + node->ne[0], node->ne[1], node->ne[2]); + } + + fprintf(fp, "--------------------------------------------------\n\n"); +} + +#endif /* GGML_PERF */ diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 984dbf14d14ae..3c4b86bc08e1d 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -932,6 +932,9 @@ int llama_context::decode(llama_batch & inp_batch) { kv_self_update(); int64_t n_outputs_prev = 0; +#ifdef GGML_PERF + FILE *perf_fp = ggml_perf_log_open("ggml_perf.log"); +#endif /* GGML_PERF */ while (sbatch.n_tokens > 0) { llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled); @@ -971,6 +974,12 @@ int llama_context::decode(llama_batch & inp_batch) { res->set_inputs(&ubatch); const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); +#ifdef GGML_PERF + if (perf_fp) { + ggml_perf_write_detailed_csv(gf, perf_fp); + } + ggml_perf_accumulate(perf_totals, gf); +#endif /* GGML_PERF */ if (compute_status != GGML_STATUS_SUCCESS) { switch (compute_status) { case GGML_STATUS_ABORTED: @@ -1139,6 +1148,10 @@ int llama_context::decode(llama_batch & inp_batch) { // overlap with device computation. 
ggml_backend_sched_reset(sched.get()); +#ifdef GGML_PERF + ggml_perf_print_totals(perf_totals); +#endif /* GGML_PERF */ + return 0; } diff --git a/src/llama-context.h b/src/llama-context.h index c0ceacb10ce6f..4ff8fe2cfeaff 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -8,6 +8,7 @@ #include "ggml-cpp.h" #include "ggml-opt.h" +#include "ggml.h" #include #include @@ -273,4 +274,7 @@ struct llama_context { mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) mutable int32_t n_eval = 0; // number of eval calls +#ifdef GGML_PERF + struct ggml_perf_totals perf_totals[GGML_OP_COUNT] = {}; // add this to llama_context +#endif }; diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 26842116ec6df..d5a5797f1c9a4 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -126,7 +126,7 @@ int main(int argc, char ** argv) { LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - llama_log_set(my_logger, nullptr); + //llama_log_set(my_logger, nullptr); LOG_INF("%s: llama backend init\n", __func__); llama_backend_init(); From c7e934d6273f67b354b5f82a8ccc942006fa578b Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Thu, 26 Jun 2025 22:55:38 -0700 Subject: [PATCH 2/5] Updated the README --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index f222c9a1a8ae1..97be1e621c20d 100644 --- a/README.md +++ b/README.md @@ -619,6 +619,9 @@ cd ../../ #Compile for posix with build-posix as a target folder cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix +to enable STatus use below command +cmake -B build-posix -DCMAKE_BUILD_TYPE=Debug -DGGML_TSAVORITE=ON -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF" + cmake --build build-posix --config Release #Compile for fpga with build-fpga as a target folder From d1bb3f6ba3344560377e9bf999e27c87e741c74a Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Fri, 27 Jun 2025 13:46:32 -0700 Subject: [PATCH 3/5] Addressed Ashish's PR comments --- ggml/include/ggml.h | 2 +- ggml/src/ggml-cpu/ggml-cpu.c | 5 +++++ ggml/src/ggml-tsavorite/ggml-tsavorite.cpp | 7 ++++++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 0b354acc89f23..64dc3bc984dde 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -615,7 +615,7 @@ extern "C" { int64_t perf_runs; int64_t perf_time_us; enum ggml_compute_backend_type ggml_compute_backend; - char padding[8+12]; + char padding[4]; #else char padding[8]; #endif /* GGML_PERF */ diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 4354106692b48..8d5119474d996 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2852,7 +2852,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { #ifdef GGML_PERF int64_t t_end = ggml_time_us(); node->perf_runs++; + if (t_end >= t_start) { node->perf_time_us += (t_end - t_start); + } else { + // Handle wraparound by assuming timer rolls over at max int64_t value + node->perf_time_us += (INT64_MAX - t_start + t_end + 1); + } #endif if (state->ith == 0 && cplan->abort_callback && diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp index d2cfa93b8bc97..cdb9842bf7a0f 100644 --- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp +++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp @@ -1128,8 +1128,13 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend, 
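Patch 3 also guards the elapsed-time arithmetic against a timer that wraps around; the same branch is added to the Tsavorite path in the hunk that follows. Since ggml_time_us() returns a signed 64-bit microsecond count, an overflow would take on the order of 292,000 years, so the branch is defensive. A shared helper along these lines (the name perf_elapsed_us is an assumption, not in the patch) would express the guard once:

    /* Illustrative only, not part of the patch: the wraparound guard expressed
     * as a helper both backends could share. Assumes a signed 64-bit
     * microsecond timer that, if it ever overflowed, would roll over from
     * INT64_MAX to 0. */
    #include <stdint.h>

    static inline int64_t perf_elapsed_us(int64_t t_start, int64_t t_end) {
        if (t_end >= t_start) {
            return t_end - t_start;
        }
        /* rolled over: ticks remaining up to INT64_MAX, plus t_end ticks after the wrap */
        return (INT64_MAX - t_start) + t_end + 1;
    }

Both backends could then write node->perf_time_us += perf_elapsed_us(t_start, t_end); instead of repeating the branch.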
#ifdef GGML_PERF int64_t t_end = ggml_time_us(); node->perf_runs++; - node->perf_time_us += (t_end - t_start); node->ggml_compute_backend = GGML_COMPUTE_BACKEND_TSAVORITE; + if (t_end >= t_start) { + node->perf_time_us += (t_end - t_start); + } else { + // Handle wraparound by assuming timer rolls over at max int64_t value + node->perf_time_us += (INT64_MAX - t_start + t_end + 1); + } #endif } From d5ca4f7159f23086aca65aa86cc3252c225d9b0d Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Mon, 30 Jun 2025 12:20:43 -0700 Subject: [PATCH 4/5] Since some status coming along with prompt response, move the code. now it will be printed at last --- ggml/include/ggml.h | 2 -- ggml/src/ggml.c | 15 +-------------- src/llama-context.cpp | 27 ++++++++++++++++++++------- src/llama-context.h | 6 +++--- tools/main/main.cpp | 4 +++- 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 64dc3bc984dde..d739f4c44e0c7 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2227,8 +2227,6 @@ void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp); // capture perf into totals void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph); -// print final stats -void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]); #endif /* GGML_PERF */ #ifdef __cplusplus diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 839446d7a17d9..d16d6fafc979e 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -249,7 +249,7 @@ static void ggml_log_internal_v(enum ggml_log_level level, const char * format, void ggml_log_internal(enum ggml_log_level level, const char * format, ...) { va_list args; va_start(args, format); - //if (level == GGML_LOG_LEVEL_TSAVORITE) + if (level == GGML_LOG_LEVEL_TSAVORITE) ggml_log_internal_v(level, format, args); va_end(args); } @@ -6583,19 +6583,6 @@ void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct } } -void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) { - printf("\n=== GGML Perf Summary ===\n"); - for (int i = 0; i < GGML_OP_COUNT; ++i) { - if (totals[i].runs > 0) { - printf(" %-16s: %5ld runs, %8ld us total, avg %.2f us\n", - totals[i].op_name ? totals[i].op_name : "UNKNOWN", - totals[i].runs, - totals[i].total_us, - (double)totals[i].total_us / totals[i].runs); - } - } -} - FILE * ggml_perf_log_open(const char *filename) { // Try to delete existing file, ignore error if it doesn't exist remove(filename); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 3c4b86bc08e1d..2467b621e964c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -933,7 +933,7 @@ int llama_context::decode(llama_batch & inp_batch) { int64_t n_outputs_prev = 0; #ifdef GGML_PERF - FILE *perf_fp = ggml_perf_log_open("ggml_perf.log"); + FILE *perf_all_shape_fp = ggml_perf_log_open("ggml_perf-all-shape.log"); #endif /* GGML_PERF */ while (sbatch.n_tokens > 0) { @@ -975,8 +975,8 @@ int llama_context::decode(llama_batch & inp_batch) { const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); #ifdef GGML_PERF - if (perf_fp) { - ggml_perf_write_detailed_csv(gf, perf_fp); + if (perf_all_shape_fp) { + ggml_perf_write_detailed_csv(gf, perf_all_shape_fp); } ggml_perf_accumulate(perf_totals, gf); #endif /* GGML_PERF */ @@ -1148,10 +1148,6 @@ int llama_context::decode(llama_batch & inp_batch) { // overlap with device computation. 
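After patch 4, the GGML_PERF hooks in llama_context::decode() reduce to: open one log file per decode call, write per-node detail for every computed graph, and accumulate per-op totals, with the summary print deferred to llama_perf_context_print() (the next hunks remove it from the end of decode). A compressed outline, assuming a GGML_PERF build of the patched tree (perf_wrapped_decode is an illustrative name, not a function in the patch):

    /* Outline of the GGML_PERF wiring around graph computation after patch 4
     * (illustrative fragment, not a drop-in function). */
    #include <stdio.h>
    #include "ggml.h"

    #ifdef GGML_PERF
    void perf_wrapped_decode(struct ggml_cgraph ** graphs, int n_graphs,
                             struct ggml_perf_totals totals[GGML_OP_COUNT]) {
        FILE * fp = ggml_perf_log_open("ggml_perf-all-shape.log");
        for (int i = 0; i < n_graphs; ++i) {
            /* ... graph_compute(graphs[i], ...) runs here ... */
            if (fp) {
                ggml_perf_write_detailed_csv(graphs[i], fp);  /* per-node rows for this ubatch */
            }
            ggml_perf_accumulate(totals, graphs[i]);          /* fold into per-op totals */
        }
        if (fp) {
            fclose(fp);
        }
        /* the summary itself is emitted once, at the end, by llama_perf_context_print() */
    }
    #endif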
ggml_backend_sched_reset(sched.get()); -#ifdef GGML_PERF - ggml_perf_print_totals(perf_totals); -#endif /* GGML_PERF */ - return 0; } @@ -2624,6 +2620,19 @@ llama_perf_context_data llama_perf_context(const llama_context * ctx) { return data; } +void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) { + LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n"); + for (int i = 0; i < GGML_OP_COUNT; ++i) { + if (totals[i].runs > 0) { + LLAMA_LOG_TSAVORITE(" %-16s: %5ld runs, %8ld us total, avg %.2f us\n", + totals[i].op_name ? totals[i].op_name : "UNKNOWN", + totals[i].runs, + totals[i].total_us, + (double)totals[i].total_us / totals[i].runs); + } + } +} + void llama_perf_context_print(const llama_context * ctx) { const auto data = llama_perf_context(ctx); @@ -2635,12 +2644,16 @@ void llama_perf_context_print(const llama_context * ctx) { __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); +#ifdef GGML_PERF LLAMA_LOG_TSAVORITE("%s: load time = %10.2f ms\n", __func__, data.t_load_ms); LLAMA_LOG_TSAVORITE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); LLAMA_LOG_TSAVORITE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); LLAMA_LOG_TSAVORITE("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); + + ggml_perf_print_totals(const_cast(ctx->perf_totals)); +#endif /* GGML_PERF */ } void llama_perf_context_reset(llama_context * ctx) { diff --git a/src/llama-context.h b/src/llama-context.h index 4ff8fe2cfeaff..918004be9bb8e 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -184,6 +184,9 @@ struct llama_context { ggml_status graph_compute( ggml_cgraph * gf, bool batched); +#ifdef GGML_PERF + struct ggml_perf_totals perf_totals[GGML_OP_COUNT] = {}; // add this to llama_context +#endif private: llm_graph_result_ptr graph_build( @@ -274,7 +277,4 @@ struct llama_context { mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) mutable int32_t n_eval = 0; // number of eval calls -#ifdef GGML_PERF - struct ggml_perf_totals perf_totals[GGML_OP_COUNT] = {}; // add this to llama_context -#endif }; diff --git a/tools/main/main.cpp b/tools/main/main.cpp index d5a5797f1c9a4..d736ef9e515c6 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -126,7 +126,9 @@ int main(int argc, char ** argv) { LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - //llama_log_set(my_logger, nullptr); +#ifdef GGML_PERF + llama_log_set(my_logger, nullptr); +#endif /* GGML_PERF */ LOG_INF("%s: llama backend init\n", __func__); llama_backend_init(); From 44daa2b7cd3ca7fe012dfe733794ed7d8f49d293 Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Mon, 30 Jun 2025 13:09:19 -0700 Subject: [PATCH 5/5] Addressed Indentation comments raise by Ashish --- ggml/src/ggml-cpu/ggml-cpu.c | 13 ++++++------- ggml/src/ggml.c | 2 +- src/llama-context.cpp | 4 ++-- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c 
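With the summary routed through LLAMA_LOG_TSAVORITE, the perf build re-enables the custom logger in tools/main/main.cpp. The definition of my_logger is outside this diff, so the sketch below only illustrates the callback shape that llama_log_set() accepts; the body is an assumption, not the actual implementation:

    /* Example of a callback with the signature llama_log_set() expects,
     * forwarding every message (including the perf summary) to stderr.
     * This is not the my_logger used by the patch. */
    #include <stdio.h>
    #include "llama.h"

    static void example_logger(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level;
        (void) user_data;
        fputs(text, stderr);
    }

    /* usage, mirroring tools/main/main.cpp:
     *   #ifdef GGML_PERF
     *       llama_log_set(example_logger, NULL);
     *   #endif
     */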
index 8d5119474d996..221182445ea34 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2852,14 +2852,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { #ifdef GGML_PERF int64_t t_end = ggml_time_us(); node->perf_runs++; - if (t_end >= t_start) { - node->perf_time_us += (t_end - t_start); - } else { - // Handle wraparound by assuming timer rolls over at max int64_t value - node->perf_time_us += (INT64_MAX - t_start + t_end + 1); - } + if (t_end >= t_start) { + node->perf_time_us += (t_end - t_start); + } else { + // Handle wraparound by assuming timer rolls over at max int64_t value + node->perf_time_us += (INT64_MAX - t_start + t_end + 1); + } #endif - if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index d16d6fafc979e..cddeca9067288 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -250,7 +250,7 @@ void ggml_log_internal(enum ggml_log_level level, const char * format, ...) { va_list args; va_start(args, format); if (level == GGML_LOG_LEVEL_TSAVORITE) - ggml_log_internal_v(level, format, args); + ggml_log_internal_v(level, format, args); va_end(args); } diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 2467b621e964c..4d8b6fdf5a74b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -975,9 +975,9 @@ int llama_context::decode(llama_batch & inp_batch) { const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); #ifdef GGML_PERF - if (perf_all_shape_fp) { + if (perf_all_shape_fp) { ggml_perf_write_detailed_csv(gf, perf_all_shape_fp); - } + } ggml_perf_accumulate(perf_totals, gf); #endif /* GGML_PERF */ if (compute_status != GGML_STATUS_SUCCESS) {
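One behavior worth noting for the series as a whole: node->perf_runs, node->perf_time_us and the perf_totals array only ever grow, and none of the hunks above reset them, so the printed summary covers the lifetime of the context. If per-prompt numbers are wanted, a small reset helper could be called alongside llama_perf_context_reset(); the helper below is a suggestion with an assumed name, not part of the patch:

    /* Suggested companion helper (not in the patch): clear the accumulated
     * GGML_PERF totals so a subsequent decode starts its summary from zero. */
    #include <string.h>
    #include "ggml.h"

    #ifdef GGML_PERF
    static void ggml_perf_reset_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
        memset(totals, 0, GGML_OP_COUNT * sizeof(totals[0]));
    }
    #endif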