diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
index a3cc538794f9f..ae36728a8d3ce 100644
--- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
+++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
@@ -522,7 +522,7 @@ static struct ggml_backend_tsavorite_context *ggml_tsavorite_init(ggml_backend_d
     if (tsi_log_setup() == false)
         return NULL;
 
-    std::string mainProfilerName = "GGML Tsavorite ";
+    std::string mainProfilerName = "OPU ";
     tsirt::utils::TSIProfiler::initialize();
     tsirt::utils::TSIScopedProfiler mainProfiler(mainProfilerName);
@@ -630,7 +630,7 @@ static void ggml_tsavorite_free(struct ggml_backend_tsavorite_context *ctx) {
     tsi_finalize();
     GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
     tsirt::utils::TSIProfiler::finalize();
-    std::cout << "\nGGML Tsavorite Profiling Results:" << std::endl;
+    std::cout << "\nOPU Profiling Results:" << std::endl;
     std::cout << tsirt::utils::TSIProfiler::getFormattedResults(
                      /*truncateFuncNames*/ true)
               << std::endl;
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index ac670388a6f54..39bb43ae2c744 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -988,7 +988,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
 
 #ifdef GGML_PERF
 static const char * GGML_BACKEND_TYPE[GGML_COMPUTE_BACKEND_COUNT] = {
     "CPU",
-    "TSAVORITE"
+    "OPU"
 };
 #endif /* GGML_PERF */
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 6cc08b71a9244..b5daa2179ca62 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2620,25 +2620,51 @@ llama_perf_context_data llama_perf_context(const llama_context * ctx) {
     return data;
 }
 
+
 #ifdef GGML_PERF
 void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
     LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
-    LLAMA_LOG_TSAVORITE(" %-16s: %7s %14s %16s\n", "Op", "Runs", "Total us", "Avg us");
+    LLAMA_LOG_TSAVORITE(" %-16s %7s %14s %16s\n", "Op", "Runs", "Total us", "Avg us");
 
     for (int i = 0; i < GGML_OP_COUNT; ++i) {
         if (totals[i].runs > 0) {
-            // Main op row
-            LLAMA_LOG_TSAVORITE(" %-16s: %7ld %14ld %16.2f\n",
+            LLAMA_LOG_TSAVORITE(" %-16s %7ld %14ld %16.2f\n",
                 totals[i].op_name ? totals[i].op_name : "UNKNOWN",
                 totals[i].runs,
                 totals[i].total_us,
                 (double)totals[i].total_us / totals[i].runs);
+        }
+
+        // Unary sub-op breakdown
+        if (i == GGML_OP_UNARY) {
+            for (int j = 0; j < GGML_UNARY_OP_COUNT; ++j) {
+                if (totals[i].unary_subtotals[j].runs > 0) {
+                    LLAMA_LOG_TSAVORITE(" -> %-11s %7ld %14ld %16.2f\n",
+                        ggml_unary_op_name((enum ggml_unary_op) j),
+                        totals[i].unary_subtotals[j].runs,
+                        totals[i].unary_subtotals[j].total_us,
+                        (double)totals[i].unary_subtotals[j].total_us / totals[i].unary_subtotals[j].runs);
+                }
+            }
+        }
+    }
+}
+#else
+void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
+    LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
+    LLAMA_LOG_TSAVORITE(" %-16s %-8s %7s %14s %16s\n", "Op", "Target", "Runs", "Total us", "Avg us");
 
-            // Backend subtotals
+    for (int i = 0; i < GGML_OP_COUNT; ++i) {
+        if (totals[i].runs > 0) {
             for (int b = 0; b < GGML_COMPUTE_BACKEND_COUNT; ++b) {
                 if (totals[i].backend_subtotals[b].runs > 0) {
-                    LLAMA_LOG_TSAVORITE(" [%-10s] : %7ld %14ld %16.2f\n",
-                        ggml_backend_type((enum ggml_compute_backend_type) b),
+                    const char *backend_name = ggml_backend_type((enum ggml_compute_backend_type) b);
+                    char padded_backend[7] = {0}; // 6 chars + null terminator
+                    snprintf(padded_backend, sizeof(padded_backend), "%-6s", backend_name);
+
+                    LLAMA_LOG_TSAVORITE(" %-16s %-8s %7ld %14ld %16.2f\n",
+                        totals[i].op_name ? totals[i].op_name : "UNKNOWN",
+                        padded_backend,
                         totals[i].backend_subtotals[b].runs,
                         totals[i].backend_subtotals[b].total_us,
                         (double)totals[i].backend_subtotals[b].total_us / totals[i].backend_subtotals[b].runs);
@@ -2649,8 +2675,21 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
         if (i == GGML_OP_UNARY) {
             for (int j = 0; j < GGML_UNARY_OP_COUNT; ++j) {
                 if (totals[i].unary_subtotals[j].runs > 0) {
-                    LLAMA_LOG_TSAVORITE(" -> %-12s: %7ld %14ld %16.2f\n",
+                    // Find backend for unary op (assumes same as parent op)
+                    const char *backend_name = NULL;
+                    for (int b = 0; b < GGML_COMPUTE_BACKEND_COUNT; ++b) {
+                        if (totals[i].backend_subtotals[b].runs > 0) {
+                            backend_name = ggml_backend_type((enum ggml_compute_backend_type) b);
+                            break;
+                        }
+                    }
+
+                    char padded_backend[7] = {0};
+                    snprintf(padded_backend, sizeof(padded_backend), "%-6s", backend_name ? backend_name : "UNK");
+
+                    LLAMA_LOG_TSAVORITE(" -> %-11s %-8s %7ld %14ld %16.2f\n",
                         ggml_unary_op_name((enum ggml_unary_op) j),
+                        padded_backend,
                         totals[i].unary_subtotals[j].runs,
                         totals[i].unary_subtotals[j].total_us,
                         (double)totals[i].unary_subtotals[j].total_us / totals[i].unary_subtotals[j].runs);
@@ -2662,6 +2701,7 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
 }
 #endif /* GGML_PERF */
 
+
 void llama_perf_context_print(const llama_context * ctx) {
     const auto data = llama_perf_context(ctx);
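
A note on the fixed-width "Target" column introduced in src/llama-context.cpp above: snprintf with a "%-6s" format left-justifies the backend name into a six-character field, and the seven-byte buffer (six characters plus the null terminator) truncates anything longer, so the rows stay aligned regardless of what ggml_backend_type() returns. A minimal standalone sketch of that padding-and-truncation behaviour (illustrative names only, not code from this patch):

#include <cstdio>

int main() {
    // "SOMELONGNAME" stands in for a backend name wider than the column.
    const char *names[] = { "CPU", "OPU", "SOMELONGNAME" };
    for (const char *name : names) {
        char padded[7] = {0};  // 6 chars + null terminator, as in the patch
        std::snprintf(padded, sizeof(padded), "%-6s", name);
        // Prints "[CPU   ]", "[OPU   ]", "[SOMELO]": short names are padded,
        // long names are truncated to the 6-character field.
        std::printf("[%s]\n", padded);
    }
    return 0;
}
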
diff --git a/tools/main/main.cpp b/tools/main/main.cpp
index d736ef9e515c6..eb0b49bf9d00d 100644
--- a/tools/main/main.cpp
+++ b/tools/main/main.cpp
@@ -971,16 +971,21 @@ int main(int argc, char ** argv) {
         LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
         llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }
+    // Ensure the prompt response output is fully emitted before any profiling data
+    // is printed. Freeing the threadpools first guarantees that all worker threads
+    // have finished processing and writing output, so the profiling stats cannot
+    // interleave with the response; fflush(stdout) then flushes anything still buffered.
+
+    ggml_threadpool_free_fn(threadpool);
+    ggml_threadpool_free_fn(threadpool_batch);
 
     LOG("\n\n");
+    fflush(stdout);
     common_perf_print(ctx, smpl);
 
     common_sampler_free(smpl);
 
     llama_backend_free();
 
-    ggml_threadpool_free_fn(threadpool);
-    ggml_threadpool_free_fn(threadpool_batch);
-
     return 0;
 }
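
The reordering in tools/main/main.cpp above rests on one rule: every producer of stdout output must be stopped, and the stream flushed, before the profiling summary is printed. A minimal standalone sketch of that shutdown pattern, with plain std::thread workers standing in for the llama.cpp threadpools (illustrative code, not the patch itself):

#include <cstdio>
#include <thread>
#include <vector>

int main() {
    // Workers stand in for the threadpools that may still be emitting tokens.
    std::vector<std::thread> workers;
    for (int i = 0; i < 4; ++i) {
        workers.emplace_back([i] { std::printf("token%d ", i); });
    }

    // 1) Stop the producers first (the patch frees the threadpools here),
    //    so nothing can write to stdout past this point.
    for (auto &w : workers) {
        w.join();
    }

    // 2) Flush anything the producers left buffered.
    std::fflush(stdout);

    // 3) Only now print the summary; it can no longer interleave with
    //    the generated output.
    std::printf("\n=== Perf Summary ===\n");
    return 0;
}
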