tsisw · akapoor3518 · Oct 17, 2025 · Oct 17, 2025 · Oct 17, 2025 · Oct 17, 2025
@@ -662,9 +662,10 @@ extern "C" {
 
 #if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
         int64_t perf_runs;
+        int64_t tsi_kernel_runs;
         int64_t perf_time_us;
         enum ggml_compute_backend_type ggml_compute_backend;
-        char padding[4];
+        char padding[12];
 #else
         char padding[8];
 #endif /* GML_PERF-related flag */
@@ -2561,11 +2562,13 @@ extern "C" {
 struct ggml_perf_backend_subtotals {
     int64_t total_us;
     int64_t runs;
+    int64_t tsi_kernel_count; // sum of node->tsi_kernel_runs accumulated for this op/backend pair
 };
 
 struct ggml_perf_unary_subtotals {
     int64_t total_us;
     int64_t runs;
+    int64_t tsi_kernel_count; // sum of node->tsi_kernel_runs accumulated for this unary sub-op
 };
 // internal perf accumulation struct
 struct ggml_perf_totals {

@@ -1225,6 +1225,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
                         val[0] = scale;
                         ctx->kernels[kernel_type].pipeline->_mlir_fptr_3_input[kernel_sub_type](srcP0, srcP1, nodeP, glob_buf);
                         ++device->stats.op_run_count[kernel_type].num_of_kernel_call;
+                        ++node->tsi_kernel_runs;
 	            }
 	        }
 	    }
@@ -1258,6 +1259,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
                     // kernel call
                     ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input[kernel_sub_type](srcP0, srcP1, nodeP);
                     ++device->stats.op_run_count[kernel_type].num_of_kernel_call;
+                    ++node->tsi_kernel_runs;
                 }
             }
         }
@@ -1372,6 +1374,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
             ctx->kernels[kernel_type].pipeline->_mlir_fptr_1_input[kernel_sub_type](srcP0, nodeP);
 	}
         ++device->stats.op_run_count[kernel_type].num_of_kernel_call;
+        ++node->tsi_kernel_runs;
 
         if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) {
           log_data.data_type = GGML_TSAVORITE_TENSOR_NODE;

@@ -7249,12 +7249,14 @@ void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct
         if (be >= GGML_COMPUTE_BACKEND_CPU && be < GGML_COMPUTE_BACKEND_COUNT) {
             totals[op].backend_subtotals[be].total_us += node->perf_time_us;
 	    totals[op].backend_subtotals[be].runs     += node->perf_runs;
+	    totals[op].backend_subtotals[be].tsi_kernel_count   += node->tsi_kernel_runs;
         }
 
         if (op == GGML_OP_UNARY) {
             enum ggml_unary_op subop = ggml_get_unary_op(node);
             totals[op].unary_subtotals[subop].total_us += node->perf_time_us;
             totals[op].unary_subtotals[subop].runs     += node->perf_runs;
+            totals[op].unary_subtotals[subop].tsi_kernel_count   += node->tsi_kernel_runs;
         }
     }
 }

@@ -2791,7 +2791,7 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
 #elif defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
 void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
     LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
-    LLAMA_LOG_TSAVORITE("  %-16s %-8s %7s  %14s  %16s\n", "Op", "Target", "Runs", "Total us", "Avg us");
+    LLAMA_LOG_TSAVORITE("  %-16s %-8s %7s  %14s  %16s  %16s\n", "Op", "Target", "Runs", "TSI_KERNEL-RUN", "Total us", "Avg us");
 
     for (int i = 0; i < GGML_OP_COUNT; ++i) {
         if (totals[i].runs > 0) {
@@ -2801,10 +2801,11 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
                     char padded_backend[7] = {0}; // 6 chars + null terminator
                     snprintf(padded_backend, sizeof(padded_backend), "%-6s", backend_name);
 
-                    LLAMA_LOG_TSAVORITE("  %-16s %-8s %7ld  %14ld  %16.2f\n",
+                    LLAMA_LOG_TSAVORITE("  %-16s %-8s %7ld  %14ld  %16ld  %16.2f\n",
                         totals[i].op_name ? totals[i].op_name : "UNKNOWN",
                         padded_backend,
                         totals[i].backend_subtotals[b].runs,
+                        totals[i].backend_subtotals[b].tsi_kernel_count,
                         totals[i].backend_subtotals[b].total_us,
                         (double)totals[i].backend_subtotals[b].total_us / totals[i].backend_subtotals[b].runs);
                 }
@@ -2826,10 +2827,11 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
                         char padded_backend[7] = {0};
                         snprintf(padded_backend, sizeof(padded_backend), "%-6s", backend_name ? backend_name : "UNK");
 
-                        LLAMA_LOG_TSAVORITE("    -> %-11s %-8s %7ld  %14ld  %16.2f\n",
+                        LLAMA_LOG_TSAVORITE("    -> %-11s %-8s %7ld  %14ld  %16ld  %16.2f\n",
                             ggml_unary_op_name((enum ggml_unary_op) j),
                             padded_backend,
                             totals[i].unary_subtotals[j].runs,
+                            totals[i].unary_subtotals[j].tsi_kernel_count,
                             totals[i].unary_subtotals[j].total_us,
                             (double)totals[i].unary_subtotals[j].total_us / totals[i].unary_subtotals[j].runs);
                     }