From 2192d0f8abeb3f655496204373dddc6b1b7a0c2c Mon Sep 17 00:00:00 2001
From: Ashish Trivedi
Date: Mon, 26 May 2025 21:52:38 -0700
Subject: [PATCH 1/2] @FIR-708: Added code to print out the profile information from runtime
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change adds the following:
1. Add std=c++20, which is needed to use the profiler from the runtime (the same profiler the aot-tests use)
2. Add initialization and dumping of profile information

There is one crash that still needs debugging, but the results look as follows:

register_backend: registered backend Tsavorite (1 devices) register_device: registered device Tsavorite (txe) register_backend: registered backend CPU (1 devices) register_device: registered device CPU (CPU) load_backend: failed to find ggml_backend_init in ./libggml-tsavorite.so load_backend: failed to find ggml_backend_init in ./libggml-cpu.so build: 4826 (5255890c) with gcc (GCC) 13.3.0 for x86_64-pc-linux-gnu (debug) main: llama backend init main: load the model and apply lora adapter, if any TXE Device MEMORY Summary total 134217728 and free 134217728 llama_model_load_from_file_impl: using device Tsavorite (txe) - 128 MiB free llama_model_loader: loaded meta data with 38 key-value pairs and 201 tensors from /tsi/akapoor/ggml/Tiny-Llama-v0.3-FP32-1.1B-F32.gguf (version GGUF V3 (latest)) llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. llama_model_loader: - kv 0: general.architecture str = llama llama_model_loader: - kv 1: general.type str = model llama_model_loader: - kv 2: general.name str = Tiny Llama v0.3 FP32 llama_model_loader: - kv 3: general.size_label str = 1.1B llama_model_loader: - kv 4: general.license str = apache-2.0 llama_model_loader: - kv 5: general.dataset.count u32 = 3 llama_model_loader: - kv 6: general.dataset.0.name str = SlimPajama 627B llama_model_loader: - kv 7: general.dataset.0.organization str = Cerebras llama_model_loader: - kv 8: general.dataset.0.repo_url str = https://huggingface.co/cerebras/SlimP... llama_model_loader: - kv 9: general.dataset.1.name str = Starcoderdata llama_model_loader: - kv 10: general.dataset.1.organization str = Bigcode llama_model_loader: - kv 11: general.dataset.1.repo_url str = https://huggingface.co/bigcode/starco... llama_model_loader: - kv 12: general.dataset.2.name str = Oasst_Top1_2023 08 25 llama_model_loader: - kv 13: general.dataset.2.version str = 08-25 llama_model_loader: - kv 14: general.dataset.2.organization str = OpenAssistant llama_model_loader: - kv 15: general.dataset.2.repo_url str = https://huggingface.co/OpenAssistant/...
llama_model_loader: - kv 16: general.languages arr[str,1] = ["en"] llama_model_loader: - kv 17: llama.block_count u32 = 22 llama_model_loader: - kv 18: llama.context_length u32 = 2048 llama_model_loader: - kv 19: llama.embedding_length u32 = 2048 llama_model_loader: - kv 20: llama.feed_forward_length u32 = 5632 llama_model_loader: - kv 21: llama.attention.head_count u32 = 32 llama_model_loader: - kv 22: llama.attention.head_count_kv u32 = 4 llama_model_loader: - kv 23: llama.rope.freq_base f32 = 10000.000000 llama_model_loader: - kv 24: llama.attention.layer_norm_rms_epsilon f32 = 0.000010 llama_model_loader: - kv 25: general.file_type u32 = 0 llama_model_loader: - kv 26: llama.vocab_size u32 = 32003 llama_model_loader: - kv 27: llama.rope.dimension_count u32 = 64 llama_model_loader: - kv 28: tokenizer.ggml.model str = llama llama_model_loader: - kv 29: tokenizer.ggml.pre str = default llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,32003] = ["", "", "", "<0x00>", "<... llama_model_loader: - kv 31: tokenizer.ggml.scores arr[f32,32003] = [-1000.000000, -1000.000000, -1000.00... llama_model_loader: - kv 32: tokenizer.ggml.token_type arr[i32,32003] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ... llama_model_loader: - kv 33: tokenizer.ggml.bos_token_id u32 = 1 llama_model_loader: - kv 34: tokenizer.ggml.eos_token_id u32 = 2 llama_model_loader: - kv 35: tokenizer.ggml.unknown_token_id u32 = 0 llama_model_loader: - kv 36: tokenizer.ggml.padding_token_id u32 = 32000 llama_model_loader: - kv 37: general.quantization_version u32 = 2 llama_model_loader: - type f32: 201 tensors print_info: file format = GGUF V3 (latest) print_info: file type = all F32 print_info: file size = 4.10 GiB (32.00 BPW) load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect load: special tokens cache size = 6 load: token to piece cache size = 0.1684 MB print_info: arch = llama print_info: vocab_only = 0 print_info: n_ctx_train = 2048 print_info: n_embd = 2048 print_info: n_layer = 22 print_info: n_head = 32 print_info: n_head_kv = 4 print_info: n_rot = 64 print_info: n_swa = 0 print_info: n_embd_head_k = 64 print_info: n_embd_head_v = 64 print_info: n_gqa = 8 print_info: n_embd_k_gqa = 256 print_info: n_embd_v_gqa = 256 print_info: f_norm_eps = 0.0e+00 print_info: f_norm_rms_eps = 1.0e-05 print_info: f_clamp_kqv = 0.0e+00 print_info: f_max_alibi_bias = 0.0e+00 print_info: f_logit_scale = 0.0e+00 print_info: n_ff = 5632 print_info: n_expert = 0 print_info: n_expert_used = 0 print_info: causal attn = 1 print_info: pooling type = 0 print_info: rope type = 0 print_info: rope scaling = linear print_info: freq_base_train = 10000.0 print_info: freq_scale_train = 1 print_info: n_ctx_orig_yarn = 2048 print_info: rope_finetuned = unknown print_info: ssm_d_conv = 0 print_info: ssm_d_inner = 0 print_info: ssm_d_state = 0 print_info: ssm_dt_rank = 0 print_info: ssm_dt_b_c_rms = 0 print_info: model type = 1B print_info: model params = 1.10 B print_info: general.name = Tiny Llama v0.3 FP32 print_info: vocab type = SPM print_info: n_vocab = 32003 print_info: n_merges = 0 print_info: BOS token = 1 '' print_info: EOS token = 2 '' print_info: EOT token = 32002 '<|im_end|>' print_info: UNK token = 0 '' print_info: PAD token = 32000 '[PAD]' print_info: LF token = 13 '<0x0A>' print_info: EOG token = 2 '' print_info: EOG token = 32002 '<|im_end|>' print_info: max token length = 48 load_tensors: loading model tensors, this can take a while... 
(mmap = true) TXE Device MEMORY Summary total 134217728 and free 134217728 load_tensors: offloading 0 repeating layers to GPU load_tensors: offloaded 0/23 layers to GPU load_tensors: CPU_Mapped model buffer size = 4196.40 MiB .......................................................................................... llama_init_from_model: n_seq_max = 1 llama_init_from_model: n_ctx = 12288 llama_init_from_model: n_ctx_per_seq = 12288 llama_init_from_model: n_batch = 1024 llama_init_from_model: n_ubatch = 512 llama_init_from_model: flash_attn = 0 llama_init_from_model: freq_base = 10000.0 llama_init_from_model: freq_scale = 1 llama_init_from_model: n_ctx_pre_seq (12288) > n_ctx_train (2048) -- possible training context overflow [2018-03-09 12:35:52.849854] 272:273 [ info] :: TXE resource allocation request processed successfully. llama_kv_cache_init: kv_size = 12288, offload = 1, type_k = 'f16', type_v = 'f16', n_layer = 22, can_shift = 1 llama_kv_cache_init: CPU KV buffer size = 264.00 MiB llama_init_from_model: KV self size = 264.00 MiB, K (f16): 132.00 MiB, V (f16): 132.00 MiB llama_init_from_model: CPU output buffer size = 0.12 MiB ggml_backend_tsavorite_buffer_type_alloc_buffer is called from llama data Loader ANoop Allocating memory from tsi_alloc with size 8400896 Allocating memory from tsi_alloc with size 8400896 starting memory 0xfffe83cb3080 Address of Newly Created BUffer 0xfffe83cb3080 and size 8400896 llama_init_from_model: tsavorite compute buffer size = 8.01 MiB llama_init_from_model: CPU compute buffer size = 808.01 MiB llama_init_from_model: graph nodes = 710 llama_init_from_model: graph splits = 179 (with bs=512), 93 (with bs=1) common_init_from_params: setting dry_penalty_last_n to ctx_size = 12288 main: llama threadpool init, n_threads = 4 main: model was trained on only 2048 context tokens (12288 specified) system_info: n_threads = 4 (n_threads_batch = 4) / 4 | CPU : NEON = 1 | ARM_FMA = 1 | LLAMAFILE = 1 | AARCH64_REPACK = 1 | sampler seed: 2241389473 sampler params: repeat_last_n = 5, repeat_penalty = 1.500, frequency_penalty = 0.000, presence_penalty = 0.000 dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 12288 top_k = 50, top_p = 0.900, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.000 mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 sampler chain: logits -> logit-bias -> penalties -> dry -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist generate: n_ctx = 12288, n_batch = 1024, n_predict = 10, n_keep = 1 my cat’s name is Luna. 
I’m a software llama_perf_sampler_print: sampling time = 207.24 ms / 16 runs ( 12.95 ms per token, 77.21 tokens per second) llama_perf_context_print: load time = 85299.65 ms llama_perf_context_print: prompt eval time = 61885.34 ms / 6 tokens (10314.22 ms per token, 0.10 tokens per second) llama_perf_context_print: eval time = 528257.38 ms / 9 runs (58695.26 ms per token, 0.02 tokens per second) llama_perf_context_print: total time = 613808.39 ms / 15 tokens LLAMA SP Profiling Results: ------------------------------------------------------------------------------------------------------------------------ Calls Total(ms) T/call Self(ms) Function ------------------------------------------------------------------------------------------------------------------------ 0 0.000 0.000 0.000 [ 0%] LLAMA SP Main 21600 23152.000 1.072 0.000 └─ [ 4%] RuntimeHostShim::awaitCommandListCompletion 21280 33107.073 1.556 33107.073 └─ [ 5%] [ txe_mult_blob ] 320 500.108 1.563 500.108 └─ [ 0%] [ txe_add_blob ] 1 87.000 87.000 87.000 └─ [ 0%] RuntimeHostShim::initialize 21600 25.000 0.001 25.000 └─ [ 0%] RuntimeHostShim::loadBlob 21600 11.000 0.001 11.000 └─ [ 0%] RuntimeHostShim::finalizeCommandList 21600 7.000 0.000 7.000 └─ [ 0%] RuntimeHostShim::createCommandList 86400 6.000 0.000 6.000 └─ [ 0%] RuntimeHostShim::getShmemManager 21600 4.000 0.000 4.000 └─ [ 0%] RuntimeHostShim::deallocate 21601 3.000 0.000 3.000 └─ [ 0%] RuntimeHostShim::allocate 21600 3.000 0.000 3.000 └─ [ 0%] RuntimeHostShim::launchBlob 21600 3.000 0.000 3.000 └─ [ 0%] RuntimeHostShim::addCommandToList 21600 0.000 0.000 0.000 └─ [ 0%] RuntimeHostShim::unloadBlob ======================================================================================================================== 0 613845.000 0.000613845.000 [100%] TOTAL ======================================================================================================================== TXE_ADD Operation, total tensor: 10 Number of Kernel Call: 320 Number of tensor got spilt: 10 Min Num of Elem 2048 Max Num of Elem 2048 TXE_SUB Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_MULT Operation, total tensor: 450 Number of Kernel Call: 21280 Number of tensor got spilt: 450 Min Num of Elem 2048 Max Num of Elem 12288 TXE_DIV Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_SQRT Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_NEG Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_ABS Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_SIN Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_SIGMOID Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 terminate called after throwing an instance of 'std::runtime_error' what(): Profiler not initialized! 
Stack trace: ./llama-cli(tsi::runtime::utils::ScopedProfiler::ScopedProfiler(tsi::runtime::utils::ProfileLocation)+0x8c) [0x478d04] /usr/bin/tsi/v0.1.1.tsv30_05_24_2025/bin/aot-tests/lib/libTsavRTFPGAShimCAPI.so(tsi_finalize+0xb0) [0xffff93ce83d8] libggml-tsavorite.so(+0x57f0) [0xffff945c57f0] libggml-tsavorite.so(+0x7bb4) [0xffff945c7bb4] libggml-base.so(ggml_backend_free+0x28) [0xffff944d89d8] libllama.so(ggml_backend_deleter::operator()(ggml_backend*)+0x18) [0xffff949aa7a0] libllama.so(std::unique_ptr::~unique_ptr()+0x50) [0xffff949df3bc] libllama.so(void std::_Destroy >(std::unique_ptr*)+0x14) [0xffff94a0d32c] libllama.so(void std::_Destroy_aux::__destroy*>(std::unique_ptr*, std::unique_ptr*)+0x20) [0xffff94a0d1a8] libllama.so(void std::_Destroy*>(std::unique_ptr*, std::unique_ptr*)+0x1c) [0xffff94a0cc28] libllama.so(std::vector, std::allocator > >::~vector()+0x40) [0xffff94a0b0d4] libllama.so(llama_context::~llama_context()+0x78) [0xffff94a07ee0] libllama.so(llama_free+0x24) [0xffff94a05d3c] ./llama-cli() [0x472488] ./llama-cli() [0x478f60] ./llama-cli() [0x476300] ./llama-cli() [0x471074] /lib/libc.so.6(+0x22104) [0xffff933a2104] /lib/libc.so.6(__libc_start_main+0x9c) [0xffff933a21e4] ./llama-cli() [0x46c1b0] ./run_platform_test.sh: line 55: 559 Aborted ./llama-cli -p "my cat’s name" -m /tsi/akapoor/ggml/Tiny-Llama-v0.3-FP32-1.1B-F32.gguf --device tSavorite -c 12288 --temp 0.0 --n-predict 10 --repeat-penalty 1.5 -b 1024 --top-k 50 --top-p 0.9 --repeat-last-n 5 --no-warmup root@agilex7_dk_si_agf014ea:/usr/bin/tsi/v0.1.1.tsv30_05_24_2025/bin# --- examples/main/CMakeLists.txt | 1 + examples/main/main.cpp | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index af3d9150f8640..e285c1f58538f 100644 --- a/examples/main/CMakeLists.txt +++ b/examples/main/CMakeLists.txt @@ -1,4 +1,5 @@ set(TARGET llama-cli) +add_compile_options(--std=c++20) add_executable(${TARGET} main.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index acf79a89268fb..2f9dfbbd2e015 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -26,6 +26,11 @@ #include #include #endif +#define GGML_TSAVORITE_TARGET 1 +#if defined (GGML_TSAVORITE_TARGET) +#include "HostShimCAPI.h" +#include "tsi-rt/utils/Profiler.h" +#endif #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -41,6 +46,9 @@ static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; +using namespace std; +namespace tsirt = ::tsi::runtime; + static void print_usage(int argc, char ** argv) { (void) argc; @@ -92,6 +100,13 @@ int main(int argc, char ** argv) { common_init(); +#if defined (GGML_TSAVORITE_TARGET) + std::string mainProfilerName = "LLAMA SP Main "; + tsirt::utils::TSIProfiler::initialize(); + + tsirt::utils::TSIScopedProfiler mainProfiler(mainProfilerName); +#endif + auto & sparams = params.sampling; // save choice to use color for later @@ -933,10 +948,20 @@ int main(int argc, char ** argv) { common_sampler_free(smpl); +#if defined (GGML_TSAVORITE_TARGET) + tsirt::utils::TSIProfiler::finalize(); + std::cout << "\nLLAMA SP Profiling Results:" << std::endl; + std::cout << tsirt::utils::TSIProfiler::getFormattedResults( + /*truncateFuncNames*/ true) + << std::endl; +#endif + llama_backend_free(); ggml_threadpool_free_fn(threadpool); 
ggml_threadpool_free_fn(threadpool_batch); + + return 0; }

From 5511ddb8f672a687145a193fc5b947ac47aacb05 Mon Sep 17 00:00:00 2001
From: Ashish Trivedi
Date: Tue, 27 May 2025 11:24:11 -0700
Subject: [PATCH 2/2] @FIR-708: Fixed the crash and moved profiling inside the tsavorite backend

This change does the following:
1. Removed the changes from examples/main/main.cpp
2. Added the profiling changes to ggml-tsavorite.cpp

--- examples/main/CMakeLists.txt | 1 - examples/main/main.cpp | 24 ---------------------- ggml/src/ggml-tsavorite/CMakeLists.txt | 2 +- ggml/src/ggml-tsavorite/ggml-tsavorite.cpp | 21 ++++++++++++++++--- 4 files changed, 19 insertions(+), 29 deletions(-) diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index e285c1f58538f..af3d9150f8640 100644 --- a/examples/main/CMakeLists.txt +++ b/examples/main/CMakeLists.txt @@ -1,5 +1,4 @@ set(TARGET llama-cli) -add_compile_options(--std=c++20) add_executable(${TARGET} main.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 2f9dfbbd2e015..06cc233924360 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -26,11 +26,6 @@ #include #include #endif -#define GGML_TSAVORITE_TARGET 1 -#if defined (GGML_TSAVORITE_TARGET) -#include "HostShimCAPI.h" -#include "tsi-rt/utils/Profiler.h" -#endif #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -46,8 +41,6 @@ static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; -using namespace std; -namespace tsirt = ::tsi::runtime; static void print_usage(int argc, char ** argv) { (void) argc; @@ -100,13 +93,6 @@ int main(int argc, char ** argv) { common_init(); -#if defined (GGML_TSAVORITE_TARGET) - std::string mainProfilerName = "LLAMA SP Main "; - tsirt::utils::TSIProfiler::initialize(); - - tsirt::utils::TSIScopedProfiler mainProfiler(mainProfilerName); -#endif - auto & sparams = params.sampling; // save choice to use color for later @@ -948,20 +934,10 @@ int main(int argc, char ** argv) { common_sampler_free(smpl); -#if defined (GGML_TSAVORITE_TARGET) - tsirt::utils::TSIProfiler::finalize(); - std::cout << "\nLLAMA SP Profiling Results:" << std::endl; - std::cout << tsirt::utils::TSIProfiler::getFormattedResults( - /*truncateFuncNames*/ true) - << std::endl; -#endif - llama_backend_free(); ggml_threadpool_free_fn(threadpool); ggml_threadpool_free_fn(threadpool_batch); - - return 0; } diff --git a/ggml/src/ggml-tsavorite/CMakeLists.txt b/ggml/src/ggml-tsavorite/CMakeLists.txt index 8aecc0d6778c1..1e156df10eb16 100644 --- a/ggml/src/ggml-tsavorite/CMakeLists.txt +++ b/ggml/src/ggml-tsavorite/CMakeLists.txt @@ -2,7 +2,7 @@ message(STATUS "Tsavorite framework is found") # # tsavorite Kernel Library file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "../../${GGML_TSI_KERNEL_DIR}/host/*.o") - +add_compile_options(--std=c++20) ggml_add_backend_library(ggml-tsavorite ggml-tsavorite.cpp ) diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp index a73e8b244998a..f8bec1b61b9bb 100644 --- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp +++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp @@ -20,10 +20,16 @@ #include #include #include - +#include #include "ggml-backend-impl.h" #include "ggml-impl.h" #include "ggml.h" +#include "HostShimCAPI.h" +#include "tsi-rt/utils/Profiler.h" + +using namespace std; +namespace tsirt = ::tsi::runtime; + typedef
struct _txe_device_t *txe_device_s; typedef struct _txe_compute_pipeline_state_t *txe_compute_pipeline_state_s; @@ -513,9 +519,12 @@ static struct ggml_backend_tsavorite_context *ggml_tsavorite_init(ggml_backend_d if (tsi_log_setup() == false) return NULL; - // TSI Run time Initalization - tsi_initialize(NUM_OF_TXES); + std::string mainProfilerName = "LLAMA SP Main "; + tsirt::utils::TSIProfiler::initialize(); + tsirt::utils::TSIScopedProfiler mainProfiler(mainProfilerName); + // TSI Run time Initalization + tsi_initialize(NUM_OF_TXES, NULL); // init context struct ggml_backend_tsavorite_context *ctx = (struct ggml_backend_tsavorite_context *)calloc( 1, sizeof(struct ggml_backend_tsavorite_context)); @@ -614,6 +623,12 @@ static void ggml_tsavorite_free(struct ggml_backend_tsavorite_context *ctx) { GGML_TSAVORITE_LOG_INFO("Delaying tsi_finalize for 2 sec"); sleep(2); tsi_finalize(); + tsirt::utils::TSIProfiler::finalize(); + std::cout << "\nLLAMA SP Profiling Results:" << std::endl; + std::cout << tsirt::utils::TSIProfiler::getFormattedResults( + /*truncateFuncNames*/ true) + << std::endl; + //tsi_finalize(); GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); }
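
Note: the following is a minimal, self-contained C++ sketch of the ordering issue the two patches deal with, not the real tsi-rt API. ToyProfiler, ToyScopedProfiler, and toy_tsi_finalize are invented stand-ins for TSIProfiler, TSIScopedProfiler, and tsi_finalize(). In the patch-1 layout, main.cpp called TSIProfiler::finalize() before llama_backend_free(); the stack trace shows tsi_finalize() then constructing a ScopedProfiler against the already-finalized profiler, which threw "Profiler not initialized!". Patch 2 moves initialization into ggml_tsavorite_init() and the finalize-and-dump after tsi_finalize() in ggml_tsavorite_free().

// ordering_sketch.cpp -- toy stand-ins only, not the tsi-rt implementation
#include <iostream>
#include <stdexcept>
#include <string>

struct ToyProfiler {
    static bool initialized;
    static void initialize() { initialized = true; }
    static void finalize()   { initialized = false; }
};
bool ToyProfiler::initialized = false;

struct ToyScopedProfiler {
    // The real ScopedProfiler constructor is where the patch-1 run threw.
    explicit ToyScopedProfiler(const std::string & name) {
        if (!ToyProfiler::initialized) {
            throw std::runtime_error("Profiler not initialized!");
        }
        std::cout << "profiled scope: " << name << "\n";
    }
};

// Stand-in for tsi_finalize(), which creates a scoped profiler internally.
static void toy_tsi_finalize() {
    ToyScopedProfiler scope("tsi_finalize");
}

int main() {
    // Patch-1 ordering: profiler finalized in main() before the backend
    // teardown reaches tsi_finalize() -> the scoped profiler throws.
    ToyProfiler::initialize();
    ToyProfiler::finalize();
    try {
        toy_tsi_finalize();
    } catch (const std::exception & e) {
        std::cout << "patch-1 ordering: " << e.what() << "\n";
    }

    // Patch-2 ordering: backend finalizes first, then the profiler is
    // finalized and its results dumped -> no throw.
    ToyProfiler::initialize();
    toy_tsi_finalize();
    ToyProfiler::finalize();
    std::cout << "patch-2 ordering: profiler finalized after tsi_finalize\n";
    return 0;
}

Built with g++ -std=c++20, the first ordering prints the same "Profiler not initialized!" message seen in the run above, while the second ordering completes cleanly.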