From 2192d0f8abeb3f655496204373dddc6b1b7a0c2c Mon Sep 17 00:00:00 2001
From: Ashish Trivedi
Date: Mon, 26 May 2025 21:52:38 -0700
Subject: [PATCH 1/2] @FIR-708: Added code to print out the profile information from runtime
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change adds the following:
1. Add std=c++20, which is needed to use the profiler from the runtime (the same profiler the aot-tests use)
2. Add initialization and dumping of profile information

There is one crash that still needs debugging, but the results look as follows:

register_backend: registered backend Tsavorite (1 devices) register_device: registered device Tsavorite (txe) register_backend: registered backend CPU (1 devices) register_device: registered device CPU (CPU) load_backend: failed to find ggml_backend_init in ./libggml-tsavorite.so load_backend: failed to find ggml_backend_init in ./libggml-cpu.so build: 4826 (5255890c) with gcc (GCC) 13.3.0 for x86_64-pc-linux-gnu (debug) main: llama backend init main: load the model and apply lora adapter, if any TXE Device MEMORY Summary total 134217728 and free 134217728 llama_model_load_from_file_impl: using device Tsavorite (txe) - 128 MiB free llama_model_loader: loaded meta data with 38 key-value pairs and 201 tensors from /tsi/akapoor/ggml/Tiny-Llama-v0.3-FP32-1.1B-F32.gguf (version GGUF V3 (latest)) llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. llama_model_loader: - kv 0: general.architecture str = llama llama_model_loader: - kv 1: general.type str = model llama_model_loader: - kv 2: general.name str = Tiny Llama v0.3 FP32 llama_model_loader: - kv 3: general.size_label str = 1.1B llama_model_loader: - kv 4: general.license str = apache-2.0 llama_model_loader: - kv 5: general.dataset.count u32 = 3 llama_model_loader: - kv 6: general.dataset.0.name str = SlimPajama 627B llama_model_loader: - kv 7: general.dataset.0.organization str = Cerebras llama_model_loader: - kv 8: general.dataset.0.repo_url str = https://huggingface.co/cerebras/SlimP... llama_model_loader: - kv 9: general.dataset.1.name str = Starcoderdata llama_model_loader: - kv 10: general.dataset.1.organization str = Bigcode llama_model_loader: - kv 11: general.dataset.1.repo_url str = https://huggingface.co/bigcode/starco... llama_model_loader: - kv 12: general.dataset.2.name str = Oasst_Top1_2023 08 25 llama_model_loader: - kv 13: general.dataset.2.version str = 08-25 llama_model_loader: - kv 14: general.dataset.2.organization str = OpenAssistant llama_model_loader: - kv 15: general.dataset.2.repo_url str = https://huggingface.co/OpenAssistant/...
llama_model_loader: - kv 16: general.languages arr[str,1] = ["en"] llama_model_loader: - kv 17: llama.block_count u32 = 22 llama_model_loader: - kv 18: llama.context_length u32 = 2048 llama_model_loader: - kv 19: llama.embedding_length u32 = 2048 llama_model_loader: - kv 20: llama.feed_forward_length u32 = 5632 llama_model_loader: - kv 21: llama.attention.head_count u32 = 32 llama_model_loader: - kv 22: llama.attention.head_count_kv u32 = 4 llama_model_loader: - kv 23: llama.rope.freq_base f32 = 10000.000000 llama_model_loader: - kv 24: llama.attention.layer_norm_rms_epsilon f32 = 0.000010 llama_model_loader: - kv 25: general.file_type u32 = 0 llama_model_loader: - kv 26: llama.vocab_size u32 = 32003 llama_model_loader: - kv 27: llama.rope.dimension_count u32 = 64 llama_model_loader: - kv 28: tokenizer.ggml.model str = llama llama_model_loader: - kv 29: tokenizer.ggml.pre str = default llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,32003] = ["", "", "", "<0x00>", "<... llama_model_loader: - kv 31: tokenizer.ggml.scores arr[f32,32003] = [-1000.000000, -1000.000000, -1000.00... llama_model_loader: - kv 32: tokenizer.ggml.token_type arr[i32,32003] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ... llama_model_loader: - kv 33: tokenizer.ggml.bos_token_id u32 = 1 llama_model_loader: - kv 34: tokenizer.ggml.eos_token_id u32 = 2 llama_model_loader: - kv 35: tokenizer.ggml.unknown_token_id u32 = 0 llama_model_loader: - kv 36: tokenizer.ggml.padding_token_id u32 = 32000 llama_model_loader: - kv 37: general.quantization_version u32 = 2 llama_model_loader: - type f32: 201 tensors print_info: file format = GGUF V3 (latest) print_info: file type = all F32 print_info: file size = 4.10 GiB (32.00 BPW) load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect load: special tokens cache size = 6 load: token to piece cache size = 0.1684 MB print_info: arch = llama print_info: vocab_only = 0 print_info: n_ctx_train = 2048 print_info: n_embd = 2048 print_info: n_layer = 22 print_info: n_head = 32 print_info: n_head_kv = 4 print_info: n_rot = 64 print_info: n_swa = 0 print_info: n_embd_head_k = 64 print_info: n_embd_head_v = 64 print_info: n_gqa = 8 print_info: n_embd_k_gqa = 256 print_info: n_embd_v_gqa = 256 print_info: f_norm_eps = 0.0e+00 print_info: f_norm_rms_eps = 1.0e-05 print_info: f_clamp_kqv = 0.0e+00 print_info: f_max_alibi_bias = 0.0e+00 print_info: f_logit_scale = 0.0e+00 print_info: n_ff = 5632 print_info: n_expert = 0 print_info: n_expert_used = 0 print_info: causal attn = 1 print_info: pooling type = 0 print_info: rope type = 0 print_info: rope scaling = linear print_info: freq_base_train = 10000.0 print_info: freq_scale_train = 1 print_info: n_ctx_orig_yarn = 2048 print_info: rope_finetuned = unknown print_info: ssm_d_conv = 0 print_info: ssm_d_inner = 0 print_info: ssm_d_state = 0 print_info: ssm_dt_rank = 0 print_info: ssm_dt_b_c_rms = 0 print_info: model type = 1B print_info: model params = 1.10 B print_info: general.name = Tiny Llama v0.3 FP32 print_info: vocab type = SPM print_info: n_vocab = 32003 print_info: n_merges = 0 print_info: BOS token = 1 '' print_info: EOS token = 2 '' print_info: EOT token = 32002 '<|im_end|>' print_info: UNK token = 0 '' print_info: PAD token = 32000 '[PAD]' print_info: LF token = 13 '<0x0A>' print_info: EOG token = 2 '' print_info: EOG token = 32002 '<|im_end|>' print_info: max token length = 48 load_tensors: loading model tensors, this can take a while... 
(mmap = true) TXE Device MEMORY Summary total 134217728 and free 134217728 load_tensors: offloading 0 repeating layers to GPU load_tensors: offloaded 0/23 layers to GPU load_tensors: CPU_Mapped model buffer size = 4196.40 MiB .......................................................................................... llama_init_from_model: n_seq_max = 1 llama_init_from_model: n_ctx = 12288 llama_init_from_model: n_ctx_per_seq = 12288 llama_init_from_model: n_batch = 1024 llama_init_from_model: n_ubatch = 512 llama_init_from_model: flash_attn = 0 llama_init_from_model: freq_base = 10000.0 llama_init_from_model: freq_scale = 1 llama_init_from_model: n_ctx_pre_seq (12288) > n_ctx_train (2048) -- possible training context overflow [2018-03-09 12:35:52.849854] 272:273 [ info] :: TXE resource allocation request processed successfully. llama_kv_cache_init: kv_size = 12288, offload = 1, type_k = 'f16', type_v = 'f16', n_layer = 22, can_shift = 1 llama_kv_cache_init: CPU KV buffer size = 264.00 MiB llama_init_from_model: KV self size = 264.00 MiB, K (f16): 132.00 MiB, V (f16): 132.00 MiB llama_init_from_model: CPU output buffer size = 0.12 MiB ggml_backend_tsavorite_buffer_type_alloc_buffer is called from llama data Loader ANoop Allocating memory from tsi_alloc with size 8400896 Allocating memory from tsi_alloc with size 8400896 starting memory 0xfffe83cb3080 Address of Newly Created BUffer 0xfffe83cb3080 and size 8400896 llama_init_from_model: tsavorite compute buffer size = 8.01 MiB llama_init_from_model: CPU compute buffer size = 808.01 MiB llama_init_from_model: graph nodes = 710 llama_init_from_model: graph splits = 179 (with bs=512), 93 (with bs=1) common_init_from_params: setting dry_penalty_last_n to ctx_size = 12288 main: llama threadpool init, n_threads = 4 main: model was trained on only 2048 context tokens (12288 specified) system_info: n_threads = 4 (n_threads_batch = 4) / 4 | CPU : NEON = 1 | ARM_FMA = 1 | LLAMAFILE = 1 | AARCH64_REPACK = 1 | sampler seed: 2241389473 sampler params: repeat_last_n = 5, repeat_penalty = 1.500, frequency_penalty = 0.000, presence_penalty = 0.000 dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 12288 top_k = 50, top_p = 0.900, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.000 mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 sampler chain: logits -> logit-bias -> penalties -> dry -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist generate: n_ctx = 12288, n_batch = 1024, n_predict = 10, n_keep = 1 my cat’s name is Luna. 
I’m a software llama_perf_sampler_print: sampling time = 207.24 ms / 16 runs ( 12.95 ms per token, 77.21 tokens per second) llama_perf_context_print: load time = 85299.65 ms llama_perf_context_print: prompt eval time = 61885.34 ms / 6 tokens (10314.22 ms per token, 0.10 tokens per second) llama_perf_context_print: eval time = 528257.38 ms / 9 runs (58695.26 ms per token, 0.02 tokens per second) llama_perf_context_print: total time = 613808.39 ms / 15 tokens LLAMA SP Profiling Results: ------------------------------------------------------------------------------------------------------------------------ Calls Total(ms) T/call Self(ms) Function ------------------------------------------------------------------------------------------------------------------------ 0 0.000 0.000 0.000 [ 0%] LLAMA SP Main 21600 23152.000 1.072 0.000 └─ [ 4%] RuntimeHostShim::awaitCommandListCompletion 21280 33107.073 1.556 33107.073 └─ [ 5%] [ txe_mult_blob ] 320 500.108 1.563 500.108 └─ [ 0%] [ txe_add_blob ] 1 87.000 87.000 87.000 └─ [ 0%] RuntimeHostShim::initialize 21600 25.000 0.001 25.000 └─ [ 0%] RuntimeHostShim::loadBlob 21600 11.000 0.001 11.000 └─ [ 0%] RuntimeHostShim::finalizeCommandList 21600 7.000 0.000 7.000 └─ [ 0%] RuntimeHostShim::createCommandList 86400 6.000 0.000 6.000 └─ [ 0%] RuntimeHostShim::getShmemManager 21600 4.000 0.000 4.000 └─ [ 0%] RuntimeHostShim::deallocate 21601 3.000 0.000 3.000 └─ [ 0%] RuntimeHostShim::allocate 21600 3.000 0.000 3.000 └─ [ 0%] RuntimeHostShim::launchBlob 21600 3.000 0.000 3.000 └─ [ 0%] RuntimeHostShim::addCommandToList 21600 0.000 0.000 0.000 └─ [ 0%] RuntimeHostShim::unloadBlob ======================================================================================================================== 0 613845.000 0.000613845.000 [100%] TOTAL ======================================================================================================================== TXE_ADD Operation, total tensor: 10 Number of Kernel Call: 320 Number of tensor got spilt: 10 Min Num of Elem 2048 Max Num of Elem 2048 TXE_SUB Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_MULT Operation, total tensor: 450 Number of Kernel Call: 21280 Number of tensor got spilt: 450 Min Num of Elem 2048 Max Num of Elem 12288 TXE_DIV Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_SQRT Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_NEG Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_ABS Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_SIN Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_SIGMOID Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 terminate called after throwing an instance of 'std::runtime_error' what(): Profiler not initialized! 
Stack trace: ./llama-cli(tsi::runtime::utils::ScopedProfiler::ScopedProfiler(tsi::runtime::utils::ProfileLocation)+0x8c) [0x478d04] /usr/bin/tsi/v0.1.1.tsv30_05_24_2025/bin/aot-tests/lib/libTsavRTFPGAShimCAPI.so(tsi_finalize+0xb0) [0xffff93ce83d8] libggml-tsavorite.so(+0x57f0) [0xffff945c57f0] libggml-tsavorite.so(+0x7bb4) [0xffff945c7bb4] libggml-base.so(ggml_backend_free+0x28) [0xffff944d89d8] libllama.so(ggml_backend_deleter::operator()(ggml_backend*)+0x18) [0xffff949aa7a0] libllama.so(std::unique_ptr::~unique_ptr()+0x50) [0xffff949df3bc] libllama.so(void std::_Destroy >(std::unique_ptr*)+0x14) [0xffff94a0d32c] libllama.so(void std::_Destroy_aux::__destroy*>(std::unique_ptr*, std::unique_ptr*)+0x20) [0xffff94a0d1a8] libllama.so(void std::_Destroy*>(std::unique_ptr*, std::unique_ptr*)+0x1c) [0xffff94a0cc28] libllama.so(std::vector, std::allocator > >::~vector()+0x40) [0xffff94a0b0d4] libllama.so(llama_context::~llama_context()+0x78) [0xffff94a07ee0] libllama.so(llama_free+0x24) [0xffff94a05d3c] ./llama-cli() [0x472488] ./llama-cli() [0x478f60] ./llama-cli() [0x476300] ./llama-cli() [0x471074] /lib/libc.so.6(+0x22104) [0xffff933a2104] /lib/libc.so.6(__libc_start_main+0x9c) [0xffff933a21e4] ./llama-cli() [0x46c1b0] ./run_platform_test.sh: line 55: 559 Aborted ./llama-cli -p "my cat’s name" -m /tsi/akapoor/ggml/Tiny-Llama-v0.3-FP32-1.1B-F32.gguf --device tSavorite -c 12288 --temp 0.0 --n-predict 10 --repeat-penalty 1.5 -b 1024 --top-k 50 --top-p 0.9 --repeat-last-n 5 --no-warmup root@agilex7_dk_si_agf014ea:/usr/bin/tsi/v0.1.1.tsv30_05_24_2025/bin# --- examples/main/CMakeLists.txt | 1 + examples/main/main.cpp | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index af3d9150f8640..e285c1f58538f 100644 --- a/examples/main/CMakeLists.txt +++ b/examples/main/CMakeLists.txt @@ -1,4 +1,5 @@ set(TARGET llama-cli) +add_compile_options(--std=c++20) add_executable(${TARGET} main.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index acf79a89268fb..2f9dfbbd2e015 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -26,6 +26,11 @@ #include #include #endif +#define GGML_TSAVORITE_TARGET 1 +#if defined (GGML_TSAVORITE_TARGET) +#include "HostShimCAPI.h" +#include "tsi-rt/utils/Profiler.h" +#endif #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -41,6 +46,9 @@ static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; +using namespace std; +namespace tsirt = ::tsi::runtime; + static void print_usage(int argc, char ** argv) { (void) argc; @@ -92,6 +100,13 @@ int main(int argc, char ** argv) { common_init(); +#if defined (GGML_TSAVORITE_TARGET) + std::string mainProfilerName = "LLAMA SP Main "; + tsirt::utils::TSIProfiler::initialize(); + + tsirt::utils::TSIScopedProfiler mainProfiler(mainProfilerName); +#endif + auto & sparams = params.sampling; // save choice to use color for later @@ -933,10 +948,20 @@ int main(int argc, char ** argv) { common_sampler_free(smpl); +#if defined (GGML_TSAVORITE_TARGET) + tsirt::utils::TSIProfiler::finalize(); + std::cout << "\nLLAMA SP Profiling Results:" << std::endl; + std::cout << tsirt::utils::TSIProfiler::getFormattedResults( + /*truncateFuncNames*/ true) + << std::endl; +#endif + llama_backend_free(); ggml_threadpool_free_fn(threadpool); 
ggml_threadpool_free_fn(threadpool_batch); + + return 0; }

From 5511ddb8f672a687145a193fc5b947ac47aacb05 Mon Sep 17 00:00:00 2001
From: Ashish Trivedi
Date: Tue, 27 May 2025 11:24:11 -0700
Subject: [PATCH 2/2] @FIR-708: Fixed the crash and moved profiling inside the tsavorite backend

This change does the following:
1. Removed the changes from examples/main/main.cpp
2. Added the profiling changes to ggml-tsavorite.cpp

--- examples/main/CMakeLists.txt | 1 - examples/main/main.cpp | 24 ---------------------- ggml/src/ggml-tsavorite/CMakeLists.txt | 2 +- ggml/src/ggml-tsavorite/ggml-tsavorite.cpp | 21 ++++++++++++++++--- 4 files changed, 19 insertions(+), 29 deletions(-) diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index e285c1f58538f..af3d9150f8640 100644 --- a/examples/main/CMakeLists.txt +++ b/examples/main/CMakeLists.txt @@ -1,5 +1,4 @@ set(TARGET llama-cli) -add_compile_options(--std=c++20) add_executable(${TARGET} main.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 2f9dfbbd2e015..06cc233924360 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -26,11 +26,6 @@ #include #include #endif -#define GGML_TSAVORITE_TARGET 1 -#if defined (GGML_TSAVORITE_TARGET) -#include "HostShimCAPI.h" -#include "tsi-rt/utils/Profiler.h" -#endif #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -46,8 +41,6 @@ static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; -using namespace std; -namespace tsirt = ::tsi::runtime; static void print_usage(int argc, char ** argv) { (void) argc; @@ -100,13 +93,6 @@ int main(int argc, char ** argv) { common_init(); -#if defined (GGML_TSAVORITE_TARGET) - std::string mainProfilerName = "LLAMA SP Main "; - tsirt::utils::TSIProfiler::initialize(); - - tsirt::utils::TSIScopedProfiler mainProfiler(mainProfilerName); -#endif - auto & sparams = params.sampling; // save choice to use color for later @@ -948,20 +934,10 @@ int main(int argc, char ** argv) { common_sampler_free(smpl); -#if defined (GGML_TSAVORITE_TARGET) - tsirt::utils::TSIProfiler::finalize(); - std::cout << "\nLLAMA SP Profiling Results:" << std::endl; - std::cout << tsirt::utils::TSIProfiler::getFormattedResults( - /*truncateFuncNames*/ true) - << std::endl; -#endif - llama_backend_free(); ggml_threadpool_free_fn(threadpool); ggml_threadpool_free_fn(threadpool_batch); - - return 0; } diff --git a/ggml/src/ggml-tsavorite/CMakeLists.txt b/ggml/src/ggml-tsavorite/CMakeLists.txt index 8aecc0d6778c1..1e156df10eb16 100644 --- a/ggml/src/ggml-tsavorite/CMakeLists.txt +++ b/ggml/src/ggml-tsavorite/CMakeLists.txt @@ -2,7 +2,7 @@ message(STATUS "Tsavorite framework is found") # # tsavorite Kernel Library file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "../../${GGML_TSI_KERNEL_DIR}/host/*.o") - +add_compile_options(--std=c++20) ggml_add_backend_library(ggml-tsavorite ggml-tsavorite.cpp ) diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp index a73e8b244998a..f8bec1b61b9bb 100644 --- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp +++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp @@ -20,10 +20,16 @@ #include #include #include - +#include #include "ggml-backend-impl.h" #include "ggml-impl.h" #include "ggml.h" +#include "HostShimCAPI.h" +#include "tsi-rt/utils/Profiler.h" + +using namespace std; +namespace tsirt = ::tsi::runtime; + typedef
struct _txe_device_t *txe_device_s; typedef struct _txe_compute_pipeline_state_t *txe_compute_pipeline_state_s; @@ -513,9 +519,12 @@ static struct ggml_backend_tsavorite_context *ggml_tsavorite_init(ggml_backend_d if (tsi_log_setup() == false) return NULL; - // TSI Run time Initalization - tsi_initialize(NUM_OF_TXES); + std::string mainProfilerName = "LLAMA SP Main "; + tsirt::utils::TSIProfiler::initialize(); + tsirt::utils::TSIScopedProfiler mainProfiler(mainProfilerName); + // TSI Run time Initalization + tsi_initialize(NUM_OF_TXES, NULL); // init context struct ggml_backend_tsavorite_context *ctx = (struct ggml_backend_tsavorite_context *)calloc( 1, sizeof(struct ggml_backend_tsavorite_context)); @@ -614,6 +623,12 @@ static void ggml_tsavorite_free(struct ggml_backend_tsavorite_context *ctx) { GGML_TSAVORITE_LOG_INFO("Delaying tsi_finalize for 2 sec"); sleep(2); tsi_finalize(); + tsirt::utils::TSIProfiler::finalize(); + std::cout << "\nLLAMA SP Profiling Results:" << std::endl; + std::cout << tsirt::utils::TSIProfiler::getFormattedResults( + /*truncateFuncNames*/ true) + << std::endl; + //tsi_finalize(); GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); }
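
Note: the following is a minimal, self-contained C++ sketch of the ordering issue the two patches deal with, not the real tsi-rt API. ToyProfiler, ToyScopedProfiler, and toy_tsi_finalize are invented stand-ins for TSIProfiler, TSIScopedProfiler, and tsi_finalize(). In the patch-1 layout, main.cpp called TSIProfiler::finalize() before llama_backend_free(); the stack trace shows tsi_finalize() then constructing a ScopedProfiler against the already-finalized profiler, which threw "Profiler not initialized!". Patch 2 moves initialization into ggml_tsavorite_init() and the finalize-and-dump after tsi_finalize() in ggml_tsavorite_free().

// ordering_sketch.cpp -- toy stand-ins only, not the tsi-rt implementation
#include <iostream>
#include <stdexcept>
#include <string>

struct ToyProfiler {
    static bool initialized;
    static void initialize() { initialized = true; }
    static void finalize()   { initialized = false; }
};
bool ToyProfiler::initialized = false;

struct ToyScopedProfiler {
    // The real ScopedProfiler constructor is where the patch-1 run threw.
    explicit ToyScopedProfiler(const std::string & name) {
        if (!ToyProfiler::initialized) {
            throw std::runtime_error("Profiler not initialized!");
        }
        std::cout << "profiled scope: " << name << "\n";
    }
};

// Stand-in for tsi_finalize(), which creates a scoped profiler internally.
static void toy_tsi_finalize() {
    ToyScopedProfiler scope("tsi_finalize");
}

int main() {
    // Patch-1 ordering: profiler finalized in main() before the backend
    // teardown reaches tsi_finalize() -> the scoped profiler throws.
    ToyProfiler::initialize();
    ToyProfiler::finalize();
    try {
        toy_tsi_finalize();
    } catch (const std::exception & e) {
        std::cout << "patch-1 ordering: " << e.what() << "\n";
    }

    // Patch-2 ordering: backend finalizes first, then the profiler is
    // finalized and its results dumped -> no throw.
    ToyProfiler::initialize();
    toy_tsi_finalize();
    ToyProfiler::finalize();
    std::cout << "patch-2 ordering: profiler finalized after tsi_finalize\n";
    return 0;
}

Built with g++ -std=c++20, the first ordering prints the same "Profiler not initialized!" message seen in the run above, while the second ordering completes cleanly.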