diff --git a/.gitmodules b/.gitmodules index 23ce5ff059b1b..001504ec9ed07 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "kompute"] path = ggml/src/ggml-kompute/kompute url = https://github.com/nomic-ai/kompute.git +[submodule "ggml-tsi-kernel"] + path = ggml-tsi-kernel + url = git@github.com:tsisw/ggml-tsi-kernel.git diff --git a/CMakeLists.txt b/CMakeLists.txt index ac3e9090336d9..96a8a393817d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,10 +5,59 @@ include(CheckIncludeFileCXX) #set(CMAKE_WARN_DEPRECATED YES) set(CMAKE_WARN_UNUSED_CLI YES) +if (GGML_TSAVORITE) + if (NOT DEFINED GGML_TSAVORITE_TARGET) + set(GGML_TSAVORITE_TARGET "posix") + endif() + if (NOT ${GGML_TSAVORITE_TARGET} STREQUAL fpga) + set(GGML_TSAVORITE_TARGET "posix") + endif() + + if (NOT DEFINED MLIR_COMPILER_DIR) + if (NOT DEFINED ENV{MLIR_SDK_VERSION}) + set (MLIR_COMPILER_DIR /proj/work/rel/sw/sdk-r.0.1.1/compiler) + else() + set (MLIR_COMPILER_DIR $ENV{MLIR_SDK_VERSION}/compiler) + endif() + endif() + + if (NOT DEFINED RUNTIME_DIR) + if (NOT DEFINED ENV{MLIR_SDK_VERSION}) + set (RUNTIME_DIR /proj/work/rel/sw/sdk-r.0.1.0/${GGML_TSAVORITE_TARGET}/runtime) + else() + set (RUNTIME_DIR $ENV{MLIR_SDK_VERSION}/${GGML_TSAVORITE_TARGET}/runtime) + endif() + endif() + + if (NOT DEFINED GGML_TSI_KERNEL_DIR) + set (GGML_TSI_KERNEL_DIR ${CMAKE_SOURCE_DIR}/ggml-tsi-kernel/${GGML_TSAVORITE_TARGET}) + endif() + + file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o") + + if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) + set(CMAKE_CROSSCOMPILING ON) + set(ARCH_FLAGS -march=armv8-a) + message("Setting target as fpga") + elseif (${GGML_TSAVORITE_TARGET} STREQUAL "posix") + list(APPEND TLIBS "${MLIR_COMPILER_DIR}/lib/libFFMDeviceShim.so") + message("Setting target as posix for tsavorite") + endif() + + set(GGML_TSAVORITE_TARGET "${GGML_TSAVORITE_TARGET}" CACHE STRING "Target for tsavorite") + set (TSAVORITE_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/ggml/src/ggml-tsavorite/include) + + include_directories(${TSAVORITE_INCLUDE_DIR}) + include_directories(${MLIR_COMPILER_DIR}/include/runtime/shim) + include_directories(${RUNTIME_DIR}/include) + message("tsavorite backend is enabled") +endif() + set(CMAKE_EXPORT_COMPILE_COMMANDS ON) if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + #set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() @@ -82,9 +131,18 @@ option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) # 3rd party libs -option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON) option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF) +if (GGML_TSAVORITE) + if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) + option(LLAMA_CURL "llama: use libcurl to download model from a URL" OFF) + else() + option(LLAMA_CURL "llama: use libcurl to download model from a URL" ON) + endif() +else() + option(LLAMA_CURL "llama: use libcurl to download model from a URL" ON) +endif() + # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index
a7ff3ac16c446..9eafc9bb2b659 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -145,8 +145,16 @@ endif () target_include_directories(${TARGET} PUBLIC .) target_compile_features (${TARGET} PUBLIC cxx_std_17) -target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) +if (GGML_TSAVORITE) + if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) + target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${TLIBS} PUBLIC llama Threads::Threads) + else() + target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) + endif() +else() + target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) +endif() # # copy the license files diff --git a/examples/gguf-hash/CMakeLists.txt b/examples/gguf-hash/CMakeLists.txt index 15c5c68c6f402..0d9272b663d1a 100644 --- a/examples/gguf-hash/CMakeLists.txt +++ b/examples/gguf-hash/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET llama-gguf-hash) add_executable(${TARGET} gguf-hash.cpp) +target_link_libraries(${TARGET} PRIVATE ${TLIBS}) install(TARGETS ${TARGET} RUNTIME) # clibs dependencies diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt index fb04eb83f34ce..48365a0b054ce 100644 --- a/examples/gguf/CMakeLists.txt +++ b/examples/gguf/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-gguf) add_executable(${TARGET} gguf.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT} ${TLIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/lookup/CMakeLists.txt b/examples/lookup/CMakeLists.txt index fba78ceda6fd7..f7626a45dedd8 100644 --- a/examples/lookup/CMakeLists.txt +++ b/examples/lookup/CMakeLists.txt @@ -1,23 +1,23 @@ set(TARGET llama-lookup) add_executable(${TARGET} lookup.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-lookup-create) add_executable(${TARGET} lookup-create.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-lookup-merge) add_executable(${TARGET} lookup-merge.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-lookup-stats) add_executable(${TARGET} lookup-stats.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/simple-chat/CMakeLists.txt b/examples/simple-chat/CMakeLists.txt index 567f7fbbbf43a..cdf65e58a9d7d 100644 --- a/examples/simple-chat/CMakeLists.txt +++ b/examples/simple-chat/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-simple-chat) add_executable(${TARGET} simple-chat.cpp) install(TARGETS ${TARGET} RUNTIME) 
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt index 104ecabfd7236..a87dac20c82da 100644 --- a/examples/simple/CMakeLists.txt +++ b/examples/simple/CMakeLists.txt @@ -1,5 +1,24 @@ +# +# llama-simple set(TARGET llama-simple) add_executable(${TARGET} simple.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama ${TLIBS} ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +# +if (GGML_TSAVORITE) + # + # tsavorite backend test cases + if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) + file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "../../${GGML_TSI_KERNEL_DIR}/host/*.o") + else() + file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${MLIR_COMPILER_DIR}/lib/libFFMDeviceShim.so" "../../${GGML_TSI_KERNEL_DIR}/host/*.o") + endif() + # + # simple-backend-tsi + + set(TEST_TARGET simple-backend-tsi) + add_executable(${TEST_TARGET} simple-backend-tsi.cpp) + target_link_libraries(${TEST_TARGET} PRIVATE ggml ${TLIBS} dl rt) +endif() diff --git a/examples/simple/simple-backend-tsi.cpp b/examples/simple/simple-backend-tsi.cpp new file mode 100644 index 0000000000000..2f56f34168062 --- /dev/null +++ b/examples/simple/simple-backend-tsi.cpp @@ -0,0 +1,578 @@ +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "ggml-tsavorite.h" + +#include <algorithm> +#include <cmath> +#include <cstdint> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <limits> +#include <vector> + +#define NUM_INPUT_TENSORS 2 +#define NUM_INPUT_UNARY_TENSORS 1 +#define NUM_ELEMENTS 32 +#define NUM_ELEMENTS_SCALE (32*4 + 25) + +// index 0 for addition, index 1 for subtraction, index 2 for multiplication, index 3 for division, +// index 4 for sqrt, index 5 for neg, index 6 for abs, index 7 for sin +float test_input_1[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS] = { + //ADD KERNEL + {1.1, 2.3, 3.2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //SUB KERNEL + {2.2, 10.3, 10.4, 2.2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //MULT KERNEL + {1.1, 2.3, 3.2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //DIV KERNEL + {1.1, 4.4, 10, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + // SQRT Kernel + {1, 4, 9.6, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361, 400, 441, 484, 529, 576, 625, 676, 729, 784, 841, 900, 961, 1024}, + //NEG Kernel + {1.1, -4.4, 10, -5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, -23, 24, 25, -26, 27, -28, 29, -30, 31, -32.6}, + //ABS Kernel + {1.1, -4.4, 10, -5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, -23, 24, 25, -26, 27, -28, 29, -30, 31, -32.6}, + //SIN Kernel + {1.1, 4.4, 10, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 20, 20, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32.6} +}; +float test_input_2[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS] = { + //ADD KERNEL + {1.1, 2.2, 3.3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //SUB KERNEL + {1.1, 2.2, 3.0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32}, + //MULT KERNEL + {1.1, 2.2, 3.3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //DIV KERNEL + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //Below ROW value not used for Unary OPS-SQRT, NEG, ABS, SIN + //SQRT KERNEL input not used + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //NEG KERNEL input not used + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //ABS KERNEL input not used + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //SIN Kernel input not used + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32} +}; + +float test_result[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS] = { + //ADD KERNEL + {2.20, 4.50, 6.50, 8.00, 10.00, 12.00, 14.00, 16.00, 18.00, 20.00, 22.00, 24.00, 26.00, 28.00, 30.00, 32.00, 34.00, 36.00, 38.00, 40.00, 42.00, 44.00, 46.00, 48.00, 50.00, 52.00, 54.00, 56.00, 58.00, 60.00, 62.00, 64.00}, + //SUB KERNEL + {1.1, 8.1, 7.4, -1.8, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00}, + //MULT KERNEL + {1.21, 5.06, 10.56, 16.00, 25.00, 36.00, 49.00, 64.00, 81.00, 100.00, 121.00, 144.00, 169.00, 196.00, 225.00, 256.00, 289.00, 324.00, 361.00, 400.00, 441.00, 484.00, 529.00, 576.00, 625.00, 676.00, 729.00, 784.00, 841.00, 900.00, 961.00, 1024.00}, + //DIV KERNEL + {1.0, 2.0, 2, 0.5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //SQRT Kernel + {1, 2, 3.098387, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //NEG Kernel + {-1.1, 4.4, -10, 5, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, 23, -24, -25, 26, -27, 28, -29, 30, -31, 32.6}, + //ABS Kernel + {1.1, 4.4, 10, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32.6}, + //SIN Kernel + {0.891207, -0.951602, -0.544021, -0.958924, -0.958924, -0.279416, 0.656987, 0.989358, 0.412118, -0.544021, -0.999990, -0.536573, 0.420167, 0.990607, 0.650288, -0.287903, -0.961398, -0.750987, 0.149877, 0.912945, 0.912945, 0.912945, -0.846220, -0.905578, -0.132352, 0.762559, 0.956376, 0.270906, -0.663634, -0.988032, -0.404039, 0.926149} +}; + +float test_input_scale_1[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS_SCALE] = { + //ADD KERNEL + {1.3, 2.3, 3.3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + //SUB KERNEL + {8.5, 2.5, 3.5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 
27, 28, 29, 30, 31, 64, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 63, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 63, 32, + 4, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 63, 32, + 2, 4, 8, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + //MULT KERNEL + {1.5, 2.5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //DIV KERNEL + {4.2, 8.4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 4, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 4, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 4, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 4, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //SQRT KERNEL + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //NEG KERNEL + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //ABS KERNEL + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //SIN KERNEL + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} +}; + +float test_input_scale_2[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS_SCALE] = { + // ADD KERNEL + {1.3, 2.3, 
3.3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + // SUB KERNEL + {1, 8, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 6, 8, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + // MULT KERNEL + {2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // DIV KERNEL + {2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //Below ROW value not used for Unary OPS-SQRT, NEG, ABS, SIN + //SQRT KERNEL input not used + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //NEG KERNEL input not used + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //ABS KERNEL input not used + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //SIN KERNEL input not used + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} +}; +float test_result_scale[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS_SCALE] = { + // ADD KERNEL + {2.6, 4.6, 6.6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50}, + // SUB KERNEL + {7.5, -5.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, + -5, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, + 1, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + // MULT KERNEL + {3, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // DIV KERNEL + {2.1, 4.2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // SQRT KERNEL + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 4, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // NEG KERNEL + {1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 9, -4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 16, -25, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + // ABS KERNEL + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // SIN KERNEL + {-0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + -0.412118,-0.756802, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.287903,-0.132352, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + -0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + -0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471} +}; + +// This is a simple model with two tensors a and b +struct simple_model { + struct ggml_tensor * a; + struct ggml_tensor * b; + + // the backend to perform the computation (TSAVORITE) + ggml_backend_t backend = NULL; + + // the backend buffer to store the tensor data of a and b + ggml_backend_buffer_t buffer; + + // the context to define the tensor information (dimensions, size, memory address) + struct ggml_context * ctx; +}; + + +static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + fputs(text, stderr); + fflush(stderr); +} + +static bool ggml_tsi_compare_two_float(float a, float b) { + float epsilon = 1e-5f; + float absA = std::fabs(a); + float absB = std::fabs(b); + float diff = std::fabs(a - b); + float minV = std::numeric_limits<float>::min(); + float maxV = std::numeric_limits<float>::max(); + + if (a == b) { // shortcut, handles infinities + return true; + } else if (a == 0 || b == 0 || (absA + absB < minV)) { + // a or b is zero or both are extremely close to it; + // relative error is less meaningful here + return diff < (epsilon * minV); + } + // use relative error + return diff / std::min((absA + absB), maxV) < epsilon; +}
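+ +// A quick sanity check of the tolerance above (illustrative values, not part of the test vectors): +// ggml_tsi_compare_two_float(1.0f, 1.000001f) -> true (relative error ~5e-7 < 1e-5) +// ggml_tsi_compare_two_float(1.0f, 1.01f) -> false (relative error ~5e-3)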
+ + +static bool load_model(simple_model & model, float * a, float * b, enum ggml_type data_type, int elements_A, int elements_B) { + ggml_log_set(ggml_log_callback_default, nullptr); + + // initialize the backend + fprintf(stderr, "%s: using TSavorite backend \n", __func__); + model.backend = ggml_backend_tsavorite_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_tsavorite_init() failed\n", __func__); + return false; + } + + int num_tensors; + + if (!b) + num_tensors = NUM_INPUT_UNARY_TENSORS; + else + num_tensors = NUM_INPUT_TENSORS; + + // Since we do not pass mem_buffer, the ggml context allocates it: + /* .mem_buffer = params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size) */ + // The context mem_buffer is used for object creation, and for tensor data only when the + // backend has no memory of its own. + // Since we are using backend memory, the extra 100 bytes were removed from mem_size below. + struct ggml_init_params params { + /*.mem_size =*/ (ggml_tensor_overhead() * num_tensors), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + fprintf(stderr, "\n Calculating mem_size (tensor overhead %zu x %d tensors) and creating ggml context \n", ggml_tensor_overhead(), num_tensors); + + // create context + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init failed\n", __func__); + return false; + } + + // create tensors + // The code below needs no change for the tsavorite backend: + // a tensor is created here as OBJ (structure) + tensor (structure) only. + // A buffer still needs to be attached to each tensor since we are using a backend; + // the memory comes from tsi_alloc, called inside the tsavorite backend. + + fprintf(stderr, "\n Creating input Tensor \n"); + + //int64_t ne[GGML_MAX_DIMS]; // number of elements + //size_t nb[GGML_MAX_DIMS]; // stride in bytes: + model.a = ggml_new_tensor_1d(model.ctx, data_type, elements_A); + if (b) + model.b = ggml_new_tensor_1d(model.ctx, data_type, elements_B); + + // create a backend buffer (backend memory) and alloc the tensors from the context + fprintf(stderr, "\n Creating Backend Buffer \n"); + + // The ggml context holds only the input tensors, hence backend memory is + // created just for those tensors + model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend); + + // load data from cpu memory to backend buffer + fprintf(stderr, "\n Loading Input Tensor Data to Backend Buffer \n"); + + // loading the data to tensor + ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a)); + if (b) + ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b)); + + // create an array to print the input tensor + std::vector<float> out_data(ggml_nelements(model.a)); + // bring the data from the backend memory + ggml_backend_tensor_get(model.a, out_data.data(), 0, ggml_nbytes(model.a)); + + + fprintf(stderr, "\nBringing tensor data from Backend buffer and printing %d tensor data:\n[", (int) model.a->ne[0]); + + for (int i = 0; i < model.a->ne[0] /* cols */; i++) { + fprintf(stderr, " %.2f", out_data[i]); + } + fprintf(stderr, " ]\n"); + return true; +} +
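+// The helpers below follow the usual ggml backend-example flow; a rough sketch of how main() +// uses them (see main() further down for the real sequence): +// struct ggml_cgraph * gf = build_graph(model, ops_type); // record the op in a throwaway context +// ggml_gallocr_reserve(allocr, gf); // size the compute buffer +// struct ggml_tensor * result = compute(model, allocr, ops_type); // allocate the graph and run the backend +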
+// build the compute graph +static struct ggml_cgraph * build_graph(const simple_model& model, enum ggml_tsavorite_kernel_type ops_type) { + static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); + static std::vector<uint8_t> buf(buf_size); + + struct ggml_init_params params0 = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph() + }; + + // create a temporary context to build the graph + struct ggml_context * ctx0 = ggml_init(params0); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + + struct ggml_tensor * result; + switch(ops_type) { + case GGML_TSAVORITE_KERNEL_TYPE_ADD: + result = ggml_add(ctx0, model.a, model.b); + break; + case GGML_TSAVORITE_KERNEL_TYPE_SUB: + result = ggml_sub(ctx0, model.a, model.b); + break; + case GGML_TSAVORITE_KERNEL_TYPE_MULT: + result = ggml_mul(ctx0, model.a, model.b); + break; + case GGML_TSAVORITE_KERNEL_TYPE_DIV: + result = ggml_div(ctx0, model.a, model.b); + break; + case GGML_TSAVORITE_KERNEL_TYPE_SQRT: + result = ggml_sqrt(ctx0, model.a); + break; + case GGML_TSAVORITE_KERNEL_TYPE_NEG: + result = ggml_neg(ctx0, model.a); + break; + case GGML_TSAVORITE_KERNEL_TYPE_ABS: + result = ggml_abs(ctx0, model.a); + break; + case GGML_TSAVORITE_KERNEL_TYPE_SIN: + result = ggml_sin(ctx0, model.a); + break; + default: + ggml_free(ctx0); + fprintf(stderr, "\n Unsupported Operation \n"); + return NULL; + } + // build operations nodes + ggml_build_forward_expand(gf, result); + + // delete the temporary context used to build the graph + ggml_free(ctx0); + return gf; +} + +// compute with backend +static struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr, enum ggml_tsavorite_kernel_type ops_type) { + // reset the allocator to free all the memory allocated during the previous inference + + fprintf(stderr, "\n compute API: building the graph \n"); + struct ggml_cgraph * gf = build_graph(model, ops_type); + if (!gf) { + fprintf(stderr, "\ncompute failed\n"); + return NULL; + } + + // allocate tensors + ggml_gallocr_alloc_graph(allocr, gf); + + ggml_backend_graph_compute(model.backend, gf); + + // in this case, the output tensor is the last one in the graph + return ggml_graph_node(gf, -1); +} + +enum ggml_tsavorite_kernel_type convert_testcase_to_ops_type (const char *testCase) { + if (!strcmp(testCase,"add")) + return GGML_TSAVORITE_KERNEL_TYPE_ADD; + else if (!strcmp(testCase,"sub")) + return GGML_TSAVORITE_KERNEL_TYPE_SUB; + else if (!strcmp(testCase,"mult")) + return GGML_TSAVORITE_KERNEL_TYPE_MULT; + else if (!strcmp(testCase,"div")) + return GGML_TSAVORITE_KERNEL_TYPE_DIV; + else if (!strcmp(testCase,"sqrt")) + return GGML_TSAVORITE_KERNEL_TYPE_SQRT; + else if (!strcmp(testCase,"neg")) + return GGML_TSAVORITE_KERNEL_TYPE_NEG; + else if (!strcmp(testCase,"abs")) + return GGML_TSAVORITE_KERNEL_TYPE_ABS; + else if (!strcmp(testCase,"sin")) + return GGML_TSAVORITE_KERNEL_TYPE_SIN; + + fprintf(stderr, "\n unsupported test case %s, hence running the default test case, which is the add operation \n", testCase); + return GGML_TSAVORITE_KERNEL_TYPE_ADD; +} +
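+// Usage sketch (target name taken from the CMake snippet above; both arguments are optional): +// ./simple-backend-tsi [add|sub|mult|div|sqrt|neg|abs|sin] [scale] +// With no arguments the add kernel runs on the 32-element inputs; passing "scale" as the +// second argument selects the larger NUM_ELEMENTS_SCALE inputs. +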
+int main(int argc, char *argv[]) { + ggml_time_init(); + bool test_case_flag = true; + enum ggml_tsavorite_kernel_type ops_type; + simple_model model; + float *input1[GGML_TSAVORITE_KERNEL_TYPE_COUNT]; + float *input2[GGML_TSAVORITE_KERNEL_TYPE_COUNT] = {NULL}; // stays NULL for unary ops + float *result_data[GGML_TSAVORITE_KERNEL_TYPE_COUNT]; + bool data_scale = false; + + int elements_A=0, elements_B=0; + int num_of_input_tensors; + + if (argc > 1) { + ops_type = convert_testcase_to_ops_type(argv[1]); + if (argc > 2 && !strcmp(argv[2], "scale")) + data_scale = true; + } else { + // Default Case + ops_type = convert_testcase_to_ops_type("add"); + } + if (ops_type == GGML_TSAVORITE_KERNEL_TYPE_SQRT || + ops_type == GGML_TSAVORITE_KERNEL_TYPE_NEG || + ops_type == GGML_TSAVORITE_KERNEL_TYPE_ABS || + ops_type == GGML_TSAVORITE_KERNEL_TYPE_SIN) + num_of_input_tensors = NUM_INPUT_UNARY_TENSORS; + else + num_of_input_tensors = NUM_INPUT_TENSORS; + + if (data_scale) { + input1[ops_type] = test_input_scale_1[ops_type]; + elements_A = NUM_ELEMENTS_SCALE; + if (num_of_input_tensors != NUM_INPUT_UNARY_TENSORS) { + input2[ops_type] = test_input_scale_2[ops_type]; + elements_B = NUM_ELEMENTS_SCALE; + } + result_data[ops_type] = test_result_scale[ops_type]; + } else { + input1[ops_type] = test_input_1[ops_type]; + elements_A = NUM_ELEMENTS; + if (num_of_input_tensors != NUM_INPUT_UNARY_TENSORS) { + input2[ops_type] = test_input_2[ops_type]; + elements_B = NUM_ELEMENTS; + } + result_data[ops_type] = test_result[ops_type]; + } + + if (!load_model(model, input1[ops_type], input2[ops_type], GGML_TYPE_F32, elements_A, elements_B)) { + fprintf(stderr, "\n\n TEST CASE FAILED \n\n"); + return -1; + } + // the tsavorite backend init sets the debug level to none, so we override it here + ggml_tsavorite_log_type_val = GGML_TSAVORITE_LOG_DEBUG; + + ggml_gallocr_t allocr = NULL; + + allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); + + if (!allocr) { + fprintf(stderr, "\n\n TEST CASE FAILED \n\n"); + return -1; + } + + // create the worst case graph for memory usage estimation + struct ggml_cgraph * gf = build_graph(model, ops_type); + if (!gf) { + fprintf(stderr, "\n\n TEST CASE FAILED \n\n"); + return -1; + } + ggml_gallocr_reserve(allocr, gf); + size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0); + + fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0); + + // perform computation + struct ggml_tensor * result = compute(model, allocr, ops_type); + if (!result) { + fprintf(stderr, "\n\n TEST CASE FAILED \n\n"); + return -1; + } + fprintf(stderr, "\n Compute Done \n"); + + std::vector<float> out_data(ggml_nelements(result)); + + // bring the data from the backend memory + ggml_backend_tensor_get(result, out_data.data(), 0, ggml_nbytes(result)); + + // compare against the expected result + + fprintf(stderr, "\n operation type: %d, num of elements %d \n", ops_type, (int) result->ne[0]); + + for (int i = 0; i < result->ne[0] /* cols */; i++) { + if (ggml_tsi_compare_two_float(out_data[i], result_data[ops_type][i])) { + continue; + } + test_case_flag = false; + fprintf(stderr, "\n result for index %d does not match: expected %f, got %f \n", i, result_data[ops_type][i], out_data[i]); + } + + if (test_case_flag == false) { + fprintf(stderr, "\n\n TEST CASE FAILED \n\n"); + return -1; + } + fprintf(stderr, "\n\n TEST CASE PASSED \n\n"); + + // free memory + ggml_free(model.ctx); + + // release backend memory and free backend + //ggml_backend_buffer_free(model.buffer); + ggml_backend_free(model.backend); + return 0; +} diff --git a/ggml-tsi-kernel b/ggml-tsi-kernel new file mode 160000 index 0000000000000..f7a3ac1ee334c --- /dev/null +++ b/ggml-tsi-kernel @@ -0,0 +1 @@ +Subproject commit f7a3ac1ee334c242958ccb2053ecc4854822d87e diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 4746d5cb76c08..93a72d6cc84e4 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -267,6 +267,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-kompute.h include/ggml-opt.h include/ggml-metal.h + include/ggml-tsavorite.h include/ggml-rpc.h include/ggml-sycl.h include/ggml-vulkan.h diff --git a/ggml/include/ggml-tsavorite.h b/ggml/include/ggml-tsavorite.h new file mode 100644 index 0000000000000..cd380ddf61ed3 --- /dev/null +++ b/ggml/include/ggml-tsavorite.h @@ -0,0 +1,189 @@ +// ------------------------------------------------------------------------------ +// Copyright (c) 2023 Tsavorite Scalable Intelligence, Inc. All rights reserved. +// +// +// This file is the confidential and proprietary property of +// Tsavorite Scalable Intelligence, Inc +// +// Possession or use of this file requires a written license from +// Tsavorite Scalable Intelligence, Inc + +/****************************************************************************** + * File: ggml-tsavorite.h + * Author: TSI Inc + * + * Description: + * ***************************************************************************/ + +// +// +// Note: this description is outdated +// +// An interface for computing a ggml_cgraph on tSavorite devices +// +// This is a fully functional interface that extends ggml with hardware accelerator support for +// tSavorite devices. A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, +// etc.) +// +// How it works: +// +// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this +// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you +// use ggml_tsavorite_graph_compute() +// +// You only need to make sure that all memory buffers that you used during the graph creation +// are mapped to the device unified memory with the ggml_tsavorite_add_buffer() function. This +// mapping is used during the graph evaluation to determine the arguments of the compute kernels. +// +// Synchronization between device and host memory (for example for input and output tensors) +// is done with the ggml_tsavorite_set_tensor() and ggml_tsavorite_get_tensor() functions. +//
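+ +// A minimal usage sketch of the backend API declared below (for a complete, runnable +// version see examples/simple/simple-backend-tsi.cpp in this change): +// +// ggml_backend_t backend = ggml_backend_tsavorite_init(); +// ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend); // ctx holds the input tensors +// ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t)); // host -> device +// ggml_backend_graph_compute(backend, gf); // evaluate a ggml_cgraph +// ggml_backend_tensor_get(t, out, 0, ggml_nbytes(t)); // device -> host +// ggml_backend_free(backend); +//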
+ +#pragma once + +#include "ggml-backend.h" +#include "ggml.h" + +#include "TestModel.h" + +#include <stdint.h> +#include <stdio.h> + +#ifdef __cplusplus extern "C" { #endif + +#define TSAVORITE_KERNEL_SIZE 64 +#define TSAVORITE_DEVICE_MAX_BUF_LEN (1024 * 1024 * 128) + +enum ggml_tsavorite_input_tensors_count { + TSAVORITE_UNARY_INPUT_TENSORS = 1, + TSAVORITE_TWO_INPUT_TENSORS = 2 +}; + +enum ggml_tsavorite_log_type { + GGML_TSAVORITE_LOG_NONE, + GGML_TSAVORITE_LOG_CONT, + GGML_TSAVORITE_LOG_ERROR, + GGML_TSAVORITE_LOG_WARN, + GGML_TSAVORITE_LOG_DEBUG, + GGML_TSAVORITE_LOG_INFO, + GGML_TSAVORITE_LOG_ALL +}; + +enum ggml_tsavorite_kernel_mode { + GGML_TSAVORITE_KERNEL_MODE_CPU, + GGML_TSAVORITE_KERNEL_MODE_MLIR }; + +enum ggml_tsavorite_kernel_mode ggml_tsavorite_kernel_mode_flag = GGML_TSAVORITE_KERNEL_MODE_MLIR; +enum ggml_tsavorite_log_type ggml_tsavorite_log_type_val = GGML_TSAVORITE_LOG_ALL; +#define GGML_TSAVORITE_LOG_INFO(...) \ + do { \ + if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_INFO) { \ + ggml_log_internal(GGML_LOG_LEVEL_INFO, __VA_ARGS__); \ + } \ + } while (0) +#define GGML_TSAVORITE_LOG_DEBUG(...) \ + do { \ + if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_DEBUG) { \ + ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__); \ + } \ + } while (0) +#define GGML_TSAVORITE_LOG_WARN(...) \ + do { \ + if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_WARN) { \ + ggml_log_internal(GGML_LOG_LEVEL_WARN, __VA_ARGS__); \ + } \ + } while (0) +#define GGML_TSAVORITE_LOG_ERROR(...) \ + do { \ + if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_ERROR) { \ + ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__); \ + } \ + } while (0) +#define GGML_TSAVORITE_LOG_CONT(...)
\ + do { \ + if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_CONT) { \ + ggml_log_internal(GGML_LOG_LEVEL_CONT, __VA_ARGS__); \ + } \ + } while (0) + +enum ggml_tsavorite_tensor_data_type { + GGML_TSAVORITE_TENSOR_HEADER, + GGML_TSAVORITE_TENSOR_LEAF1, + GGML_TSAVORITE_TENSOR_LEAF2, + GGML_TSAVORITE_TENSOR_NODE, + GGML_TSAVORITE_TENSOR_END_DATA +}; + +enum ggml_tsavorite_kernel_type { + GGML_TSAVORITE_KERNEL_TYPE_ADD, + GGML_TSAVORITE_KERNEL_TYPE_SUB, + GGML_TSAVORITE_KERNEL_TYPE_MULT, + GGML_TSAVORITE_KERNEL_TYPE_DIV, + GGML_TSAVORITE_KERNEL_TYPE_SQRT, + GGML_TSAVORITE_KERNEL_TYPE_NEG, + GGML_TSAVORITE_KERNEL_TYPE_ABS, + GGML_TSAVORITE_KERNEL_TYPE_SIN, + GGML_TSAVORITE_KERNEL_TYPE_SIGMOID, + + GGML_TSAVORITE_KERNEL_TYPE_COUNT +}; + +// max memory buffers that can be mapped to the device +#define GGML_TSAVORITE_MAX_BUFFERS 64 + +// max number of TSAVORITECommandBuffer used to submit a graph for processing +#define GGML_TSAVORITE_MAX_COMMAND_BUFFERS 8 +#define tsi_nil 0 +#define TSI_UNUSED(x) (void)(x) + +typedef struct tensor_log_ { + uint32_t leaf1_len; + uint32_t leaf2_len; + uint32_t node_len; + enum ggml_tsavorite_tensor_data_type data_type; + enum ggml_tsavorite_kernel_type kernel_type; + uint64_t num_of_op; + FILE *log_file; + const ggml_tensor *tensor; +} tensor_log; + +extern void _mlir_ciface_txe_add(void *a, void *b, void *res); +extern void _mlir_ciface_txe_sub(void *a, void *b, void *res); +extern void _mlir_ciface_txe_mult(void *a, void *b, void *res); +extern void _mlir_ciface_txe_div(void *a, void *b, void *res); +extern void _mlir_ciface_txe_sqrt(void *a, void *res); +extern void _mlir_ciface_txe_neg(void *a, void *res); +extern void _mlir_ciface_txe_abs(void *a, void *res); +extern void _mlir_ciface_txe_sin(void *a, void *res); +extern void _mlir_ciface_txe_sigmoid(void *a, void *res); +extern void ggml_tsi_log_tensor_data(tensor_log log_data); + +#define NUM_OF_TXES 1 +#define MEM_REF_DESCRIPTOR_RANK 1 + +// +// backend API +// user-code should use only these functions +// + +GGML_BACKEND_API ggml_backend_t ggml_backend_tsavorite_init(void); + +GGML_BACKEND_API bool ggml_backend_is_tsavorite(ggml_backend_t backend); + +GGML_BACKEND_API void ggml_backend_tsavorite_set_abort_callback(ggml_backend_t backend, + ggml_abort_callback abort_callback, + void *user_data); + +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_tsavorite_buffer_type(void); + +// capture all command buffers committed the next time `ggml_backend_graph_compute` is called +GGML_BACKEND_API void ggml_backend_tsavorite_capture_next_compute(ggml_backend_t backend); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_tsavorite_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index ddea5ad3891e5..0a14bbb74ced7 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -308,6 +308,7 @@ ggml_add_backend(CUDA) ggml_add_backend(HIP) ggml_add_backend(Kompute) ggml_add_backend(METAL) +ggml_add_backend(TSAVORITE) ggml_add_backend(MUSA) ggml_add_backend(RPC) ggml_add_backend(SYCL) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 405d8e31514b5..f48a23bf83151 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -37,6 +37,10 @@ #include "ggml-metal.h" #endif +#ifdef GGML_USE_TSAVORITE +#include "ggml-tsavorite.h" +#endif + #ifdef GGML_USE_SYCL #include "ggml-sycl.h" #endif @@ -166,6 +170,11 @@ struct ggml_backend_registry { #ifdef GGML_USE_METAL 
register_backend(ggml_backend_metal_reg()); #endif + +#ifdef GGML_USE_TSAVORITE + register_backend(ggml_backend_tsavorite_reg()); +#endif + #ifdef GGML_USE_SYCL register_backend(ggml_backend_sycl_reg()); #endif @@ -572,6 +581,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) { ggml_backend_load_best("hip", silent, dir_path); ggml_backend_load_best("kompute", silent, dir_path); ggml_backend_load_best("metal", silent, dir_path); + ggml_backend_load_best("tsavorite", silent, dir_path); ggml_backend_load_best("rpc", silent, dir_path); ggml_backend_load_best("sycl", silent, dir_path); ggml_backend_load_best("vulkan", silent, dir_path); diff --git a/ggml/src/ggml-tsavorite/CMakeLists.txt b/ggml/src/ggml-tsavorite/CMakeLists.txt new file mode 100644 index 0000000000000..f58331fd68d30 --- /dev/null +++ b/ggml/src/ggml-tsavorite/CMakeLists.txt @@ -0,0 +1,8 @@ +message(STATUS "Tsavorite framework found") +# +# tsavorite Kernel Library +ggml_add_backend_library(ggml-tsavorite + ggml-tsavorite.cpp + ) + +target_link_libraries(ggml-tsavorite PRIVATE ${TLIBS} dl rt) diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp new file mode 100644 index 0000000000000..7939a0f8cfa13 --- /dev/null +++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp @@ -0,0 +1,1887 @@ +// ----------------------------------------------------------------------------- +// Copyright (c) 2023 Tsavorite Scalable Intelligence, Inc. All rights reserved. +// +// +// This file is the confidential and proprietary property of +// Tsavorite Scalable Intelligence, Inc +// +// Possession or use of this file requires a written license from +// Tsavorite Scalable Intelligence, Inc + +/****************************************************************************** + * File: ggml-tsavorite.cpp + * Author: TSI Inc + * + * Description: + * ***************************************************************************/ + +#include "ggml-tsavorite.h" +#include <cassert> +#include <cstdio> +#include <cstdlib> +#include <string> + +#include "ggml-backend-impl.h" +#include "ggml-impl.h" +#include "ggml.h" + +typedef struct _txe_device_t *txe_device_s; +typedef struct _txe_compute_pipeline_state_t *txe_compute_pipeline_state_s; +FILE *tsi_op_log_file; +uint64_t num_of_op; + +#ifdef USE_COMMAND_BUFFERS +typedef struct _txe_command_queue_t *txe_command_queue_s; +typedef struct _txe_dispatch_queue_t *txe_dispatch_queue_s; +typedef struct _txe_command_buffer_t *txe_command_buffer_s; +#endif /* USE_COMMAND_BUFFERS */ +typedef struct ggml_backend_tsavorite_buffer ggml_backend_tsavorite_buffer_s; + +struct _txe_device_t { + char name[100]; + uint32_t max_buf_len; + size_t recommended_max_working_set_size; + size_t current_allocated_size; + int reserved; + struct _stats { + struct _op_run_count { + // Each Kernel operation belongs to one tensor.
Below count will increment for each Node Tensor + uint64_t total_tensor_count; + // This counter increment whenever kernel call are made + uint64_t num_of_kernel_call; + // below field count all tensors whose num of elements are larger than kernel number of + // elements + uint64_t num_of_tensor_spilt; + // For Any application below field maintain smallest tensor num of elem + uint64_t min_num_of_elem; + // For Any application below field maintain largest tensor num of elem + uint64_t max_num_of_elem; + } op_run_count[GGML_TSAVORITE_KERNEL_TYPE_COUNT]; + } stats; +}; + +struct _txe_compute_pipeline_state_t { + void (*_mlir_fptr_2_input)(void *, void *, void *); + void (*_mlir_fptr_1_input)(void *, void *); + std::string kernel_name; + int reserved; +}; + +#ifdef USE_COMMAND_BUFFERS +struct _txe_command_queue_t { + int reserved; +}; + +struct _txe_dispatch_queue_t { + int reserved; +}; + +struct _txe_command_buffer_t { + int reserved; +}; +#endif /* USE_COMMAND_BUFFERS */ + +static txe_device_s tsi_system_default_device_create(); + +// kernels + +struct ggml_tsavorite_kernel { + txe_compute_pipeline_state_s pipeline; +}; + +struct ggml_backend_tsavorite_context { +#ifdef USE_COMMAND_BUFFERS + txe_command_queue_s queue; + + txe_dispatch_queue_s d_queue; +#endif /* USE_COMMAND_BUFFERS */ + + struct ggml_tsavorite_kernel kernels[GGML_TSAVORITE_KERNEL_TYPE_COUNT]; + + // capture state + bool capture_next_compute; + bool capture_started; + + // command buffer state + int n_cb; // number of extra threads used to submit the command buffers + int n_nodes_0; // number of nodes submitted by the main thread + int n_nodes_1; // remaining number of nodes submitted by the n_cb threads + int n_nodes_per_cb; + + struct ggml_cgraph *gf; + + // the callback given to the thread pool + // void (^encode_async)(size_t ith); + +#ifdef USE_COMMAND_BUFFERS + // n_cb command buffers + 1 used by the main thread + txe_command_buffer_s command_buffers[GGML_TSAVORITE_MAX_COMMAND_BUFFERS + 1]; +#endif /* USE_COMMAND_BUFFERS */ + + // abort ggml_tsavorite_graph_compute if callback returns true + ggml_abort_callback abort_callback; + void *abort_callback_data; + + // picking CPU compute example + int n_threads; + ggml_threadpool_t threadpool; + + uint8_t *work_data; + size_t work_size; +}; + +// global + +// initialized in ggml_backend_tsavorite_reg +static struct ggml_backend_reg g_ggml_backend_tsavorite_reg; +static struct ggml_backend_device g_ggml_backend_tsavorite_device; + +// information about a tSavorite device +// note: assumes single GPU device - the default one +// Need to Add Support for multiple GPU devices +static struct ggml_backend_tsavorite_device_context { + txe_device_s device; + int ref_count; + + char name[128]; +} g_ggml_ctx_dev_main = { + /*.device =*/tsi_nil, + /*.ref_count =*/0, + /*.name =*/"", +}; + +// temporarily defined here for compatibility between ggml-backend and the old API + +struct ggml_backend_tsavorite_buffer { + void *data; + size_t size; +}; + +struct ggml_backend_tsavorite_buffer_context { + void *all_data; + size_t all_size; + bool owned; + + // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap + int n_buffers; + ggml_backend_tsavorite_buffer_s buffers[GGML_TSAVORITE_MAX_BUFFERS]; +}; + +static txe_device_s tsi_system_default_device_create() { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_device_s device = (txe_device_s)malloc(sizeof(struct _txe_device_t)); + device->max_buf_len = TSAVORITE_DEVICE_MAX_BUF_LEN; + 
device->recommended_max_working_set_size = TSAVORITE_DEVICE_MAX_BUF_LEN; + device->current_allocated_size = 0; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return device; +} + +static void tsi_device_free(txe_device_s device) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + free(device); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return; +} + +#ifdef USE_COMMAND_BUFFERS +static txe_command_queue_s tsi_command_queue_create() { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_command_queue_s cqueue = (txe_command_queue_s)malloc(sizeof(struct _txe_command_queue_t)); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return cqueue; +} + +static txe_dispatch_queue_s tsi_dispatch_queue_create() { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_dispatch_queue_s dqueue = (txe_dispatch_queue_s)malloc(sizeof(struct _txe_dispatch_queue_t)); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return dqueue; +} + +static void tsi_command_queue_free(txe_command_queue_s cqueue) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (cqueue) + free(cqueue); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return; +} + +static void tsi_dispatch_queue_free(txe_dispatch_queue_s dqueue) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (dqueue) + free(dqueue); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return; +} +#endif /* USE_COMMAND_BUFFERS */ + +static void tsi_buffer_free(void *data) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (data) + free(data); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return; +} + +static bool tsi_log_setup() { + tsi_op_log_file = fopen("tsi-op.txt", "w+"); + if (tsi_op_log_file == NULL) { + printf("Error Creating or opening log file\n"); + return false; + } + return true; +} + +void ggml_tsi_log_tensor_data(tensor_log log_data) { + if (!log_data.log_file) { + GGML_TSAVORITE_LOG_ERROR("%s: error: log file Cant be NULL\n", __func__); + return; + } + + switch (log_data.data_type) { + case GGML_TSAVORITE_TENSOR_HEADER: + fprintf(log_data.log_file, "\n\n"); + fprintf(log_data.log_file, "#############################################################\n"); + fprintf(log_data.log_file, + "Tensor Number %ld and Type %d \n leaf1 len %d, leaf2 len %d, Node len %d\n", + log_data.num_of_op, log_data.kernel_type, log_data.leaf1_len, log_data.leaf2_len, + log_data.node_len); + fprintf(log_data.log_file, "############################################################\n"); + fprintf(log_data.log_file, "\n\n"); + fflush(log_data.log_file); + return; + case GGML_TSAVORITE_TENSOR_LEAF1: + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fprintf(log_data.log_file, "leaf1 Detail:\n"); + break; + case GGML_TSAVORITE_TENSOR_LEAF2: + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fprintf(log_data.log_file, "leaf2 Detail:\n"); + break; + case GGML_TSAVORITE_TENSOR_NODE: + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fprintf(log_data.log_file, "Node Detail:\n"); + break; + case GGML_TSAVORITE_TENSOR_END_DATA: + fprintf(log_data.log_file, "DONE WITH THIS OPERATION %ld\n", log_data.num_of_op); + fprintf(log_data.log_file, "############################################################\n"); + fprintf(log_data.log_file, "\n\n"); + fflush(log_data.log_file); + return; + default: + GGML_TSAVORITE_LOG_ERROR("%s: error: Invalid Data Type Passed\n", __func__); + return; + } + if (!log_data.tensor) { + 
GGML_TSAVORITE_LOG_ERROR("%s: error: tensor pointer is NULL\n", __func__); + return; + } + float *p; + int64_t count = (log_data.tensor->ne[0]) * (log_data.tensor->ne[1]) * (log_data.tensor->ne[2]) * + (log_data.tensor->ne[3]); + p = (float *)log_data.tensor->data; + if ((!p) || (count == 0)) { + fprintf(log_data.log_file, "\n\n"); + fprintf(log_data.log_file, "Tensor Data is Empty"); + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fprintf(log_data.log_file, "\n\n"); + fflush(log_data.log_file); + return; + } + fprintf(log_data.log_file, "%.16f ", p[0]); + for (int64_t ii = 1; ii < count; ++ii) { + if (!(ii % 4)) + fprintf(log_data.log_file, "\n"); + fprintf(log_data.log_file, "%.16f ", p[ii]); + } + fprintf(log_data.log_file, "\n\n"); + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fflush(log_data.log_file); + return; +} + +static void ggml_tsavorite_disp_stats(struct ggml_backend_tsavorite_context *ctx, + txe_device_s device) { + if (!ctx || !device) { + GGML_TSAVORITE_LOG_ERROR( + "At %s either the backend context or the device (or both) is NULL, hence can't display stats", + __func__); + return; + } + for (int i = 0; i < GGML_TSAVORITE_KERNEL_TYPE_COUNT; ++i) { + if (!ctx->kernels[i].pipeline) + continue; + GGML_TSAVORITE_LOG_CONT( + "\n %s Operation, total tensors: %lu Number of Kernel Calls: %lu Number of tensors " + "split: %lu Min Num of Elem %lu Max Num of Elem %lu \n", + ctx->kernels[i].pipeline->kernel_name.c_str(), + device->stats.op_run_count[i].total_tensor_count, + device->stats.op_run_count[i].num_of_kernel_call, + device->stats.op_run_count[i].num_of_tensor_spilt, + device->stats.op_run_count[i].min_num_of_elem, + device->stats.op_run_count[i].max_num_of_elem); + } + return; +} + +static void _mlir_ciface_txe_add_test (void *src0, void *src1, void *res) +{ + // src0, src1 and res are MemRefDescriptor pointers (MLIR C-interface memrefs); + // this is the CPU reference implementation of the add kernel + if (!src0 || !src1 || !res) + return; + + const int Rank = MEM_REF_DESCRIPTOR_RANK; + MemRefDescriptor *srcP0, *srcP1, *nodeP; + srcP0 = (MemRefDescriptor *)src0; + srcP1 = (MemRefDescriptor *)src1; + nodeP = (MemRefDescriptor *)res; + + uint32_t count = srcP0->shape[Rank - 1]; + float *s0 = (float*)srcP0->data; + float *s1 = (float*)srcP1->data; + float *n = (float*)nodeP->data; + + for (uint32_t i = 0; i < count; ++i) + n[i] = s0[i] + s1[i]; + //printf("\n Calling mlir_add cpu function-5 \n"); + return; +} + +static void _mlir_ciface_txe_mult_test (void *src0, void *src1, void *res) +{ + // CPU reference implementation of the mult kernel (same memref layout as above) + if (!src0 || !src1 || !res) + return; + + const int Rank = MEM_REF_DESCRIPTOR_RANK; + MemRefDescriptor *srcP0, *srcP1, *nodeP; + srcP0 = (MemRefDescriptor *)src0; + srcP1 = (MemRefDescriptor *)src1; + nodeP = (MemRefDescriptor *)res; + + uint32_t count = srcP0->shape[Rank - 1]; + float *s0 = (float*)srcP0->data; + float *s1 = (float*)srcP1->data; + float *n = (float*)nodeP->data; + + for (uint32_t i = 0; i < count; ++i) + n[i] = s0[i] * s1[i]; + return; +} + +static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_type kernel_type) { + txe_compute_pipeline_state_s kernel_pipeline = + (txe_compute_pipeline_state_s)calloc(1, sizeof(struct _txe_compute_pipeline_state_t)); + bool flag = false; + if (!kernel_pipeline) { + GGML_TSAVORITE_LOG_ERROR("calloc failed while setting up kernel"); + return NULL; + } + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + switch (kernel_type) { + case GGML_TSAVORITE_KERNEL_TYPE_ADD: + if (ggml_tsavorite_kernel_mode_flag == GGML_TSAVORITE_KERNEL_MODE_CPU) +
+      kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_add_test;
+    else
+      kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_add;
+    kernel_pipeline->kernel_name = "TXE_ADD";
+    flag = true;
+    break;
+  case GGML_TSAVORITE_KERNEL_TYPE_SUB:
+    kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_sub;
+    kernel_pipeline->kernel_name = "TXE_SUB";
+    flag = true;
+    break;
+  case GGML_TSAVORITE_KERNEL_TYPE_MULT:
+    if (ggml_tsavorite_kernel_mode_flag == GGML_TSAVORITE_KERNEL_MODE_CPU)
+      kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_mult_test;
+    else
+      kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_mult;
+    kernel_pipeline->kernel_name = "TXE_MULT";
+    flag = true;
+    break;
+  case GGML_TSAVORITE_KERNEL_TYPE_DIV:
+    kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_div;
+    kernel_pipeline->kernel_name = "TXE_DIV";
+    flag = true;
+    break;
+  case GGML_TSAVORITE_KERNEL_TYPE_SQRT:
+    kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sqrt;
+    kernel_pipeline->kernel_name = "TXE_SQRT";
+    flag = true;
+    break;
+  case GGML_TSAVORITE_KERNEL_TYPE_NEG:
+    kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_neg;
+    kernel_pipeline->kernel_name = "TXE_NEG";
+    flag = true;
+    break;
+  case GGML_TSAVORITE_KERNEL_TYPE_ABS:
+    kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_abs;
+    kernel_pipeline->kernel_name = "TXE_ABS";
+    flag = true;
+    break;
+  case GGML_TSAVORITE_KERNEL_TYPE_SIN:
+    kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sin;
+    kernel_pipeline->kernel_name = "TXE_SIN";
+    flag = true;
+    break;
+  case GGML_TSAVORITE_KERNEL_TYPE_SIGMOID:
+    kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sigmoid;
+    kernel_pipeline->kernel_name = "TXE_SIGMOID";
+    flag = true;
+    break;
+  default:
+    break;
+  }
+  if (!flag) {
+    GGML_TSAVORITE_LOG_INFO("Kernel %d not supported\n", kernel_type);
+    if (kernel_pipeline) {
+      free(kernel_pipeline);
+      kernel_pipeline = NULL;
+    }
+  }
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+  return kernel_pipeline;
+}
+
+static void tsi_kernel_release(txe_compute_pipeline_state_s kernel_pipeline) {
+  // clear kernel_pipeline
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+  if (kernel_pipeline) {
+    free(kernel_pipeline);
+  }
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+  return;
+}
+
+// acquire
+static txe_device_s
+ggml_backend_tsavorite_device_acq(struct ggml_backend_tsavorite_device_context *ctx) {
+  assert(ctx != NULL);
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+
+  if (ctx->device == tsi_nil) {
+    ctx->device = tsi_system_default_device_create();
+    snprintf(ctx->name, sizeof("txe"), "txe");
+  }
+
+  ctx->ref_count++;
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+
+  return ctx->device;
+}
+
+// release
+static void ggml_backend_tsavorite_device_rel(struct ggml_backend_tsavorite_device_context *ctx) {
+  assert(ctx != NULL);
+  assert(ctx->ref_count > 0);
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+
+  ctx->ref_count--;
+
+  if (ctx->ref_count == 0) {
+    tsi_device_free(ctx->device);
+    ctx->device = tsi_nil;
+  }
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+}
+
+// We use unified memory; buffers handed to the backend are carved out of it
+static void *ggml_tsavorite_host_malloc(size_t n) {
+  void *data = NULL;
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+
+  GGML_TSAVORITE_LOG_INFO("\n Allocating memory from tsi_alloc with size %ld \n", n);
+  data = tsi_alloc(n);
+  GGML_TSAVORITE_LOG_CONT("\n Allocating 
memory from tsi_alloc with size %ld starting memory %p\n", + n, data); + + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return data; +} + +static struct ggml_backend_tsavorite_context *ggml_tsavorite_init(ggml_backend_dev_t dev) { + GGML_TSAVORITE_LOG_INFO("%s: Start\n", __func__); + // Open a file named "tsi-op.txt" in the current directory for writing + num_of_op = 0; + + if (tsi_log_setup() == false) + return NULL; + + // TSI Run time Initalization + tsi_initialize(NUM_OF_TXES); + + // init context + struct ggml_backend_tsavorite_context *ctx = (struct ggml_backend_tsavorite_context *)calloc( + 1, sizeof(struct ggml_backend_tsavorite_context)); + struct ggml_backend_tsavorite_device_context *ctx_dev = + (struct ggml_backend_tsavorite_device_context *)dev->context; + + // setup the devie context + txe_device_s device = ggml_backend_tsavorite_device_acq(ctx_dev); + GGML_TSAVORITE_LOG_INFO("%s: picking default device: %s\n", __func__, device->name); + for (uint32_t op = GGML_TSAVORITE_KERNEL_TYPE_ADD; op < GGML_TSAVORITE_KERNEL_TYPE_COUNT; ++op) { + device->stats.op_run_count[op].total_tensor_count = 0; + device->stats.op_run_count[op].num_of_kernel_call = 0; + device->stats.op_run_count[op].num_of_tensor_spilt = 0; + device->stats.op_run_count[op].min_num_of_elem = 0; + device->stats.op_run_count[op].max_num_of_elem = 0; + } + ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->threadpool = NULL; + ctx->work_data = NULL; + ctx->work_size = 0; + ctx->abort_callback = NULL; + ctx->abort_callback_data = NULL; + + // We dont need it for now, we will revisit +#ifdef USE_COMMAND_BUFFERS + // setting up backend context + ctx->queue = tsi_command_queue_create(); + ctx->d_queue = tsi_dispatch_queue_create(); +#endif /* USE_COMMAND_BUFFERS */ + + ctx->capture_next_compute = false; + ctx->capture_started = false; + + ctx->gf = tsi_nil; + // ctx->encode_async = tsi_nil; + +#ifdef USE_COMMAND_BUFFERS + for (int i = 0; i < GGML_TSAVORITE_MAX_COMMAND_BUFFERS; ++i) { + ctx->command_buffers[i] = tsi_nil; + } +#endif /* USE_COMMAND_BUFFERS */ + + // load TSavorite kernels + { + for (int i = 0; i < GGML_TSAVORITE_KERNEL_TYPE_COUNT; ++i) { + ctx->kernels[i].pipeline = tsi_nil; + } + +#define GGML_TSAVORITE_KERNEL(e, supported) \ + if (supported) { \ + ctx->kernels[e].pipeline = tsi_kernel_setup(e); \ + GGML_TSAVORITE_LOG_INFO(" TSAVORITE SUPPORTED KERNEL "); \ + } else { \ + GGML_TSAVORITE_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_" #e); \ + } + + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_ADD, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SUB, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_MULT, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_DIV, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SQRT, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_NEG, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_ABS, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SIN, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SIGMOID, true); + } + + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return ctx; +} + +static void ggml_tsavorite_free(struct ggml_backend_tsavorite_context *ctx) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + for (int i = 0; i < GGML_TSAVORITE_KERNEL_TYPE_COUNT; ++i) { + if (ctx->kernels[i].pipeline) { + tsi_kernel_release(ctx->kernels[i].pipeline); + ctx->kernels[i].pipeline = tsi_nil; + } + } + + // Block_release(ctx->encode_async); + // +#ifdef USE_COMMAND_BUFFERS + 
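+  // release the queues symmetrically with ggml_tsavorite_init(), which only
+  // creates them when USE_COMMAND_BUFFERS is defined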
tsi_command_queue_free(ctx->queue); + + tsi_dispatch_queue_free(ctx->d_queue); +#endif /* USE_COMMAND_BUFFERS */ + + free(ctx); + + // TSI run time free + GGML_TSAVORITE_LOG_INFO("\n Calling tsi_finalize \n"); + // delay to allow any file operations to complete for runtime + + GGML_TSAVORITE_LOG_INFO("Delaying tsi_finalize for 2 sec"); + sleep(2); + tsi_finalize(); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); +} + +#if 0 +// finds the tSavorite buffer that contains the tensor data on the TXE device unified memory +// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the +// tSavorite buffer based on the host memory pointer +// +static ggml_backend_tsavorite_buffer_s ggml_tsavorite_get_buffer(struct ggml_tensor * t, size_t * offs) { + // GGML_TSAVORITE_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + const int64_t tsize = ggml_nbytes(t); + + ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer; + + struct ggml_backend_tsavorite_buffer_context * buf_ctx = (struct ggml_backend_tsavorite_buffer_context *) buffer->context; + + // find the view that contains the tensor fully + for (int i = 0; i < buf_ctx->n_buffers; ++i) { + const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data; + + // GGML_TSAVORITE_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size); + if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) { + *offs = (size_t) ioffs; + + // GGML_TSAVORITE_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return buf_ctx->buffers[i]; + } + } + + GGML_TSAVORITE_LOG_ERROR("%s: error: tensor '%s' buffer is tsi_nil\n", __func__, t->name); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return tsi_nil; +} +#endif + +static bool ggml_tsavorite_supports_op(const struct ggml_backend_tsavorite_device_context *ctx_dev, + const struct ggml_tensor *op) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (!ctx_dev) + return false; + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + for (size_t i = 0, n = 3; i < n; ++i) { + if (op->src[i] != NULL && op->src[i]->type != GGML_TYPE_F32) { + return false; + } + } + + if (op->type != GGML_TYPE_F32) + return false; + switch (op->op) { + case GGML_OP_NONE: + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_SQRT: + case GGML_OP_SIN: + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_SIGMOID: + break; + default: + return false; + } + break; + default: + return false; + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return true; +} + +/* +static void ggml_tsavorite_encode_node( + ggml_backend_t backend, + int idx, + tsi_command_encoder encoder) { +} +*/ + +static void ggml_tsavorite_decompose_unary_kernel_sin(uint32_t num_elem, ggml_tensor *src) { + float *p = (float *)(src->data); + for (uint32_t i = 0; i < num_elem; ++i) { + *p = (*p) / (2 * M_PI); + ++p; + } + return; +} + +static void ggml_tsavorite_decompose_unary_kernel(uint32_t num_elem, ggml_tensor *src, + ggml_tensor *node) { + switch (node->op) { + case GGML_OP_SIN: + ggml_tsavorite_decompose_unary_kernel_sin(num_elem, src); 
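+    // the helper above rescales the input in place by 1/(2*pi); presumably the
+    // TXE sin kernel operates on turns rather than radians (e.g. an input of
+    // pi/2 reaches the kernel as 0.25)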
+ break; + default: + break; + } + return; +} + +// nodes are intermediate which has multiple src tensors & operation +// Here we create multiple thread +// Each Thread run the command buffer & pick Tensor and execute and get the result back base on +// async or sync all Compute wil finish all tensors execution +static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend, + struct ggml_cgraph *cgraph) { +#if 0 + GGML_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_context * ctx = backend->context; + struct ggml_backend_tsavorite_device_context * ctx_dev = backend->device->context; + + // number of nodes encoded by the main thread (empirically determined) + const int n_main = 128; + + // number of threads in addition to the main thread + const int n_cb = ctx->n_cb; + + // submit the ggml compute graph to the TXE by creating command buffers and encoding the ops in them + // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread + // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes + // each thread creates it's own command buffer and enqueues the ops in parallel + + GGML_LOG_INFO("End %s\n", __func__); + return GGML_STATUS_SUCCESS; +#endif + + struct ggml_backend_tsavorite_context *ctx = + (struct ggml_backend_tsavorite_context *)backend->context; + if (!ctx) { + GGML_LOG_ERROR("\n backend ctx is NULL \n"); + return GGML_STATUS_FAILED; + } + +#if 0 + struct ggml_cplan cplan = ggml_graph_plan(cgraph, ctx->n_threads, ctx->threadpool); + + if (ctx->work_size < cplan.work_size) { + delete[] ctx->work_data; + ctx->work_data = new uint8_t[cplan.work_size]; + if (ctx->work_data == NULL) { + ctx->work_size = 0; + return GGML_STATUS_ALLOC_FAILED; + } + ctx->work_size = cplan.work_size; + } + cplan.work_data = (uint8_t *)ctx->work_data; + + cplan.abort_callback = ctx->abort_callback; + cplan.abort_callback_data = ctx->abort_callback_data; +#endif + + txe_device_s device = ggml_backend_tsavorite_device_acq( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + + if (!device) { + GGML_TSAVORITE_LOG_ERROR("\n tsavorite device is NULL \n"); + return GGML_STATUS_FAILED; + } + // MemRefDescriptor + const int Rank = MEM_REF_DESCRIPTOR_RANK; + MemRefDescriptor *srcP0, *srcP1, *nodeP; + struct ggml_tensor *src0, *src1, *node; + uint32_t num_elem_src0, num_elem_src1, num_elem_node; + enum ggml_tsavorite_kernel_type kernel_type; + // This variable not needed since src0 or node will have max elem size + // and src1 size will min elem size + uint64_t max_num_of_elem, min_num_of_elem; + enum ggml_tsavorite_input_tensors_count num_of_input_tensors; + tensor_log log_data; + + for (int i = 0; i < cgraph->n_nodes; i++) { + node = cgraph->nodes[i]; + src0 = node->src[0]; + src1 = node->src[1]; + min_num_of_elem = 0; + max_num_of_elem = 0; + + switch (node->op) { + case GGML_OP_ADD: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_ADD; + num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS; + break; + case GGML_OP_SUB: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SUB; + num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS; + break; + case GGML_OP_MUL: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_MULT; + num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS; + break; + case GGML_OP_DIV: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_DIV; + num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS; + break; + case GGML_OP_SQRT: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SQRT; + num_of_input_tensors = 
TSAVORITE_UNARY_INPUT_TENSORS; + break; + case GGML_OP_SIN: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SIN; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(node)) { + case GGML_UNARY_OP_NEG: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_NEG; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; + case GGML_UNARY_OP_ABS: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_ABS; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; + case GGML_UNARY_OP_SIGMOID: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SIGMOID; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; + default: + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + return GGML_STATUS_ABORTED; + } + break; + default: + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + return GGML_STATUS_ABORTED; + } + + if (!ctx->kernels[kernel_type].pipeline || + (!ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input && + !ctx->kernels[kernel_type].pipeline->_mlir_fptr_1_input)) { + GGML_TSAVORITE_LOG_ERROR("Kernel Type %d, not supported \n", kernel_type); + return GGML_STATUS_ABORTED; + } + ++num_of_op; + + if (num_of_input_tensors == TSAVORITE_TWO_INPUT_TENSORS) { + if (node->src[0] && node->src[1]) { + if (!src0->data || !src1->data || !node->data) { + GGML_TSAVORITE_LOG_ERROR( + "One of tensor Data doesnt have memory leaf1 %p, leaf2 %p, node %p \n", src0->data, + src1->data, node->data); + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + return GGML_STATUS_ABORTED; + } + srcP0 = (MemRefDescriptor *)src0->data; + srcP1 = (MemRefDescriptor *)src1->data; + nodeP = (MemRefDescriptor *)node->data; + // This is for tsavorite MemRef Header hence getting header + --srcP0; + --srcP1; + --nodeP; + srcP0->data = srcP0->base = src0->data; + srcP1->data = srcP1->base = src1->data; + nodeP->data = nodeP->base = node->data; + // offset & shape size will be update base on Tensor Size + // TSAVORITE KERNEL CAN Take max of TSAVORITE_KERNEL_SIZE + // Hence we need to load tensor data at multiple iteration + // for large Tensor Dataset + srcP0->offset = 0; + srcP1->offset = 0; + nodeP->offset = 0; + + // currently _mlir_ as restriction to hold max of 64 elements, we need to spilt the work if + // its more than 64, i will address this at future PR Initalizing num_elem + num_elem_src0 = 1; + for (int i = 0; i < GGML_MAX_DIMS && src0->nb[i] != 0; ++i) + num_elem_src0 *= src0->ne[i]; + + num_elem_src1 = 1; + for (int i = 0; i < GGML_MAX_DIMS && src1->nb[i] != 0; ++i) + num_elem_src1 *= src1->ne[i]; + + num_elem_node = 1; + for (int i = 0; i < GGML_MAX_DIMS && node->nb[i] != 0; ++i) + num_elem_node *= node->ne[i]; + + if (!num_elem_src0 || !num_elem_src1 || !num_elem_node) { + GGML_TSAVORITE_LOG_ERROR("\nOne or more of Tensor length is zero of kernel_type %d\n", + kernel_type); + return GGML_STATUS_ABORTED; + } + + min_num_of_elem = max_num_of_elem = num_elem_src0; + + if (min_num_of_elem > num_elem_src1) + min_num_of_elem = num_elem_src1; + if (min_num_of_elem > num_elem_node) + min_num_of_elem = num_elem_node; + + if (max_num_of_elem < num_elem_src1) + max_num_of_elem = num_elem_src1; + if (max_num_of_elem < num_elem_node) + max_num_of_elem = num_elem_node; + + if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) { + bzero((char *)&log_data, sizeof(log_data)); + log_data.leaf1_len 
= num_elem_src0; + log_data.leaf2_len = num_elem_src1; + log_data.node_len = num_elem_node; + log_data.log_file = tsi_op_log_file; + log_data.num_of_op = num_of_op; + log_data.kernel_type = kernel_type; + + log_data.data_type = GGML_TSAVORITE_TENSOR_HEADER; + ggml_tsi_log_tensor_data(log_data); + + log_data.data_type = GGML_TSAVORITE_TENSOR_LEAF1; + log_data.tensor = src0; + ggml_tsi_log_tensor_data(log_data); + + log_data.data_type = GGML_TSAVORITE_TENSOR_LEAF2; + log_data.tensor = src1; + ggml_tsi_log_tensor_data(log_data); + } + + ggml_tensor *dst = node; + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + for (int ir = 0; ir < nr; ++ir) { + const int64_t i03 = ir / (ne02 * ne01); + const int64_t i02 = (ir - i03 * ne02 * ne01) / ne01; + const int64_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; + + float *dst_ptr = (float *)((char *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1); + float *src0_ptr = (float *)((char *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01); + float *src1_ptr = (float *)((char *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11); + + for (int64_t r = 0; r < nr0; ++r) { + // While loop is added to handle the scenario when kernel number of elements + // less than ggml tensor number of elements.GGML tensor number of elements decided + // base on application like llama.cpp. Currently we have build Kernel elements + // statically hence we have MACRO: TSAVORITE_KERNEL_SIZE to track this + int count = 0; + while (count < ne10) { + int kernel_size; + srcP1->data = srcP1->base = (void *)(src1_ptr + count); + srcP0->data = srcP0->base = (void *)(src0_ptr + r * ne10 + count); + nodeP->data = nodeP->base = (void *)(dst_ptr + r * ne10 + count); + if ((count + TSAVORITE_KERNEL_SIZE) > ne10) + kernel_size = ne10 - count; + else + kernel_size = TSAVORITE_KERNEL_SIZE; + count += kernel_size; + srcP0->shape[Rank - 1] = kernel_size; + srcP1->shape[Rank - 1] = kernel_size; + nodeP->shape[Rank - 1] = kernel_size; + srcP0->strides[Rank - 1] = 0; + srcP1->strides[Rank - 1] = 0; + nodeP->strides[Rank - 1] = 0; + // kernel call + ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input(srcP0, srcP1, nodeP); + ++device->stats.op_run_count[kernel_type].num_of_kernel_call; + } + } + } + + if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) { + log_data.data_type = GGML_TSAVORITE_TENSOR_NODE; + log_data.tensor = node; + ggml_tsi_log_tensor_data(log_data); + + log_data.data_type = GGML_TSAVORITE_TENSOR_END_DATA; + log_data.tensor = NULL; + ggml_tsi_log_tensor_data(log_data); + } + } + } + + if (num_of_input_tensors == TSAVORITE_UNARY_INPUT_TENSORS) { + if (node->src[0]) { + if (!src0->data || !node->data) { + GGML_TSAVORITE_LOG_ERROR( + "input or output tensor Data doesnt have memory leaf %p, node %p \n", src0->data, + node->data); + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + return GGML_STATUS_ABORTED; + } + srcP0 = (MemRefDescriptor *)src0->data; + nodeP = (MemRefDescriptor *)node->data; + // This is for tsavorite MemRef Header hence getting header + --srcP0; + --nodeP; + srcP0->data = srcP0->base = src0->data; + nodeP->data = nodeP->base = node->data; + // offset & shape size will be update base on Tensor Size + // TSAVORITE KERNEL CAN Take max of TSAVORITE_KERNEL_SIZE + // Hence we need to load tensor data at multiple iteration + // for large Tensor 
Dataset + srcP0->offset = 0; + nodeP->offset = 0; + + // currently _mlir_ as restriction to hold max of 64 elements, we need to spilt the work if + // its more than 64, i will address this at future PR Initalizing num_elem + num_elem_src0 = 1; + for (int i = 0; i < GGML_MAX_DIMS && src0->nb[i] != 0; ++i) + num_elem_src0 *= src0->ne[i]; + max_num_of_elem = min_num_of_elem = num_elem_src0; + + if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) { + bzero((char *)&log_data, sizeof(log_data)); + log_data.leaf1_len = num_elem_src0; + log_data.leaf2_len = 0; + log_data.node_len = num_elem_src0; + log_data.log_file = tsi_op_log_file; + log_data.num_of_op = num_of_op; + log_data.kernel_type = kernel_type; + + log_data.data_type = GGML_TSAVORITE_TENSOR_HEADER; + ggml_tsi_log_tensor_data(log_data); + + log_data.data_type = GGML_TSAVORITE_TENSOR_LEAF1; + log_data.tensor = src0; + ggml_tsi_log_tensor_data(log_data); + } + // While loop is added to handle the scenario when kernel number of elements + // less than ggml tensor number of elements.GGML tensor number of elements decided + // base on application like llama.cpp. Currently we have build Kernel elements statically + // hence we have MACRO: TSAVORITE_KERNEL_SIZE to track this + uint32_t count = 0; + + if (node->op == GGML_OP_SIN) { + ggml_tsavorite_decompose_unary_kernel(num_elem_src0, src0, node); + } + while (count < num_elem_src0) { + int kernel_size; + srcP0->data = srcP0->base = (void *)((float *)src0->data + count); + nodeP->data = nodeP->base = (void *)((float *)node->data + count); + if ((count + TSAVORITE_KERNEL_SIZE) > num_elem_src0) + kernel_size = num_elem_src0 - count; + else + kernel_size = TSAVORITE_KERNEL_SIZE; + count += kernel_size; + srcP0->shape[Rank - 1] = kernel_size; + nodeP->shape[Rank - 1] = kernel_size; + srcP0->strides[Rank - 1] = 0; + nodeP->strides[Rank - 1] = 0; + // kernel call + ctx->kernels[kernel_type].pipeline->_mlir_fptr_1_input(srcP0, nodeP); + ++device->stats.op_run_count[kernel_type].num_of_kernel_call; + } + + if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) { + log_data.data_type = GGML_TSAVORITE_TENSOR_NODE; + log_data.tensor = node; + ggml_tsi_log_tensor_data(log_data); + + log_data.data_type = GGML_TSAVORITE_TENSOR_END_DATA; + log_data.tensor = NULL; + ggml_tsi_log_tensor_data(log_data); + } + } + } + if (min_num_of_elem > 0) { + ++device->stats.op_run_count[kernel_type].total_tensor_count; + + if (min_num_of_elem > TSAVORITE_KERNEL_SIZE) + ++device->stats.op_run_count[kernel_type].num_of_tensor_spilt; + + if (!(device->stats.op_run_count[kernel_type].min_num_of_elem) || + device->stats.op_run_count[kernel_type].min_num_of_elem > min_num_of_elem) + device->stats.op_run_count[kernel_type].min_num_of_elem = min_num_of_elem; + + if (!(device->stats.op_run_count[kernel_type].max_num_of_elem) || + device->stats.op_run_count[kernel_type].max_num_of_elem < max_num_of_elem) + device->stats.op_run_count[kernel_type].max_num_of_elem = max_num_of_elem; + } + } + + // This this need to implement correctly when we have mixture of CPU and accelerator operation + // return ggml_graph_compute(cgraph, &cplan); + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + return GGML_STATUS_SUCCESS; + + GGML_UNUSED(backend); +} + +//////////////////////////////////////////////////////////////////////////////// + +// backend interface + +#if 0 +static const char * ggml_backend_tsavorite_buffer_get_name(ggml_backend_buffer_t buffer) { + 
GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return "tSavorite"; + + TSI_UNUSED(buffer); +} +#endif + +static void ggml_backend_tsavorite_buffer_free_buffer(ggml_backend_buffer_t buffer) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_buffer_context *ctx = + (struct ggml_backend_tsavorite_buffer_context *)buffer->context; + +#if 0 + // ctx->all_data & tsi_buffer_free(ctx->buffers[i].data and same memory and created by tsi_alloc + // tsi_finalize called when ggml call backend free all memory + // this fucntion called when ggml free backend particular buffer, currently we cant provide this support + // and just return NoOps + // But at end there is no memory leak but memory can grow since we free at last once backend is shutdown + // We need to revisit this hence i kept the stuff under if 0 + for (int i = 0; i < ctx->n_buffers; i++) { + tsi_buffer_free(ctx->buffers[i].data); + } + ggml_backend_tsavorite_device_rel((struct ggml_backend_tsavorite_device_context *)buffer->buft->device->context); + + if (ctx->owned) { + free(ctx->all_data); + } +#endif + + free(ctx); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); +} + +static void *ggml_backend_tsavorite_buffer_get_base(ggml_backend_buffer_t buffer) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_buffer_context *ctx = + (struct ggml_backend_tsavorite_buffer_context *)buffer->context; + + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return ctx->all_data; +} + +static ggml_status ggml_backend_tsavorite_buffer_init_tensor(ggml_backend_buffer_t buffer, + struct ggml_tensor *tensor) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + const int Rank = MEM_REF_DESCRIPTOR_RANK; + MemRefDescriptor tensor_data_header; + tensor->data = (void *)(sizeof(tensor_data_header) + (char *)tensor->data); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return GGML_STATUS_SUCCESS; + + TSI_UNUSED(buffer); +} + +static void ggml_backend_tsavorite_buffer_memset_tensor(ggml_backend_buffer_t buffer, + struct ggml_tensor *tensor, uint8_t value, + size_t offset, size_t size) { + if (!tensor || !tensor->data) { + GGML_TSAVORITE_LOG_ERROR("\n tensor or data cant be null under func: %s\n", __func__); + return; + } + memset((char *)tensor->data + offset, value, size); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_tsavorite_buffer_set_tensor(ggml_backend_buffer_t buffer, + struct ggml_tensor *tensor, const void *data, + size_t offset, size_t size) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + memcpy((char *)tensor->data + offset, data, size); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + TSI_UNUSED(buffer); +} + +static void ggml_backend_tsavorite_buffer_get_tensor(ggml_backend_buffer_t buffer, + const struct ggml_tensor *tensor, void *data, + size_t offset, size_t size) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + memcpy(data, (const char *)tensor->data + offset, size); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + TSI_UNUSED(buffer); +} + +static bool ggml_backend_tsavorite_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const struct ggml_tensor *src, + struct ggml_tensor *dst) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, (ggml_nbytes(src))); + return true; + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return false; + + TSI_UNUSED(buffer); +} + +static void 
ggml_backend_tsavorite_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_buffer_context *ctx = + (struct ggml_backend_tsavorite_buffer_context *)buffer->context; + if (!ctx || !ctx->all_data) { + GGML_TSAVORITE_LOG_ERROR("\n ctx or all_data cant be null under func: %s\n", __func__); + return; + } + memset((char *)ctx->all_data, value, ctx->all_size); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); +} + +static struct ggml_backend_buffer_i ggml_backend_tsavorite_buffer_i = { + /* .free_buffer = */ ggml_backend_tsavorite_buffer_free_buffer, + /* .get_base = */ ggml_backend_tsavorite_buffer_get_base, + /* .init_tensor = */ ggml_backend_tsavorite_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_tsavorite_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_tsavorite_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_tsavorite_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_tsavorite_buffer_cpy_tensor, + /* .clear = */ ggml_backend_tsavorite_buffer_clear, + /* .reset = */ NULL, +}; + +// default buffer type + +static const char *ggml_backend_tsavorite_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return "tsavorite"; + + TSI_UNUSED(buft); +} + +static void ggml_backend_tsavorite_log_allocated_size(txe_device_s device, size_t size_aligned) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); +#ifndef GGML_TSAVORITE_NDEBUG +#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) + GGML_TSAVORITE_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n", __func__, + size_aligned / 1024.0 / 1024.0, + device.currentAllocatedSize / 1024.0 / 1024.0); +#endif +#endif + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + TSI_UNUSED(device); + TSI_UNUSED(size_aligned); +} + +static ggml_backend_buffer_t +ggml_backend_tsavorite_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_buffer_context *ctx = + (struct ggml_backend_tsavorite_buffer_context *)calloc( + 1, sizeof(struct ggml_backend_tsavorite_buffer_context)); + + const size_t size_page = sysconf(_SC_PAGESIZE); + GGML_TSAVORITE_LOG_CONT( + "ggml_backend_tsavorite_buffer_type_alloc_buffer is called from llama data Loader \n"); + + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + + txe_device_s device = ggml_backend_tsavorite_device_acq( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + if (!device) + return NULL; + + ctx->all_data = ggml_tsavorite_host_malloc(size_aligned); + ctx->all_size = size_aligned; + ctx->owned = true; + ctx->n_buffers = 1; + GGML_TSAVORITE_LOG_INFO("\n\n\n\n Memory Starting address %p and size %ld \n\n\n", ctx->all_data, + ctx->all_size); + + if (ctx->all_data != NULL) { + GGML_TSAVORITE_LOG_CONT("\nAddress of Newly Created BUffer %p and size %ld \n", ctx->all_data, + ctx->all_size); + if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) { + fprintf(tsi_op_log_file, "Address of Newly Created BUffer %p and size %ld \n", ctx->all_data, + ctx->all_size); + } + ctx->buffers[0].data = NULL; + ctx->buffers[0].data = ctx->all_data; + ctx->buffers[0].size = size; + memset((char *)ctx->all_data, 0, ctx->all_size); + } + + if (size_aligned > 0 && (ctx->all_data == NULL)) { + 
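+    // allocation failed: log it, free the context and drop the device
+    // reference acquired above before returning NULL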
GGML_TSAVORITE_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, + size_aligned / 1024.0 / 1024.0); + free(ctx); + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + return NULL; + } + + // ggml_backend_tsavorite_log_allocated_size(device, size_aligned); + device->current_allocated_size += ctx->all_size; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return ggml_backend_buffer_init(buft, ggml_backend_tsavorite_buffer_i, ctx, size); +} + +static size_t ggml_backend_tsavorite_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return 32; + TSI_UNUSED(buft); +} + +static size_t ggml_backend_tsavorite_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_device_s device = ggml_backend_tsavorite_device_acq( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + const size_t max_size = device->max_buf_len; + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return max_size; + + TSI_UNUSED(buft); +} + +static size_t ggml_backend_tsavorite_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, + const struct ggml_tensor *tensor) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_device_s device = ggml_backend_tsavorite_device_acq( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + if (!device) { + GGML_TSAVORITE_LOG_ERROR("\n tsavorite device is NULL \n"); + return 0; + } + const int Rank = MEM_REF_DESCRIPTOR_RANK; + MemRefDescriptor tensor_data_header; + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + GGML_TSAVORITE_LOG_INFO( + "\n\n\n\n Calculating---- Alloc ----Size header %lu and data %lu \n\n\n\n ", + sizeof(tensor_data_header), ggml_nbytes(tensor)); + + return (sizeof(tensor_data_header) + ggml_nbytes(tensor)); + + TSI_UNUSED(buft); +} + +static bool ggml_backend_tsavorite_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + // For Now CPU is loading all data and then copy some tensor to Tsavorite Backend + // Once we have most of Operation supported by Tsavorite + // We will figure out to make tsavorite Backend also host + return false; + + TSI_UNUSED(buft); +} + +ggml_backend_buffer_type_t ggml_backend_tsavorite_buffer_type(void) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + static struct ggml_backend_buffer_type ggml_backend_buffer_type_tsavorite = { + /* .iface = */ { + /* .get_name = */ ggml_backend_tsavorite_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_tsavorite_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_tsavorite_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_tsavorite_buffer_type_get_max_size, + /* .get_alloc_size = */ + ggml_backend_tsavorite_buffer_type_get_alloc_size, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_tsavorite_buffer_type_is_host, + }, + /* .device = */ &g_ggml_backend_tsavorite_device, + /* .context = */ NULL, + }; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return &ggml_backend_buffer_type_tsavorite; +} + +// backend + +static const char 
*ggml_backend_tsavorite_name(ggml_backend_t backend) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return "Tsavorite"; + + TSI_UNUSED(backend); +} + +static void ggml_backend_tsavorite_free(ggml_backend_t backend) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (!backend || !backend->context || !backend->device || !backend->device->context) { + GGML_TSAVORITE_LOG_ERROR("At %s One of more pointer among: Backend, backend_context, " + "device_context or device are NULL", + __func__); + return; + } + struct ggml_backend_tsavorite_context *ctx = + (struct ggml_backend_tsavorite_context *)backend->context; + struct ggml_backend_tsavorite_device_context *ctx_dev = + (struct ggml_backend_tsavorite_device_context *)backend->device->context; + ggml_tsavorite_disp_stats(ctx, ctx_dev->device); + + ggml_backend_tsavorite_device_rel(ctx_dev); + ggml_tsavorite_free(ctx); + + free(backend); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); +} + +static void ggml_backend_tsavorite_synchronize(ggml_backend_t backend) { +// We need to implement ASYN Method to take output of tensor data to input of other Tensor +// We will evaluate and implement at later PR +#ifdef SYNC_DEBUG + usleep(100000); +#endif /* SYNC_DEBUG */ + TSI_UNUSED(backend); +} + +static ggml_backend_buffer_type_t +ggml_backend_tsavorite_get_default_buffer_type(ggml_backend_t backend) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return ggml_backend_tsavorite_buffer_type(); + + TSI_UNUSED(backend); +} + +static enum ggml_status ggml_backend_tsavorite_graph_compute(ggml_backend_t backend, + struct ggml_cgraph *cgraph) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return ggml_tsavorite_graph_compute(backend, cgraph); +} + +static void ggml_backend_tsavorite_set_n_cb(ggml_backend_t backend, int n_cb) { + // GGML_ASSERT(ggml_backend_is_tsavorite(backend)); + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + struct ggml_backend_tsavorite_context *ctx = + (struct ggml_backend_tsavorite_context *)backend->context; + + if (ctx->n_cb != n_cb) { + ctx->n_cb = MIN(n_cb, GGML_TSAVORITE_MAX_COMMAND_BUFFERS); + + if (ctx->n_cb > 2) { + GGML_TSAVORITE_LOG_WARN("%s: n_cb = %d, using n_cb > 2 is not recommended and can degrade " + "the performance in some cases\n", + __func__, n_cb); + } + } + +#if 0 + if (ctx->encode_async) { + Block_release(ctx->encode_async); + } +#endif + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); +} + +static struct ggml_backend_i ggml_backend_tsavorite_i = { + /* .get_name = */ ggml_backend_tsavorite_name, + /* .free = */ ggml_backend_tsavorite_free, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ ggml_backend_tsavorite_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_tsavorite_graph_compute, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; + +static ggml_guid_t ggml_backend_tsavorite_guid(void) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + static ggml_guid guid = {0x81, 0xa1, 0x8b, 0x1e, 0x71, 0xec, 0x79, 0xed, + 0x2b, 0x85, 0xdc, 0x8a, 0x61, 0x98, 0x30, 0xe6}; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return &guid; +} + +// This need to be removed in the future +ggml_backend_t 
ggml_backend_tsavorite_init(void) {
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+  ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_tsavorite_reg(), 0);
+
+  struct ggml_backend_tsavorite_context *ctx = ggml_tsavorite_init(dev);
+  if (ctx == NULL) {
+    GGML_TSAVORITE_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
+    return NULL;
+  }
+
+  ggml_backend_t backend = (ggml_backend_t)malloc(sizeof(struct ggml_backend));
+  if (backend) {
+    backend->guid = ggml_backend_tsavorite_guid();
+    backend->iface = ggml_backend_tsavorite_i;
+    backend->device = dev;
+    backend->context = ctx;
+  }
+  // Will enable later
+  // ggml_backend_tsavorite_set_n_cb(backend, 1);
+
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+  return backend;
+}
+
+bool ggml_backend_is_tsavorite(ggml_backend_t backend) {
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+  return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_tsavorite_guid());
+}
+
+void ggml_backend_tsavorite_set_abort_callback(ggml_backend_t backend,
+                                               ggml_abort_callback abort_callback,
+                                               void *user_data) {
+  GGML_ASSERT(ggml_backend_is_tsavorite(backend));
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+
+  struct ggml_backend_tsavorite_context *ctx =
+      (struct ggml_backend_tsavorite_context *)backend->context;
+
+  ctx->abort_callback = abort_callback;
+  ctx->abort_callback_data = user_data;
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+}
+
+void ggml_backend_tsavorite_capture_next_compute(ggml_backend_t backend) {
+  GGML_ASSERT(ggml_backend_is_tsavorite(backend));
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+
+  struct ggml_backend_tsavorite_context *ctx =
+      (struct ggml_backend_tsavorite_context *)backend->context;
+  ctx->capture_next_compute = true;
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+}
+
+// backend device
+
+static const char *ggml_backend_tsavorite_device_get_name(ggml_backend_dev_t dev) {
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+  return "Tsavorite";
+
+  GGML_UNUSED(dev);
+}
+
+static const char *ggml_backend_tsavorite_device_get_description(ggml_backend_dev_t dev) {
+  // acq/rel just to populate ctx->name in case it hasn't been done yet
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+  struct ggml_backend_tsavorite_device_context *ctx_dev =
+      (struct ggml_backend_tsavorite_device_context *)dev->context;
+  ggml_backend_tsavorite_device_acq(ctx_dev);
+  ggml_backend_tsavorite_device_rel(ctx_dev);
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+
+  return ctx_dev->name;
+}
+
+static void ggml_backend_tsavorite_device_get_memory(ggml_backend_dev_t dev, size_t *free,
+                                                     size_t *total) {
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+
+  if (!dev || !free || !total) {
+    GGML_TSAVORITE_LOG_INFO("One or more pointers (dev, free, total) are NULL\n");
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return;
+  }
+  *total = 0;
+  *free = 0;
+  struct ggml_backend_tsavorite_device_context *ctx_dev =
+      (struct ggml_backend_tsavorite_device_context *)dev->context;
+  if (ctx_dev) {
+    txe_device_s device = ggml_backend_tsavorite_device_acq(ctx_dev);
+    *total = device->recommended_max_working_set_size;
+    *free = *total - device->current_allocated_size;
+    GGML_TSAVORITE_LOG_CONT("\n TXE Device MEMORY Summary total %lu and free %lu \n", *total,
+                            *free);
+    ggml_backend_tsavorite_device_rel(ctx_dev);
+  }
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+  return;
+}
+
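+// A minimal sketch (kept in comments) of how an application could reach the
+// hook above through the public ggml-backend API; assumes the standard
+// ggml_backend_dev_memory() wrapper, which dispatches to the .get_memory
+// member of ggml_backend_tsavorite_device_i:
+//
+//   ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_tsavorite_reg(), 0);
+//   size_t free = 0, total = 0;
+//   ggml_backend_dev_memory(dev, &free, &total);
+//   printf("TXE memory: %zu free / %zu total\n", free, total);
+//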
+// Currently we expose our TXE accelerator as a GPU-type device
+static enum ggml_backend_dev_type ggml_backend_tsavorite_device_get_type(ggml_backend_dev_t dev) {
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+  return GGML_BACKEND_DEVICE_TYPE_GPU;
+
+  GGML_UNUSED(dev);
+}
+
+// Need to understand the scope of this API since it is not used directly;
+// callers include struct llama_model_loader, llm_load_tensors and
+// llama_new_context_with_model
+static void ggml_backend_tsavorite_device_get_props(ggml_backend_dev_t dev,
+                                                    struct ggml_backend_dev_props *props) {
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+  props->name = ggml_backend_tsavorite_device_get_name(dev);
+  props->description = ggml_backend_tsavorite_device_get_description(dev);
+  props->type = ggml_backend_tsavorite_device_get_type(dev);
+  ggml_backend_tsavorite_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+  props->caps.async = false;
+  props->caps.host_buffer = false;
+  props->caps.buffer_from_host_ptr = false;
+  props->caps.events = false;
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+}
+
+static ggml_backend_t ggml_backend_tsavorite_device_init(ggml_backend_dev_t dev,
+                                                         const char *params) {
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+  struct ggml_backend_tsavorite_context *ctx = ggml_tsavorite_init(dev);
+  if (ctx == NULL) {
+    GGML_TSAVORITE_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
+    return NULL;
+  }
+
+  ggml_backend_t backend = (ggml_backend_t)malloc(sizeof(struct ggml_backend));
+
+  if (backend) {
+    backend->guid = ggml_backend_tsavorite_guid();
+    backend->iface = ggml_backend_tsavorite_i;
+    backend->device = dev;
+    backend->context = ctx;
+    ggml_backend_tsavorite_set_n_cb(backend, 1);
+  }
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+
+  return backend;
+
+  GGML_UNUSED(params);
+}
+
+static ggml_backend_buffer_type_t
+ggml_backend_tsavorite_device_get_buffer_type(ggml_backend_dev_t dev) {
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+  return ggml_backend_tsavorite_buffer_type();
+
+  GGML_UNUSED(dev);
+}
+
+// Currently the llama.cpp model flow does not appear to use this API:
+// llm_load_tensors only consults it via buffer_from_host_ptr_supported and
+// is_default_buft, otherwise it uses
+//   ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+// Need to revisit when we look at the buffer section implementation
+static ggml_backend_buffer_t ggml_backend_tsavorite_device_buffer_from_ptr(ggml_backend_dev_t dev,
+                                                                           void *ptr, size_t size,
+                                                                           size_t max_tensor_size) {
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+  struct ggml_backend_tsavorite_buffer_context *ctx =
+      (struct ggml_backend_tsavorite_buffer_context *)calloc(
+          1, sizeof(struct ggml_backend_tsavorite_buffer_context));
+
+  ctx->all_data = ptr;
+  ctx->all_size = size;
+  ctx->owned = false;
+  ctx->n_buffers = 0;
+
+  const size_t size_page = sysconf(_SC_PAGESIZE);
+
+  // page-align the data ptr
+  {
+    const uintptr_t offs = (uintptr_t)ptr % size_page;
+    ptr = (void *)((char *)ptr - offs);
+    size += offs;
+  }
+
+  size_t size_aligned = size;
+  if ((size_aligned % size_page) != 0) {
+    size_aligned += (size_page - (size_aligned % size_page));
+  }
+
+  struct ggml_backend_tsavorite_device_context *ctx_dev =
+      (struct ggml_backend_tsavorite_device_context *)dev->context;
+  txe_device_s device = 
ggml_backend_tsavorite_device_acq(ctx_dev); + + // the buffer fits into the max buffer size allowed by the device + if (size_aligned <= device->max_buf_len) { + ctx->buffers[ctx->n_buffers].data = ptr; + ctx->buffers[ctx->n_buffers].size = size; + + // ggml_backend_tsavorite_log_allocated_size(device, size_aligned); + + ++ctx->n_buffers; + } else { + // this overlap between the views will guarantee that the tensor with the maximum size will + // fully fit into one of the views + const size_t size_ovlp = ((max_tensor_size + size_page - 1) / size_page + 1) * + size_page; // round-up 2 pages just in case + const size_t size_step = device->max_buf_len - size_ovlp; + const size_t size_view = device->max_buf_len; + + for (size_t i = 0; i < size; i += size_step) { + const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); + + ctx->buffers[ctx->n_buffers].data = (void *)((uint8_t *)ptr + i); + ctx->buffers[ctx->n_buffers].size = size_step_aligned; + + // ggml_backend_tsavorite_log_allocated_size(device, size_step_aligned); + + if (i + size_step < size) { + GGML_TSAVORITE_LOG_INFO("\n"); + } + + ++ctx->n_buffers; + } + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return ggml_backend_buffer_init(ggml_backend_tsavorite_buffer_type(), + ggml_backend_tsavorite_buffer_i, ctx, size); +} + +// llama_build_graph -> ggml_backend_supports_op -> gml_backend_dev_supports_op +// basically if true then it will call ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, +// backend.get()); here is cur is tensor +static bool ggml_backend_tsavorite_device_supports_op(ggml_backend_dev_t dev, + const struct ggml_tensor *op) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_device_context *ctx_dev = + (struct ggml_backend_tsavorite_device_context *)dev->context; + + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return ggml_tsavorite_supports_op(ctx_dev, op); +} + +// template +// static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {} +// ggml_backend_dev_supports_op(dev, op_tensor); +static bool ggml_backend_tsavorite_device_supports_buft(ggml_backend_dev_t dev, + ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return buft->iface.get_name == ggml_backend_tsavorite_buffer_type_get_name; + + TSI_UNUSED(dev); +} + +// // returns the backend that should be used for the node based on the current locations +// ggml_backend_sched_backend_id_from_cur -> ggml_backend_offload_op -> +static bool ggml_backend_tsavorite_device_offload_op(ggml_backend_dev_t dev, + const struct ggml_tensor *op) { + // printf("\n ANoop Calling %s \n ", __func__); + if (op->type != GGML_TYPE_F32) + return false; + switch (op->op) { + // case GGML_OP_NONE: + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_DIV: + case GGML_OP_MUL: + case GGML_OP_SQRT: + case GGML_OP_SIN: + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_SIGMOID: + break; + default: + return false; + } + break; + default: + return false; + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return true; + TSI_UNUSED(dev); +} +#ifdef SYNC_DEBUG +static void ggml_backend_tsavorite_device_synchronize(ggml_backend_dev_t dev, + ggml_backend_event_t event) { + usleep(100); + TSI_UNUSED(dev); + TSI_UNUSED(event); +} +#endif /* SYNC_DEBUG */ + +static struct ggml_backend_device_i 
ggml_backend_tsavorite_device_i = { + /* .get_name = */ ggml_backend_tsavorite_device_get_name, + /* .get_description = */ ggml_backend_tsavorite_device_get_description, + /* .get_memory = */ ggml_backend_tsavorite_device_get_memory, + /* .get_type = */ ggml_backend_tsavorite_device_get_type, + /* .get_props = */ ggml_backend_tsavorite_device_get_props, + /* .init_backend = */ ggml_backend_tsavorite_device_init, + /* .get_buffer_type = */ ggml_backend_tsavorite_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_tsavorite_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_tsavorite_device_supports_op, + /* .supports_buft = */ ggml_backend_tsavorite_device_supports_buft, + /* .offload_op = */ ggml_backend_tsavorite_device_offload_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +// backend registry + +static const char *ggml_backend_tsavorite_reg_get_name(ggml_backend_reg_t reg) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return "Tsavorite"; + + GGML_UNUSED(reg); +} + +static size_t ggml_backend_tsavorite_reg_device_count(ggml_backend_reg_t reg) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return 1; + + GGML_UNUSED(reg); +} + +static ggml_backend_dev_t ggml_backend_tsavorite_reg_device_get(ggml_backend_reg_t reg, + size_t index) { + GGML_ASSERT(index == 0); + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return &g_ggml_backend_tsavorite_device; + + GGML_UNUSED(reg); + GGML_UNUSED(index); +} + +static struct ggml_backend_reg_i ggml_backend_tsavorite_reg_i = { + /* .get_name = */ ggml_backend_tsavorite_reg_get_name, + /* .device_count = */ ggml_backend_tsavorite_reg_device_count, + /* .device_get = */ ggml_backend_tsavorite_reg_device_get, + /* .get_proc_address = */ NULL, +}; + +ggml_backend_reg_t ggml_backend_tsavorite_reg(void) { + ggml_tsavorite_log_type_val = GGML_TSAVORITE_LOG_ERROR; + ggml_tsavorite_kernel_mode_flag = GGML_TSAVORITE_KERNEL_MODE_MLIR; + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + g_ggml_backend_tsavorite_reg.iface = ggml_backend_tsavorite_reg_i; + g_ggml_backend_tsavorite_reg.context = NULL; + + g_ggml_backend_tsavorite_device.iface = ggml_backend_tsavorite_device_i; + g_ggml_backend_tsavorite_device.reg = &g_ggml_backend_tsavorite_reg; + g_ggml_backend_tsavorite_device.context = &g_ggml_ctx_dev_main; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return &g_ggml_backend_tsavorite_reg; +} + +GGML_BACKEND_DL_IMPL(ggml_backend_tsavorite_reg) diff --git a/ggml/src/ggml-tsavorite/include/TestModel.h b/ggml/src/ggml-tsavorite/include/TestModel.h new file mode 100644 index 0000000000000..feff2539a96fa --- /dev/null +++ b/ggml/src/ggml-tsavorite/include/TestModel.h @@ -0,0 +1,217 @@ +#pragma once + +#include "HostShimCAPI.h" +#include +#include +#include +#include +#include +#include + +#define MAX_RESULT_VALUES_TO_PRINT 32 +template +struct MemRefDescriptor { + void *base; + void *data; + int64_t offset = 0; + int64_t shape[N]; + int64_t strides[N]; +} __attribute__((aligned(128))); + +template +class TestModel { +public: + TestModel(std::string name, int version, bool verbose = false) + : name_(name), version_(version), verbose_(verbose) {} + + ~TestModel() { + // free memory + for (int i = 0; i < NumInputs; i++) + tsi_dealloc(inputs[i].base); + for (int i = 0; i < NumOutputs; 
i++) + tsi_dealloc(outputs[i].base); + tsi_finalize(); + } + + template + void initRandom(size_t numElements, + std::array inputRange = {-10, 10}) { + static_assert(Rank == 1, + "initRandom(size_t) is only defined for Rank == 1"); + size_t inputSizes[2][Rank] = {{numElements}, {numElements}}; + size_t outputSizes[1][Rank] = {{numElements}}; + init(inputSizes, outputSizes, + /*initWithRandom=*/true, inputRange); + } + +#if 0 + template + void initFill(size_t numElements, ElType val) { + static_assert(Rank == 1, + "initRandom(size_t) is only defined for Rank == 1"); + size_t inputSizes[2][Rank] = {{numElements}, {numElements}}; + size_t outputSizes[1][Rank] = {{numElements}}; + init(inputSizes, outputSizes); + for (int i = 0; i < NumInputs; i++) { + auto nEls = getNumElements(inputs[i]); + for (size_t j = 0; j < nEls; j++) + static_cast(inputs[i].data)[j] = val; + } + } +#endif /* 0 */ + + template + void init(size_t inputSizes[NumInputs][Rank], + size_t outputSizes[NumOutputs][Rank], bool initWithRandom = false, + std::array inputRange = {-10, 10}) { + tsi_initialize(1); + + for (int i = 0; i < NumInputs; i++) + initMemRefDescriptor(inputs[i], inputSizes[i], + initWithRandom, inputRange, i); + + for (int i = 0; i < NumOutputs; i++) { + initMemRefDescriptor(outputs[i], outputSizes[i]); + // set default result values to -1 + auto nEls = getNumElements(outputSizes[i]); + std::fill((OutputsElType *)outputs[i].base, + (OutputsElType *)outputs[i].base + nEls, -1); + } + if (verbose_) { + printf("[%s v.%d] Allocated DRAM arrays (host VAs):", name_.c_str(), + version_); + for (int i = 0; i < NumInputs; i++) + printf(" ANOOP input%d = %p ", i, inputs[i].base); + for (int i = 0; i < NumOutputs; i++) + printf(" ANOOP-1 output%d = %p ", i, outputs[i].base); + printf("\n"); + } + } + + template + int validateResult(size_t index, ElType *expected, bool printErrs = false, + float tolerance = 1e-5) { + if (verbose_) { + printf("[%s v.%d] Model executed successfully. Validating result...", + name_.c_str(), version_); + } + + int retCode = 0; + size_t nEls = getNumElements(outputs[index].shape); + float sqrSumOfDiff = 0.0; + for (size_t j = 0; j < nEls; j++) { + sqrSumOfDiff += + std::pow(((ElType *)outputs[index].base)[j] - expected[j], 2); + if (std::abs(((ElType *)outputs[index].base)[j] - expected[j]) > + tolerance) { + retCode = 1; + if (printErrs && j < MAX_RESULT_VALUES_TO_PRINT) { + printf("Mismatch at index %d: expected %1.6f, got %1.6f\n", (int)j, + expected[j], ((ElType *)outputs[index].base)[j]); + } + if (retCode && j == MAX_RESULT_VALUES_TO_PRINT) + printf("... (more mismatches not printed; maximum %d reached) ...\n", + MAX_RESULT_VALUES_TO_PRINT); + } + } + // Compute the relative error: norm2(result) / norm2(expected) + float sqrSumExpected = 0.0; + for (size_t j = 0; j < nEls; j++) + sqrSumExpected += std::pow(expected[j], 2); + + float relativeErr = std::sqrt(sqrSumOfDiff) / std::sqrt(sqrSumExpected); + if (verbose_) { + retCode ? 
printf("\n[%s v.%d] FAILED [relative err=%1.6f]\n",
+                       name_.c_str(), version_, relativeErr)
+              : printf("\n[%s v.%d] PASS [relative err=%1.6f]\n", name_.c_str(),
+                       version_, relativeErr);
+    }
+    return retCode;
+  }
+
+  size_t getNumElements(const MemRefDescriptor<Rank> &memref) const {
+    return getNumElements(memref.shape);
+  }
+
+  template <typename ElType>
+  void writeToFile(void *data, size_t numElements,
+                   const std::string &filename) {
+    std::ofstream ofs(filename, std::ios::binary);
+    if (!ofs) {
+      printf("[%s v.%d] Error opening file %s for writing.", name_.c_str(),
+             version_, filename.c_str());
+      return;
+    }
+    ofs.write((char *)data, numElements * sizeof(ElType));
+    ofs.close();
+  }
+
+  template <typename ElType>
+  void readFromFile(void *data, size_t numElements,
+                    const std::string &filename) {
+    std::ifstream ifs(filename, std::ios::binary);
+    if (!ifs) {
+      printf("[%s v.%d] Error opening file %s for reading.", name_.c_str(),
+             version_, filename.c_str());
+      return;
+    }
+    ifs.read((char *)data, numElements * sizeof(ElType));
+    ifs.close();
+  }
+
+  std::string getName() const { return name_; }
+  std::string getVersion() const { return std::to_string(version_); }
+
+  MemRefDescriptor<Rank> inputs[NumInputs];
+  MemRefDescriptor<Rank> outputs[NumOutputs];
+
+private:
+  std::string name_;
+  int version_ = 1;
+  bool verbose_ = false;
+
+  template <typename ElType>
+  void initMemRefDescriptor(MemRefDescriptor<Rank> &memref, size_t shape[Rank],
+                            bool initWithRandom = false,
+                            std::array<ElType, 2> inputRange = {-10, 10},
+                            int seed = 42) {
+    size_t nBytes = sizeof(ElType);
+    for (int i = 0; i < Rank; i++) {
+      nBytes *= shape[i];
+    }
+    memref.base = tsi_alloc(nBytes);
+    memref.data = memref.base;
+    memref.offset = 0;
+    // fill shape/strides: getNumElements() and validateResult() read
+    // memref.shape, so this initialization must not be compiled out
+    for (int i = 0; i < Rank; i++) {
+      memref.shape[i] = shape[i];
+      memref.strides[i] = 1;
+      for (int j = i + 1; j < Rank; j++) {
+        memref.strides[i] *= shape[j];
+      }
+    }
+    printf("\n checking Shape value %ld \n\n", (long)memref.shape[0]);
+    if (initWithRandom) {
+      std::mt19937 gen(seed); // fixed seed
+      std::uniform_real_distribution<ElType> dist(inputRange[0], inputRange[1]);
+      for (size_t i = 0; i < getNumElements(shape); i++) {
+        static_cast<ElType *>(memref.data)[i] = static_cast<ElType>(dist(gen));
+      }
+    }
+  }
+
+  size_t getNumElements(const int64_t shape[Rank]) const {
+    size_t numElements = 1;
+    for (int i = 0; i < Rank; i++) {
+      numElements *= shape[i];
+    }
+    printf("\n numElements %zu \n", numElements);
+    return numElements;
+  }
+
+  size_t getNumElements(const size_t shape[Rank]) const {
+    return getNumElements(reinterpret_cast<const int64_t *>(shape));
+  }
+};
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 083347d188880..31fa312f65da6 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -174,4 +174,4 @@ target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd)
 # dummy executable - not installed
 get_filename_component(TEST_TARGET test-c.c NAME_WE)
 add_executable(${TEST_TARGET} test-c.c)
-target_link_libraries(${TEST_TARGET} PRIVATE llama)
+target_link_libraries(${TEST_TARGET} PRIVATE llama ${TLIBS})
diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh
new file mode 100755
index 0000000000000..5ff9b9389c475
--- /dev/null
+++ b/tsi-pkg-build.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+set -e
+
+# Ensure prerequisites are met as follows
+echo 'updating submodule'
+git submodule update --recursive --init
+cd ggml-tsi-kernel/
+module load tsi4 gcc/13.3.0
+echo 'creating python virtual env'
+python3 -m venv blob-creation
+source blob-creation/bin/activate
+echo 'installing mlir and python dependencies'
+pip install -r /proj/rel/sw/mlir-compiler/python/requirements-common.txt
+pip install /proj/rel/sw/mlir-compiler/python/mlir_external_packages-1.2.1-py3-none-any.whl
+pip install onnxruntime-training
+
+# Build TSI kernels for the Tsavorite backend
+# First for FPGA
+
+echo 'creating fpga kernel'
+cd fpga-kernel
+cmake -B build-fpga
+./create-all-kernels.sh
+# Then for the posix use cases
+
+echo 'creating posix kernel'
+cd ../posix-kernel/
+./create-all-kernels.sh
+
+# Change directory to top-level llama.cpp
+
+cd ../../
+
+export MLIR_SDK_VERSION=/proj/work/rel/sw/sdk-r.0.1.1
+# Compile for posix with build-posix as the target folder
+
+echo 'building llama.cpp, ggml for tsavorite and other binaries for posix'
+cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix
+cmake --build build-posix --config Release
+
+# Compile for fpga with build-fpga as the target folder
+
+echo 'building llama.cpp, ggml for tsavorite and other binaries for fpga'
+export CC="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-gcc"
+export CXX="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-g++"
+cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga
+cmake --build build-fpga --config Release
+
+
+echo 'creating tar bundle for fpga'
+TSI_GGML_VERSION=0.0.2
+TSI_GGML_BUNDLE_INSTALL_DIR=tsi-ggml
+GGML_TSI_INSTALL_DIR=ggml-tsi-kernel
+TSI_GGML_RELEASE_DIR=/proj/rel/sw/ggml/
+TSI_BLOB_INSTALL_DIR=$(pwd)/${GGML_TSI_INSTALL_DIR}/fpga-kernel/build-fpga
+
+if [ -e ${TSI_GGML_BUNDLE_INSTALL_DIR} ]; then
+    echo "${TSI_GGML_BUNDLE_INSTALL_DIR} exists"
+else
+    echo "creating ${TSI_GGML_BUNDLE_INSTALL_DIR}"
+    mkdir ${TSI_GGML_BUNDLE_INSTALL_DIR}
+fi
+if [ -e ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh ]; then
+    rm -fr ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh
+fi
+
+cat > ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh << EOL
+#!/bin/bash
+export LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:\$(pwd)
+mkdir -p ${TSI_BLOB_INSTALL_DIR}/txe_mult
+mkdir -p ${TSI_BLOB_INSTALL_DIR}/txe_add
+cp blobs ${TSI_BLOB_INSTALL_DIR}/txe_mult/ -r
+cp blobs ${TSI_BLOB_INSTALL_DIR}/txe_add/ -r
+EOL
+chmod +x ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh
+cp ${GGML_TSI_INSTALL_DIR}/fpga/blobs ${TSI_GGML_BUNDLE_INSTALL_DIR}/ -r
+cp build-fpga/bin/llama-cli ${TSI_GGML_BUNDLE_INSTALL_DIR}/
+cp build-fpga/bin/libggml*.so ${TSI_GGML_BUNDLE_INSTALL_DIR}/
+cp build-fpga/bin/libllama*.so ${TSI_GGML_BUNDLE_INSTALL_DIR}/
+cp build-fpga/bin/simple-backend-tsi ${TSI_GGML_BUNDLE_INSTALL_DIR}/
+
+tar -cvzf ${TSI_GGML_BUNDLE_INSTALL_DIR}-${TSI_GGML_VERSION}.tz ${TSI_GGML_BUNDLE_INSTALL_DIR}/*
+
+if [ "$1" == "Release" ] || [ "$1" == "release" ]
+then
+    cp ${TSI_GGML_BUNDLE_INSTALL_DIR}-${TSI_GGML_VERSION}.tz ${TSI_GGML_RELEASE_DIR}
+fi
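+
+# Example of consuming the fpga bundle on the target (assumed workflow; adjust
+# paths to your environment):
+#   tar -xvzf tsi-ggml-0.0.2.tz && cd tsi-ggml
+#   ./ggml.sh            # sets LD_LIBRARY_PATH and stages the kernel blobs
+#   ./llama-cli -m <model.gguf> -p "your prompt"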