diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 50e18f57a22f7..d73ee6e0afd5e 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -343,13 +343,13 @@ extern "C" {
     GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
     GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
     enum ggml_compute_backend_type {
         GGML_COMPUTE_BACKEND_CPU=0,
         GGML_COMPUTE_BACKEND_TSAVORITE,
         GGML_COMPUTE_BACKEND_COUNT
     };
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */

     enum ggml_status {
         GGML_STATUS_ALLOC_FAILED = -2,
@@ -660,14 +660,14 @@ extern "C" {

         void * extra; // extra things e.g. for ggml-cuda.cu

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
         int64_t perf_runs;
         int64_t perf_time_us;
         enum ggml_compute_backend_type ggml_compute_backend;
         char padding[4];
 #else
         char padding[8];
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -2557,7 +2557,7 @@ extern "C" {
     GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
     GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
 struct ggml_perf_backend_subtotals {
     int64_t total_us;
     int64_t runs;
@@ -2587,7 +2587,7 @@ void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp);
 void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph);
 const char * ggml_backend_type(enum ggml_compute_backend_type backend);

-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */

 #ifdef __cplusplus
 }
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 3654fc3185ae6..527288c6239db 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2879,12 +2879,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
         int64_t t_start = ggml_time_us();
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */
         ggml_compute_forward(&params, node);

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
         int64_t t_end = ggml_time_us();
         node->perf_runs++;
         if (t_end >= t_start) {
@@ -2893,7 +2893,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             // Handle wraparound by assuming timer rolls over at max int64_t value
             node->perf_time_us += (INT64_MAX - t_start + t_end + 1);
         }
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */

         if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
             atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
index 30de81e576b67..430e1894c5015 100644
--- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
+++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
@@ -929,9 +929,9 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
   for (int i = 0; i < cgraph->n_nodes; i++) {
     int32_t kernel_sub_type=-1;

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
     int64_t t_start = ggml_time_us();
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */
     node = cgraph->nodes[i];
     src0 = node->src[0];
     src1 = node->src[1];
@@ -1279,7 +1279,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
           device->stats.op_run_count[kernel_type].max_num_of_elem < max_num_of_elem)
         device->stats.op_run_count[kernel_type].max_num_of_elem = max_num_of_elem;
     }
-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
     int64_t t_end = ggml_time_us();
     node->perf_runs++;
     node->ggml_compute_backend = GGML_COMPUTE_BACKEND_TSAVORITE;
@@ -1289,7 +1289,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
       // Handle wraparound by assuming timer rolls over at max int64_t value
      node->perf_time_us += (INT64_MAX - t_start + t_end + 1);
    }
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */
  }

  // This this need to implement correctly when we have mixture of CPU and accelerator operation
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 6adf7849d80f2..960622c9cb8ba 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1020,12 +1020,12 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GLU",
 };

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
 static const char * GGML_BACKEND_TYPE[GGML_COMPUTE_BACKEND_COUNT] = {
     "CPU",
     "OPU"
 };
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */


 static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");
@@ -1262,11 +1262,11 @@ const char * ggml_op_name(enum ggml_op op) {
     return GGML_OP_NAME[op];
 }

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
 const char * ggml_backend_type(enum ggml_compute_backend_type backend) {
     return GGML_BACKEND_TYPE[backend];
 }
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */

 const char * ggml_op_symbol(enum ggml_op op) {
     return GGML_OP_SYMBOL[op];
@@ -1692,11 +1692,11 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
         /*.name =*/ { 0 },
         /*.extra =*/ NULL,
-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
         /*.perf_runs =*/ 0,
         /*.perf_time_us =*/ 0,
         /*.ggml_compute_backend =*/ GGML_COMPUTE_BACKEND_CPU,
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */
         /*.padding =*/ { 0 },
     };

@@ -7231,7 +7231,7 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons
     return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
 }

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
 void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_nodes; ++i) {
         struct ggml_tensor * node = cgraph->nodes[i];
@@ -7258,7 +7258,7 @@ void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct
         }
     }
 }
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */

 #if defined(GGML_PERF_DETAIL)
 FILE * ggml_perf_log_open(const char *filename) {
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 37afd06db38ed..a7f77ae1c64f3 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1090,12 +1090,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
         ggml_status status;
         const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);

-#if defined(GGML_PERF)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE)
         ggml_perf_accumulate(perf_totals, res->get_gf());
 #elif defined(GGML_PERF_DETAIL)
         ggml_perf_accumulate(perf_totals, res->get_gf());
         ggml_perf_write_detailed_csv(res->get_gf(), perf_all_shape_fp);
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */


         if (!res) {
@@ -2759,7 +2759,7 @@ llama_perf_context_data llama_perf_context(const llama_context * ctx) {
 }


-#if defined(GGML_PERF)
+#if defined(GGML_PERF_RELEASE)
 void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
     LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
     LLAMA_LOG_TSAVORITE(" %-16s %7s %14s %16s\n", "Op", "Runs", "Total us", "Avg us");
@@ -2788,7 +2788,7 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
     }
 }

-#elif defined(GGML_PERF_DETAIL)
+#elif defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
 void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
     LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
     LLAMA_LOG_TSAVORITE(" %-16s %-8s %7s %14s %16s\n", "Op", "Target", "Runs", "Total us", "Avg us");
@@ -2838,7 +2838,7 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
         }
     }
 }
-#endif /* GGML_PERF || GGML_PERF_DETAI */
+#endif /* GGML_PERF-related flags */


 void llama_perf_context_print(const llama_context * ctx) {
@@ -2852,7 +2852,7 @@ void llama_perf_context_print(const llama_context * ctx) {
             __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
     LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
     LLAMA_LOG_TSAVORITE("\n%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
     LLAMA_LOG_TSAVORITE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
@@ -2861,7 +2861,7 @@ void llama_perf_context_print(const llama_context * ctx) {
     LLAMA_LOG_TSAVORITE("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));

     ggml_perf_print_totals(const_cast(ctx->perf_totals));
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */
 }

 void llama_perf_context_reset(llama_context * ctx) {
diff --git a/src/llama-context.h b/src/llama-context.h
index e1a5bc4f9efdd..fe7c4cb579f23 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -198,9 +198,9 @@ struct llama_context {
     // reserve a graph with a dummy ubatch of the specified size
     ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
     struct ggml_perf_totals perf_totals[GGML_OP_COUNT] = {}; // add this to llama_context
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */

 private:
     llm_graph_params graph_params(
diff --git a/tools/main/main.cpp b/tools/main/main.cpp
index bd8c364584633..dc90b08a05296 100644
--- a/tools/main/main.cpp
+++ b/tools/main/main.cpp
@@ -126,9 +126,9 @@ int main(int argc, char ** argv) {
         LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }

-#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
+#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
     llama_log_set(my_logger, nullptr);
-#endif /* GGML_PERF || GGML_PERF_DETAIL */
+#endif /* GGML_PERF-related flags */

     LOG_INF("%s: llama backend init\n", __func__);
     llama_backend_init();
diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh
index 6dcd6852b8e8d..fefb3b22f715e 100755
--- a/tsi-pkg-build.sh
+++ b/tsi-pkg-build.sh
@@ -32,10 +32,18 @@ cd ../posix-kernel/
 cd ../../


-#Compile for posix with build-posix as a target folder
+#Compile for posix & fpga with build-posix as a target folder
 echo 'building llama.cp, ggml for tsavorite and other binary for posix'
-cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF"
+if [ "$(echo "$1" | tr '[:upper:]' '[:lower:]')" = "release" ];
+then
+    cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF_RELEASE" -DCMAKE_CXX_FLAGS="-DGGML_PERF_RELEASE"
+elif [ "$(echo "$1" | tr '[:upper:]' '[:lower:]')" = "debug" ]; then
+    cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF_DETAIL" -DCMAKE_CXX_FLAGS="-DGGML_PERF_DETAIL"
+else
+    cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF"
+fi
+
 cmake --build build-posix --config Release


 # Fix GLIBC compatibility for TSI binaries
@@ -64,12 +72,21 @@ chmod +x build-posix/bin/llama-cli
 echo 'building llama.cp, ggml for tsavorite and other binary for fpga'
 export CC="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-gcc"
 export CXX="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-g++"
-cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF"
+
+if [ "$(echo "$1" | tr '[:upper:]' '[:lower:]')" = "release" ];
+then
+    cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga -DCMAKE_C_FLAGS="-DGGML_PERF_RELEASE" -DCMAKE_CXX_FLAGS="-DGGML_PERF_RELEASE"
+elif [ "$(echo "$1" | tr '[:upper:]' '[:lower:]')" = "debug" ]; then
+    cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga -DCMAKE_C_FLAGS="-DGGML_PERF_DETAIL" -DCMAKE_CXX_FLAGS="-DGGML_PERF_DETAIL"
+else
+    cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF"
+fi
+
 cmake --build build-fpga --config Release


 echo 'creating tar bundle for fpga'
-TSI_GGML_VERSION=0.0.8
+TSI_GGML_VERSION=0.0.9
 TSI_GGML_BUNDLE_INSTALL_DIR=tsi-ggml
 GGML_TSI_INSTALL_DIR=ggml-tsi-kernel
 TSI_GGML_RELEASE_DIR=/proj/rel/sw/ggml
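
Usage sketch for the updated tsi-pkg-build.sh, based on the branches added above (a hedged example, assuming the script is run from the repository root; the exact invocation may differ in your environment). The first argument selects which perf macro both the build-posix and build-fpga configurations are compiled with:

    ./tsi-pkg-build.sh release   # -DGGML_PERF_RELEASE: per-op summary via ggml_perf_print_totals
    ./tsi-pkg-build.sh debug     # -DGGML_PERF_DETAIL: per-op/backend summary plus a detailed CSV via ggml_perf_write_detailed_csv
    ./tsi-pkg-build.sh           # any other (or no) argument: -DGGML_PERF, per-op/backend summary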