diff --git a/README.md b/README.md
index f222c9a1a8ae1..97be1e621c20d 100644
--- a/README.md
+++ b/README.md
@@ -619,6 +619,9 @@ cd ../../
 #Compile for posix with build-posix as a target folder
 cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix
 
+#To enable perf stats (GGML_PERF), use the command below
+cmake -B build-posix -DCMAKE_BUILD_TYPE=Debug -DGGML_TSAVORITE=ON -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF"
+
 cmake --build build-posix --config Release
 
 #Compile for fpga with build-fpga as a target folder
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index e6830b63ba8e1..d739f4c44e0c7 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -317,6 +317,14 @@ extern "C" {
     GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
     GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
 
+#ifdef GGML_PERF
+    enum ggml_compute_backend_type {
+        GGML_COMPUTE_BACKEND_CPU = 0,
+        GGML_COMPUTE_BACKEND_TSAVORITE,
+        GGML_COMPUTE_BACKEND_COUNT
+    };
+#endif /* GGML_PERF */
+
     enum ggml_status {
         GGML_STATUS_ALLOC_FAILED = -2,
         GGML_STATUS_FAILED       = -1,
@@ -603,8 +611,14 @@ extern "C" {
         char name[GGML_MAX_NAME];
 
         void * extra; // extra things e.g. for ggml-cuda.cu
-
+#ifdef GGML_PERF
+        int64_t perf_runs;
+        int64_t perf_time_us;
+        enum ggml_compute_backend_type ggml_compute_backend;
+        char padding[4];
+#else
         char padding[8];
+#endif /* GGML_PERF */
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -2197,6 +2211,23 @@ extern "C" {
     GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
     GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
     GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
+
+#ifdef GGML_PERF
+// internal perf accumulation struct
+struct ggml_perf_totals {
+    int op_count;
+    int64_t total_us;
+    int64_t runs;
+    const char * op_name;
+};
+
+FILE * ggml_perf_log_open(const char *filename);
+void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp);
+
+// capture perf into totals
+void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph);
+
+#endif /* GGML_PERF */
 
 #ifdef __cplusplus
 }
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 46f75ad97cd61..221182445ea34 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2844,8 +2844,21 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];
 
+#ifdef GGML_PERF
+        int64_t t_start = ggml_time_us();
+#endif
         ggml_compute_forward(&params, node);
+#ifdef GGML_PERF
+        int64_t t_end = ggml_time_us();
+        node->perf_runs++;
+        if (t_end >= t_start) {
+            node->perf_time_us += (t_end - t_start);
+        } else {
+            // Handle wraparound by assuming timer rolls over at max int64_t value
+            node->perf_time_us += (INT64_MAX - t_start + t_end + 1);
+        }
+#endif
 
         if (state->ith == 0 && cplan->abort_callback &&
             cplan->abort_callback(cplan->abort_callback_data)) {
             atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
index c49d02375921f..cdb9842bf7a0f 100644
--- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
+++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
@@ -813,6 +813,9 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
     tensor_log log_data;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
+#ifdef GGML_PERF
+        int64_t t_start = ggml_time_us();
+#endif
         node = cgraph->nodes[i];
         src0 = node->src[0];
         src1 = node->src[1];
@@ -1122,6 +1125,17 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
             device->stats.op_run_count[kernel_type].max_num_of_elem < max_num_of_elem)
             device->stats.op_run_count[kernel_type].max_num_of_elem = max_num_of_elem;
         }
+#ifdef GGML_PERF
+        int64_t t_end = ggml_time_us();
+        node->perf_runs++;
+        node->ggml_compute_backend = GGML_COMPUTE_BACKEND_TSAVORITE;
+        if (t_end >= t_start) {
+            node->perf_time_us += (t_end - t_start);
+        } else {
+            // Handle wraparound by assuming timer rolls over at max int64_t value
+            node->perf_time_us += (INT64_MAX - t_start + t_end + 1);
+        }
+#endif
     }
 
     // This this need to implement correctly when we have mixture of CPU and accelerator operation
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 134b7420de746..cddeca9067288 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -250,7 +250,7 @@ void ggml_log_internal(enum ggml_log_level level, const char * format, ...) {
     va_list args;
     va_start(args, format);
     if (level == GGML_LOG_LEVEL_TSAVORITE)
-       ggml_log_internal_v(level, format, args);
+        ggml_log_internal_v(level, format, args);
     va_end(args);
 }
 
@@ -985,6 +985,13 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };
 
+#ifdef GGML_PERF
+static const char * GGML_BACKEND_TYPE[GGML_COMPUTE_BACKEND_COUNT] = {
+    "CPU",
+    "TSAVORITE"
+};
+#endif /* GGML_PERF */
+
 static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1200,6 +1207,12 @@ const char * ggml_op_name(enum ggml_op op) {
     return GGML_OP_NAME[op];
 }
 
+#ifdef GGML_PERF
+static const char * ggml_backend_type(enum ggml_compute_backend_type backend) {
+    return GGML_BACKEND_TYPE[backend];
+}
+#endif /* GGML_PERF */
+
 const char * ggml_op_symbol(enum ggml_op op) {
     return GGML_OP_SYMBOL[op];
 }
@@ -1617,6 +1630,11 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.data         =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
         /*.name         =*/ { 0 },
         /*.extra        =*/ NULL,
+#ifdef GGML_PERF
+        /*.perf_runs            =*/ 0,
+        /*.perf_time_us         =*/ 0,
+        /*.ggml_compute_backend =*/ GGML_COMPUTE_BACKEND_CPU,
+#endif /* GGML_PERF */
         /*.padding      =*/ { 0 },
     };
 
@@ -6549,3 +6567,66 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons
     if (p0->strict_cpu != p1->strict_cpu ) return false;
     return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
 }
+
+#ifdef GGML_PERF
+void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph) {
+    for (int i = 0; i < cgraph->n_nodes; ++i) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+        enum ggml_op op = node->op;
+
+        if (op >= GGML_OP_COUNT) continue;
+
+        totals[op].op_name = ggml_op_name(op);
+        totals[op].total_us += node->perf_time_us;
+        totals[op].runs += node->perf_runs;
+        totals[op].op_count++;
+    }
+}
+
+FILE * ggml_perf_log_open(const char *filename) {
+    // Try to delete existing file, ignore error if it doesn't exist
+    remove(filename);
+
+    // Create a new file in write mode
+    FILE *fp = fopen(filename, "w");
+    if (!fp) {
+        fprintf(stderr, "Error: Could not create file %s\n", filename);
+        return NULL;
+    }
+
+    return fp;
+}
+
+void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp) {
+    if (!fp) return;
+
+    int64_t total_time_us = 0;
+    for (int i = 0; i < cgraph->n_nodes; ++i) {
+        if (cgraph->nodes[i]->perf_runs > 0) {
+            total_time_us += cgraph->nodes[i]->perf_time_us;
+        }
+    }
+
+    fprintf(fp, "ggml_graph_compute_perf: total compute time: %.3f ms\n", total_time_us / 1000.0);
+
+    for (int i = 0; i < cgraph->n_nodes; ++i) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+        if (node->perf_runs == 0) continue;
+
+        double t_ms = node->perf_time_us / 1000.0;
+        double avg_ms = t_ms / node->perf_runs;
+
+        fprintf(fp,
+            " - BACKEND:%s OP:%s: total %.3f ms over %" PRId64 " runs (avg %.3f ms) [shape=%" PRId64 ",%" PRId64 ",%" PRId64 "]\n",
+            ggml_backend_type(node->ggml_compute_backend),
+            ggml_op_name(node->op),
+            t_ms,
+            node->perf_runs,
+            avg_ms,
+            node->ne[0], node->ne[1], node->ne[2]);
+    }
+
+    fprintf(fp, "--------------------------------------------------\n\n");
+}
+
+#endif /* GGML_PERF */
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 984dbf14d14ae..4d8b6fdf5a74b 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -932,6 +932,9 @@ int llama_context::decode(llama_batch & inp_batch) {
     kv_self_update();
 
     int64_t n_outputs_prev = 0;
+#ifdef GGML_PERF
+    FILE *perf_all_shape_fp = ggml_perf_log_open("ggml_perf-all-shape.log");
+#endif /* GGML_PERF */
 
     while (sbatch.n_tokens > 0) {
         llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled);
@@ -971,6 +974,12 @@ int llama_context::decode(llama_batch & inp_batch) {
         res->set_inputs(&ubatch);
 
         const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1);
+#ifdef GGML_PERF
+        if (perf_all_shape_fp) {
+            ggml_perf_write_detailed_csv(gf, perf_all_shape_fp);
+        }
+        ggml_perf_accumulate(perf_totals, gf);
+#endif /* GGML_PERF */
         if (compute_status != GGML_STATUS_SUCCESS) {
             switch (compute_status) {
                 case GGML_STATUS_ABORTED:
@@ -2611,6 +2620,21 @@ llama_perf_context_data llama_perf_context(const llama_context * ctx) {
     return data;
 }
 
+#ifdef GGML_PERF
+void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
+    LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
+    for (int i = 0; i < GGML_OP_COUNT; ++i) {
+        if (totals[i].runs > 0) {
+            LLAMA_LOG_TSAVORITE(" %-16s: %5ld runs, %8ld us total, avg %.2f us\n",
+                totals[i].op_name ? totals[i].op_name : "UNKNOWN",
+                totals[i].runs,
+                totals[i].total_us,
+                (double)totals[i].total_us / totals[i].runs);
+        }
+    }
+}
+#endif /* GGML_PERF */
+
 void llama_perf_context_print(const llama_context * ctx) {
     const auto data = llama_perf_context(ctx);
 
@@ -2622,12 +2644,16 @@ void llama_perf_context_print(const llama_context * ctx) {
             __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
     LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
 
+#ifdef GGML_PERF
     LLAMA_LOG_TSAVORITE("%s:        load time = %10.2f ms\n", __func__, data.t_load_ms);
     LLAMA_LOG_TSAVORITE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
     LLAMA_LOG_TSAVORITE("%s:        eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
     LLAMA_LOG_TSAVORITE("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+
+    ggml_perf_print_totals(const_cast<struct ggml_perf_totals *>(ctx->perf_totals));
+#endif /* GGML_PERF */
 }
 
 void llama_perf_context_reset(llama_context * ctx) {
diff --git a/src/llama-context.h b/src/llama-context.h
index c0ceacb10ce6f..918004be9bb8e 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -8,6 +8,7 @@
 
 #include "ggml-cpp.h"
 #include "ggml-opt.h"
+#include "ggml.h"
 
 #include <map>
 #include <vector>
@@ -183,6 +184,9 @@ struct llama_context {
     ggml_status graph_compute(
             ggml_cgraph * gf,
                    bool   batched);
+#ifdef GGML_PERF
+    struct ggml_perf_totals perf_totals[GGML_OP_COUNT] = {}; // per-op perf totals, accumulated in decode()
+#endif
 
 private:
     llm_graph_result_ptr graph_build(
diff --git a/tools/main/main.cpp b/tools/main/main.cpp
index 26842116ec6df..d736ef9e515c6 100644
--- a/tools/main/main.cpp
+++ b/tools/main/main.cpp
@@ -126,7 +126,9 @@ int main(int argc, char ** argv) {
         LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }
 
+#ifdef GGML_PERF
     llama_log_set(my_logger, nullptr);
+#endif /* GGML_PERF */
 
     LOG_INF("%s: llama backend init\n", __func__);
     llama_backend_init();
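
Note for reviewers: a minimal sketch of how the new GGML_PERF helpers fit together outside llama_context::decode(). This is not part of the patch; the harness function dump_graph_perf and the already-computed graph gf are illustrative assumptions only.

// Hypothetical standalone harness (not part of this patch). Assumes a graph that has
// already been computed with GGML_PERF defined, so the per-node perf fields are populated.
#include <stdio.h>
#include "ggml.h"

#ifdef GGML_PERF
static void dump_graph_perf(struct ggml_cgraph * gf) {
    // Per-node log, same file name used by llama_context::decode()
    FILE * fp = ggml_perf_log_open("ggml_perf-all-shape.log");
    if (fp) {
        ggml_perf_write_detailed_csv(gf, fp); // one line per computed node
        fclose(fp);
    }

    // Fold per-node timings into per-op totals, then print a summary
    struct ggml_perf_totals totals[GGML_OP_COUNT] = {0};
    ggml_perf_accumulate(totals, gf);
    for (int i = 0; i < GGML_OP_COUNT; ++i) {
        if (totals[i].runs > 0) {
            printf("%-16s: %lld runs, %lld us total\n",
                   totals[i].op_name ? totals[i].op_name : "UNKNOWN",
                   (long long) totals[i].runs,
                   (long long) totals[i].total_us);
        }
    }
}
#endif /* GGML_PERF */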