3 changes: 3 additions & 0 deletions README.md
@@ -619,6 +619,9 @@ cd ../../
#Compile for posix with build-posix as a target folder

cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix
#To enable perf stats, use the command below
cmake -B build-posix -DCMAKE_BUILD_TYPE=Debug -DGGML_TSAVORITE=ON -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF"

cmake --build build-posix --config Release

#Compile for fpga with build-fpga as a target folder
33 changes: 32 additions & 1 deletion ggml/include/ggml.h
@@ -317,6 +317,14 @@ extern "C" {
GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);

#ifdef GGML_PERF
enum ggml_compute_backend_type {
GGML_COMPUTE_BACKEND_CPU=0,
GGML_COMPUTE_BACKEND_TSAVORITE,
GGML_COMPUTE_BACKEND_COUNT
};
#endif /* GGML_PERF */

enum ggml_status {
GGML_STATUS_ALLOC_FAILED = -2,
GGML_STATUS_FAILED = -1,
@@ -603,8 +611,14 @@ extern "C" {
char name[GGML_MAX_NAME];

void * extra; // extra things e.g. for ggml-cuda.cu

#ifdef GGML_PERF
int64_t perf_runs;
int64_t perf_time_us;
enum ggml_compute_backend_type ggml_compute_backend;
char padding[4];
#else
char padding[8];
#endif /* GGML_PERF */
};

static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -2197,6 +2211,23 @@ extern "C" {
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);

#ifdef GGML_PERF
// internal perf accumulation struct
struct ggml_perf_totals {
int op_count;
int64_t total_us;
int64_t runs;
const char * op_name;
};

FILE * ggml_perf_log_open(const char *filename);
void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp);

// capture perf into totals
void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph);

#endif /* GGML_PERF */

#ifdef __cplusplus
}
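A minimal sketch of how these hooks are intended to fit together, mirroring the llama-context.cpp changes further down in this PR; the perf_capture() wrapper and its static file handle are illustrative only, not part of the API:

#include <stdio.h>
#include "ggml.h"

#ifdef GGML_PERF
// Illustrative wrapper (hypothetical): fold one computed graph into per-op
// totals and append its per-node timings to a log file.
static void perf_capture(struct ggml_cgraph * gf) {
    static struct ggml_perf_totals totals[GGML_OP_COUNT] = {0};
    static FILE * fp = NULL;

    if (!fp) {
        fp = ggml_perf_log_open("ggml_perf-all-shape.log");
    }
    if (fp) {
        ggml_perf_write_detailed_csv(gf, fp); // per-node timings for this graph
    }
    ggml_perf_accumulate(totals, gf);         // per-op running totals
}
#endif /* GGML_PERF */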
13 changes: 13 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu.c
@@ -2844,8 +2844,21 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
struct ggml_tensor * node = cgraph->nodes[node_n];

#ifdef GGML_PERF
int64_t t_start = ggml_time_us();
#endif
ggml_compute_forward(&params, node);

#ifdef GGML_PERF
int64_t t_end = ggml_time_us();
node->perf_runs++;
if (t_end >= t_start) {
node->perf_time_us += (t_end - t_start);
} else {
// Handle wraparound by assuming timer rolls over at max int64_t value
node->perf_time_us += (INT64_MAX - t_start + t_end + 1);
}
#endif
if (state->ith == 0 && cplan->abort_callback &&
cplan->abort_callback(cplan->abort_callback_data)) {
atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
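The same wraparound-safe accumulation is repeated in the tsavorite backend below; a small helper along these lines (hypothetical, not part of this change) would capture the shared pattern:

#include <stdint.h>
#include "ggml.h"

#ifdef GGML_PERF
// Hypothetical helper: wraparound-safe accumulation of elapsed microseconds
// into a node's perf counters, as done in both ggml-cpu.c and ggml-tsavorite.cpp.
static inline void ggml_perf_node_add_elapsed(struct ggml_tensor * node,
                                              int64_t t_start, int64_t t_end) {
    node->perf_runs++;
    if (t_end >= t_start) {
        node->perf_time_us += t_end - t_start;
    } else {
        // the timer is assumed to roll over at INT64_MAX
        node->perf_time_us += INT64_MAX - t_start + t_end + 1;
    }
}
#endif /* GGML_PERF */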
14 changes: 14 additions & 0 deletions ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
@@ -813,6 +813,9 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
tensor_log log_data;

for (int i = 0; i < cgraph->n_nodes; i++) {
#ifdef GGML_PERF
int64_t t_start = ggml_time_us();
#endif
node = cgraph->nodes[i];
src0 = node->src[0];
src1 = node->src[1];
@@ -1122,6 +1125,17 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
device->stats.op_run_count[kernel_type].max_num_of_elem < max_num_of_elem)
device->stats.op_run_count[kernel_type].max_num_of_elem = max_num_of_elem;
}
#ifdef GGML_PERF
int64_t t_end = ggml_time_us();
node->perf_runs++;
node->ggml_compute_backend = GGML_COMPUTE_BACKEND_TSAVORITE;
if (t_end >= t_start) {
node->perf_time_us += (t_end - t_start);
} else {
// Handle wraparound by assuming timer rolls over at max int64_t value
node->perf_time_us += (INT64_MAX - t_start + t_end + 1);
}
#endif
}

// This needs to be implemented correctly when we have a mixture of CPU and accelerator operations
83 changes: 82 additions & 1 deletion ggml/src/ggml.c
@@ -250,7 +250,7 @@ void ggml_log_internal(enum ggml_log_level level, const char * format, ...) {
va_list args;
va_start(args, format);
if (level == GGML_LOG_LEVEL_TSAVORITE)
ggml_log_internal_v(level, format, args);
ggml_log_internal_v(level, format, args);
va_end(args);
}

@@ -985,6 +985,13 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};

#ifdef GGML_PERF
static const char * GGML_BACKEND_TYPE[GGML_COMPUTE_BACKEND_COUNT] = {
"CPU",
"TSAVORITE"
};
#endif /* GGML_PERF */

static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");

static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1200,6 +1207,12 @@ const char * ggml_op_name(enum ggml_op op) {
return GGML_OP_NAME[op];
}

#ifdef GGML_PERF
static const char * ggml_backend_type(enum ggml_compute_backend_type backend) {
return GGML_BACKEND_TYPE[backend];
}
#endif /* GGML_PERF */

const char * ggml_op_symbol(enum ggml_op op) {
return GGML_OP_SYMBOL[op];
}
@@ -1617,6 +1630,11 @@ static struct ggml_tensor * ggml_new_tensor_impl(
/*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
/*.name =*/ { 0 },
/*.extra =*/ NULL,
#ifdef GGML_PERF
/*.perf_runs =*/ 0,
/*.perf_time_us =*/ 0,
/*.ggml_compute_backend =*/ GGML_COMPUTE_BACKEND_CPU,
#endif /* GGML_PERF */
/*.padding =*/ { 0 },
};

@@ -6549,3 +6567,66 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons
if (p0->strict_cpu != p1->strict_cpu ) return false;
return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
}

#ifdef GGML_PERF
void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph) {
for (int i = 0; i < cgraph->n_nodes; ++i) {
struct ggml_tensor * node = cgraph->nodes[i];
enum ggml_op op = node->op;

if (op >= GGML_OP_COUNT) continue;

totals[op].op_name = ggml_op_name(op);
totals[op].total_us += node->perf_time_us;
totals[op].runs += node->perf_runs;
totals[op].op_count++;
}
}

FILE * ggml_perf_log_open(const char *filename) {
// Try to delete existing file, ignore error if it doesn't exist
remove(filename);

// Create a new file in write mode
FILE *fp = fopen(filename, "w");
if (!fp) {
fprintf(stderr, "Error: Could not create file %s\n", filename);
return NULL;
}

return fp;
}

void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp) {
if (!fp) return;

int64_t total_time_us = 0;
for (int i = 0; i < cgraph->n_nodes; ++i) {
if (cgraph->nodes[i]->perf_runs > 0) {
total_time_us += cgraph->nodes[i]->perf_time_us;
}
}

fprintf(fp, "ggml_graph_compute_perf: total compute time: %.3f ms\n", total_time_us / 1000.0);

for (int i = 0; i < cgraph->n_nodes; ++i) {
struct ggml_tensor * node = cgraph->nodes[i];
if (node->perf_runs == 0) continue;

double t_ms = node->perf_time_us / 1000.0;
double avg_ms = t_ms / node->perf_runs;

fprintf(fp,
" - BACKEND:%s OP:%s: total %.3f ms over %lld runs (avg %.3f ms) [shape=%lld,%lld,%lld]\n",
ggml_backend_type(node->ggml_compute_backend),
ggml_op_name(node->op),
t_ms,
(long long) node->perf_runs,
avg_ms,
(long long) node->ne[0], (long long) node->ne[1], (long long) node->ne[2]);
}

fprintf(fp, "--------------------------------------------------\n\n");
}

#endif /* GGML_PERF */
26 changes: 26 additions & 0 deletions src/llama-context.cpp
@@ -932,6 +932,9 @@ int llama_context::decode(llama_batch & inp_batch) {
kv_self_update();

int64_t n_outputs_prev = 0;
#ifdef GGML_PERF
FILE *perf_all_shape_fp = ggml_perf_log_open("ggml_perf-all-shape.log");
#endif /* GGML_PERF */

while (sbatch.n_tokens > 0) {
llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled);
@@ -971,6 +974,12 @@
res->set_inputs(&ubatch);

const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1);
#ifdef GGML_PERF
if (perf_all_shape_fp) {
ggml_perf_write_detailed_csv(gf, perf_all_shape_fp);
}
ggml_perf_accumulate(perf_totals, gf);
#endif /* GGML_PERF */
if (compute_status != GGML_STATUS_SUCCESS) {
switch (compute_status) {
case GGML_STATUS_ABORTED:
@@ -2611,6 +2620,19 @@ llama_perf_context_data llama_perf_context(const llama_context * ctx) {
return data;
}

void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
for (int i = 0; i < GGML_OP_COUNT; ++i) {
if (totals[i].runs > 0) {
LLAMA_LOG_TSAVORITE(" %-16s: %5lld runs, %8lld us total, avg %.2f us\n",
totals[i].op_name ? totals[i].op_name : "UNKNOWN",
(long long) totals[i].runs,
(long long) totals[i].total_us,
(double) totals[i].total_us / totals[i].runs);
}
}
}

void llama_perf_context_print(const llama_context * ctx) {
const auto data = llama_perf_context(ctx);

@@ -2622,12 +2644,16 @@ void llama_perf_context_print(const llama_context * ctx) {
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));

#ifdef GGML_PERF
LLAMA_LOG_TSAVORITE("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
LLAMA_LOG_TSAVORITE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
LLAMA_LOG_TSAVORITE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
LLAMA_LOG_TSAVORITE("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));

ggml_perf_print_totals(const_cast<ggml_perf_totals *>(ctx->perf_totals));
#endif /* GGML_PERF */
}

void llama_perf_context_reset(llama_context * ctx) {
4 changes: 4 additions & 0 deletions src/llama-context.h
@@ -8,6 +8,7 @@

#include "ggml-cpp.h"
#include "ggml-opt.h"
#include "ggml.h"

#include <map>
#include <vector>
@@ -183,6 +184,9 @@
ggml_status graph_compute(
ggml_cgraph * gf,
bool batched);
#ifdef GGML_PERF
struct ggml_perf_totals perf_totals[GGML_OP_COUNT] = {}; // per-op perf totals for this context, filled during decode()
#endif

private:
llm_graph_result_ptr graph_build(
2 changes: 2 additions & 0 deletions tools/main/main.cpp
@@ -126,7 +126,9 @@ int main(int argc, char ** argv) {
LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
}

#ifdef GGML_PERF
llama_log_set(my_logger, nullptr);
#endif /* GGML_PERF */
LOG_INF("%s: llama backend init\n", __func__);

llama_backend_init();