Merged
13 changes: 7 additions & 6 deletions ggml/include/ggml.h
@@ -343,13 +343,13 @@ extern "C" {
GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);

-#ifdef GGML_PERF
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
enum ggml_compute_backend_type {
GGML_COMPUTE_BACKEND_CPU=0,
GGML_COMPUTE_BACKEND_TSAVORITE,
GGML_COMPUTE_BACKEND_COUNT
};
-#endif /* GGML_PERF */
+#endif /* GGML_PERF || GGML_PERF_DETAIL */

enum ggml_status {
GGML_STATUS_ALLOC_FAILED = -2,
@@ -659,14 +659,15 @@ extern "C" {
char name[GGML_MAX_NAME];

void * extra; // extra things e.g. for ggml-cuda.cu
-#ifdef GGML_PERF
+
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
int64_t perf_runs;
int64_t perf_time_us;
enum ggml_compute_backend_type ggml_compute_backend;
char padding[4];
#else
char padding[8];
-#endif /* GGML_PERF */
+#endif /* GGML_PERF || GGML_PERF_DETAIL */
};

static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -2556,7 +2557,7 @@ extern "C" {
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);

-#ifdef GGML_PERF
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
struct ggml_perf_backend_subtotals {
int64_t total_us;
int64_t runs;
@@ -2586,7 +2587,7 @@ void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp);
void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph);
const char * ggml_backend_type(enum ggml_compute_backend_type backend);

-#endif /* GGML_PERF */
+#endif /* GGML_PERF || GGML_PERF_DETAIL */

#ifdef __cplusplus
}
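A note on the padding arithmetic above: the three perf fields occupy 8 + 8 + 4 = 20 bytes, so 4 bytes of padding keep the tensor's tail a multiple of 8 bytes, mirroring the plain 8-byte pad used when the fields are compiled out. A stand-alone sketch of the same layout (a hypothetical struct, not the real ggml_tensor):

#include <stdint.h>

/* Stand-in mirroring only the tail of ggml_tensor when GGML_PERF or
 * GGML_PERF_DETAIL is defined; the int32_t models the backend enum. */
struct perf_tail {
    int64_t perf_runs;     /* 8 bytes */
    int64_t perf_time_us;  /* 8 bytes */
    int32_t backend;       /* 4 bytes */
    char    padding[4];    /* 20 + 4 = 24, a multiple of 8 */
};
_Static_assert(sizeof(struct perf_tail) == 24, "tail should stay 8-byte aligned");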
8 changes: 4 additions & 4 deletions ggml/src/ggml-cpu/ggml-cpu.c
@@ -2879,12 +2879,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
struct ggml_tensor * node = cgraph->nodes[node_n];

-#ifdef GGML_PERF
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
int64_t t_start = ggml_time_us();
-#endif
+#endif /* GGML_PERF || GGML_PERF_DETAIL */
ggml_compute_forward(&params, node);

-#ifdef GGML_PERF
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
int64_t t_end = ggml_time_us();
node->perf_runs++;
if (t_end >= t_start) {
@@ -2893,7 +2893,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
// Handle wraparound by assuming timer rolls over at max int64_t value
node->perf_time_us += (INT64_MAX - t_start + t_end + 1);
}
-#endif
+#endif /* GGML_PERF || GGML_PERF_DETAIL */
if (state->ith == 0 && cplan->abort_callback &&
cplan->abort_callback(cplan->abort_callback_data)) {
atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
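The wraparound branch guarded above is the subtle part of this hunk; here is the same logic as a free-standing helper (a sketch, not code from the PR; it assumes a monotonic microsecond timer such as ggml_time_us() that restarts at 0 after INT64_MAX):

#include <stdint.h>

/* Elapsed microseconds between two timer samples, tolerating one rollover. */
static int64_t perf_elapsed_us(int64_t t_start, int64_t t_end) {
    if (t_end >= t_start) {
        return t_end - t_start;               /* common case */
    }
    /* the timer wrapped: count from t_start up to INT64_MAX, then 0 to t_end */
    return (INT64_MAX - t_start) + t_end + 1;
}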
8 changes: 4 additions & 4 deletions ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
@@ -929,9 +929,9 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,

for (int i = 0; i < cgraph->n_nodes; i++) {
int32_t kernel_sub_type=-1;
-#ifdef GGML_PERF
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
int64_t t_start = ggml_time_us();
-#endif
+#endif /* GGML_PERF || GGML_PERF_DETAIL */
node = cgraph->nodes[i];
src0 = node->src[0];
src1 = node->src[1];
@@ -1279,7 +1279,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
device->stats.op_run_count[kernel_type].max_num_of_elem < max_num_of_elem)
device->stats.op_run_count[kernel_type].max_num_of_elem = max_num_of_elem;
}
-#ifdef GGML_PERF
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
int64_t t_end = ggml_time_us();
node->perf_runs++;
node->ggml_compute_backend = GGML_COMPUTE_BACKEND_TSAVORITE;
@@ -1289,7 +1289,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
// Handle wraparound by assuming timer rolls over at max int64_t value
node->perf_time_us += (INT64_MAX - t_start + t_end + 1);
}
-#endif
+#endif /* GGML_PERF || GGML_PERF_DETAIL */
}

// This needs to be implemented correctly when we have a mixture of CPU and accelerator operations
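For context on the tagging above: tensors default to GGML_COMPUTE_BACKEND_CPU at creation (see ggml.c below), and this backend retags the nodes it executes, which is how the summary later attributes time per backend. A self-contained sketch of the name mapping (a hypothetical demo; the real table is GGML_BACKEND_TYPE in ggml.c):

#include <stdio.h>

enum compute_backend { BACKEND_CPU = 0, BACKEND_TSAVORITE, BACKEND_COUNT };
static const char * backend_name[BACKEND_COUNT] = { "CPU", "OPU" };

int main(void) {
    printf("%s\n", backend_name[BACKEND_TSAVORITE]); /* prints "OPU" */
    return 0;
}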
18 changes: 10 additions & 8 deletions ggml/src/ggml.c
@@ -1020,12 +1020,12 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"GLU",
};

-#ifdef GGML_PERF
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
static const char * GGML_BACKEND_TYPE[GGML_COMPUTE_BACKEND_COUNT] = {
"CPU",
"OPU"
};
-#endif /* GGML_PERF */
+#endif /* GGML_PERF || GGML_PERF_DETAIL */

static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");

@@ -1262,11 +1262,11 @@ const char * ggml_op_name(enum ggml_op op) {
return GGML_OP_NAME[op];
}

-#ifdef GGML_PERF
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
const char * ggml_backend_type(enum ggml_compute_backend_type backend) {
return GGML_BACKEND_TYPE[backend];
}
-#endif /* GGML_PERF */
+#endif /* GGML_PERF || GGML_PERF_DETAIL */

const char * ggml_op_symbol(enum ggml_op op) {
return GGML_OP_SYMBOL[op];
@@ -1692,11 +1692,11 @@ static struct ggml_tensor * ggml_new_tensor_impl(
/*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
/*.name =*/ { 0 },
/*.extra =*/ NULL,
-#ifdef GGML_PERF
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
/*.perf_runs =*/ 0,
/*.perf_time_us =*/ 0,
/*.ggml_compute_backend =*/ GGML_COMPUTE_BACKEND_CPU,
-#endif /* GGML_PERF */
+#endif /* GGML_PERF || GGML_PERF_DETAIL */
/*.padding =*/ { 0 },
};

@@ -7231,7 +7231,7 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons
return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
}

-#ifdef GGML_PERF
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph) {
for (int i = 0; i < cgraph->n_nodes; ++i) {
struct ggml_tensor * node = cgraph->nodes[i];
@@ -7258,7 +7258,9 @@ void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct
}
}
}
+#endif /* GGML_PERF || GGML_PERF_DETAIL */

+#if defined(GGML_PERF_DETAIL)
FILE * ggml_perf_log_open(const char *filename) {
// Try to delete existing file, ignore error if it doesn't exist
remove(filename);
@@ -7326,4 +7328,4 @@ void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp) {

fprintf(fp, "--------------------------------------------------------------------------------------------------------\n\n");
}
-#endif /* GGML_PERF */
+#endif /* GGML_PERF_DETAIL */
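The net effect of the ggml.c changes is a two-tier surface: accumulation compiles under either macro, while the file helpers are detail-only. A compact sketch of the gating, using the names from this PR:

#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
/* shared tier: per-tensor counters, ggml_perf_accumulate(), ggml_backend_type() */
#endif /* GGML_PERF || GGML_PERF_DETAIL */

#if defined(GGML_PERF_DETAIL)
/* detail tier: ggml_perf_log_open(), ggml_perf_write_detailed_csv() */
#endif /* GGML_PERF_DETAIL */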
17 changes: 13 additions & 4 deletions src/llama-context.cpp
@@ -1090,6 +1090,14 @@ int llama_context::decode(const llama_batch & batch_inp) {
ggml_status status;
const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);

+#if defined(GGML_PERF)
+ggml_perf_accumulate(perf_totals, res->get_gf());
+#elif defined(GGML_PERF_DETAIL)
+ggml_perf_accumulate(perf_totals, res->get_gf());
+ggml_perf_write_detailed_csv(res->get_gf(), perf_all_shape_fp);
+#endif /* GGML_PERF || GGML_PERF_DETAIL */
+
+
if (!res) {
// the last ubatch failed or was aborted -> remove all positions of that ubatch from the memory module
llama_pos pos_min[LLAMA_MAX_SEQ];
@@ -2751,7 +2759,7 @@ llama_perf_context_data llama_perf_context(const llama_context * ctx) {
}


-#ifdef GGML_PERF
+#if defined(GGML_PERF)
void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
LLAMA_LOG_TSAVORITE(" %-16s %7s %14s %16s\n", "Op", "Runs", "Total us", "Avg us");
@@ -2779,7 +2787,7 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
}
}
}
-#elif GGML_PERF_DETAIL
+
+#elif defined(GGML_PERF_DETAIL)
void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
LLAMA_LOG_TSAVORITE(" %-16s %-8s %7s %14s %16s\n", "Op", "Target", "Runs", "Total us", "Avg us");
@@ -2843,7 +2852,7 @@ void llama_perf_context_print(const llama_context * ctx) {
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));

-#ifdef GGML_PERF
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
LLAMA_LOG_TSAVORITE("\n%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
LLAMA_LOG_TSAVORITE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
Expand All @@ -2852,7 +2861,7 @@ void llama_perf_context_print(const llama_context * ctx) {
LLAMA_LOG_TSAVORITE("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));

ggml_perf_print_totals(const_cast<ggml_perf_totals *>(ctx->perf_totals));
-#endif /* GGML_PERF */
+#endif /* GGML_PERF || GGML_PERF_DETAIL */
}

void llama_perf_context_reset(llama_context * ctx) {
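A minimal usage sketch of the detail path wired into decode() above. It assumes a build with GGML_PERF_DETAIL; ggml_perf_log_open() and ggml_perf_write_detailed_csv() are the PR's helpers, while the filename and the fclose() teardown are illustrative assumptions, since the open/close wiring is not part of this diff:

#include <stdio.h>
#include "ggml.h"

void perf_detail_example(struct ggml_cgraph * gf) {
    FILE * fp = ggml_perf_log_open("perf_all_shapes.csv"); /* removes any stale file first */
    if (fp) {
        ggml_perf_write_detailed_csv(gf, fp); /* one block per executed graph */
        fclose(fp);                           /* assumed teardown */
    }
}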
4 changes: 2 additions & 2 deletions src/llama-context.h
@@ -198,9 +198,9 @@ struct llama_context {

// reserve a graph with a dummy ubatch of the specified size
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);
-#ifdef GGML_PERF
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
struct ggml_perf_totals perf_totals[GGML_OP_COUNT] = {}; // add this to llama_context
-#endif
+#endif /* GGML_PERF || GGML_PERF_DETAIL */

private:
llm_graph_params graph_params(
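The value-initialised member above gives every llama_context zeroed totals; a hypothetical reset helper in the same spirit (not part of the PR) could clear them between measurement windows:

#include <string.h>

static void perf_totals_reset(struct ggml_perf_totals * totals, size_t n_ops) {
    memset(totals, 0, n_ops * sizeof *totals); /* back to the zeroed initial state */
}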
4 changes: 2 additions & 2 deletions tools/main/main.cpp
@@ -126,9 +126,9 @@ int main(int argc, char ** argv) {
LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
}

-#ifdef GGML_PERF
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
llama_log_set(my_logger, nullptr);
-#endif /* GGML_PERF */
+#endif /* GGML_PERF || GGML_PERF_DETAIL */
LOG_INF("%s: llama backend init\n", __func__);

llama_backend_init();
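my_logger is referenced above but defined outside this diff; a plausible sketch of such a callback, using the ggml_log_callback signature that llama_log_set() expects (the routing policy is an illustrative assumption):

#include <stdio.h>
#include "llama.h"

static void my_logger(enum ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;  /* nullptr is passed in the hunk above */
    fputs(text, level == GGML_LOG_LEVEL_ERROR ? stderr : stdout);
}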