diff --git a/common/log.h b/common/log.h
index c56bb50d95db0..0e23b669fcc22 100644
--- a/common/log.h
+++ b/common/log.h
@@ -90,11 +90,20 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps)
 #define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__)
 #define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
 
+#if ENABLE_LOG
 #define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__)
 #define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__)
 #define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__)
 #define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
 #define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, 0, __VA_ARGS__)
+#else
+#define LOG_INF(...)
+#define LOG_WRN(...)
+#define LOG_ERR(...)
+#define LOG_DBG(...)
+#define LOG_CNT(...)
+#endif
+#define LOG_TSAVORITE(...) LOG_TMPL(GGML_LOG_LEVEL_TSAVORITE, 0, __VA_ARGS__)
 
 #define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__)
 #define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index c81ff03fee810..e6830b63ba8e1 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -554,6 +554,7 @@ extern "C" {
         GGML_LOG_LEVEL_WARN  = 3,
         GGML_LOG_LEVEL_ERROR = 4,
         GGML_LOG_LEVEL_CONT  = 5, // continue previous log
+        GGML_LOG_LEVEL_TSAVORITE = 42,
     };
 
     // this tensor...
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index a19cfb14e0f9f..99c3475fc10cf 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -85,6 +85,7 @@ GGML_API void ggml_log_callback_default(enum ggml_log_level level, const char *
 #define GGML_LOG_ERROR(...) ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
 #define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
 #define GGML_LOG_CONT(...)  ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
+#define GGML_LOG_TSAVORITE(...) ggml_log_internal(GGML_LOG_LEVEL_TSAVORITE , __VA_ARGS__)
 
 #define GGML_DEBUG 0
 
diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
index bc7095eeebf2f..c49d02375921f 100644
--- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
+++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
@@ -504,7 +504,6 @@ static void *ggml_tsavorite_host_malloc(size_t n) {
     GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
     GGML_TSAVORITE_LOG_INFO("\n Allocating memory from tsi_alloc with size %ld \n", n);
-    printf("\n ANoop Allocating memory from tsi_alloc with size %ld \n", n);
     data = tsi_alloc(n);
     GGML_TSAVORITE_LOG_CONT("\n Allocating memory from tsi_alloc with size %ld starting memory %p\n", n, data);
 
@@ -1800,7 +1799,6 @@ static bool ggml_backend_tsavorite_device_supports_buft(ggml_backend_dev_t dev,
 // ggml_backend_sched_backend_id_from_cur -> ggml_backend_offload_op ->
 static bool ggml_backend_tsavorite_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) {
-    // printf("\n ANoop Calling %s \n ", __func__);
     if (op->type != GGML_TYPE_F32)
         return false;
     switch (op->op) {
@@ -1894,8 +1892,9 @@ static struct ggml_backend_reg_i ggml_backend_tsavorite_reg_i = {
     /* .get_proc_address = */ NULL,
 };
 
+
 ggml_backend_reg_t ggml_backend_tsavorite_reg(void) {
-    ggml_tsavorite_log_type_val = GGML_TSAVORITE_LOG_ERROR;
+    ggml_tsavorite_log_type_val = GGML_TSAVORITE_LOG_NONE;
     ggml_tsavorite_kernel_mode_flag = GGML_TSAVORITE_KERNEL_MODE_MLIR;
     GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
     g_ggml_backend_tsavorite_reg.iface = ggml_backend_tsavorite_reg_i;
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 57d3e39adf758..134b7420de746 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -249,6 +249,7 @@ static void ggml_log_internal_v(enum ggml_log_level level, const char * format,
 void ggml_log_internal(enum ggml_log_level level, const char * format, ...) {
     va_list args;
     va_start(args, format);
+    if (level == GGML_LOG_LEVEL_TSAVORITE)
     ggml_log_internal_v(level, format, args);
     va_end(args);
 }
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 85b4324b699e6..984dbf14d14ae 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2615,13 +2615,19 @@ void llama_perf_context_print(const llama_context * ctx) {
     const auto data = llama_perf_context(ctx);
 
     const double t_end_ms = 1e-3 * ggml_time_us();
 
-    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, data.t_load_ms);
     LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
     LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
     LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+
+    LLAMA_LOG_TSAVORITE("%s:        load time = %10.2f ms\n", __func__, data.t_load_ms);
+    LLAMA_LOG_TSAVORITE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+    LLAMA_LOG_TSAVORITE("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+    LLAMA_LOG_TSAVORITE("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
 }
 
 void llama_perf_context_reset(llama_context * ctx) {
diff --git a/src/llama-impl.h b/src/llama-impl.h
index 02b1d07f8400d..abc963a4a14e7 100644
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@@ -29,6 +29,7 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
 #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
 #define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
 #define LLAMA_LOG_CONT(...)  llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
+#define LLAMA_LOG_TSAVORITE(...) llama_log_internal(GGML_LOG_LEVEL_TSAVORITE, __VA_ARGS__)
 
 //
 // helpers
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 804b11e0a943e..d012a0ce520e0 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -2562,6 +2562,8 @@ void llama_perf_sampler_print(const struct llama_sampler * chain) {
 
     LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
+    LLAMA_LOG_TSAVORITE("\n\n%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)",
+            __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
 }
 
 void llama_perf_sampler_reset(struct llama_sampler * chain) {
diff --git a/tools/main/main.cpp b/tools/main/main.cpp
index 1bd2be2d94f51..26842116ec6df 100644
--- a/tools/main/main.cpp
+++ b/tools/main/main.cpp
@@ -41,6 +41,12 @@ static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting  = false;
 static bool need_insert_eot = false;
 
+static void my_logger(ggml_log_level level, const char *text, void *user_data) {
+    if (level == GGML_LOG_LEVEL_TSAVORITE) {
+        fprintf(stderr, "%s", text); // only show Tsavorite-level messages
+    }
+}
+
 static void print_usage(int argc, char ** argv) {
     (void) argc;
 
@@ -120,6 +126,7 @@ int main(int argc, char ** argv) {
         LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }
 
+    llama_log_set(my_logger, nullptr);
     LOG_INF("%s: llama backend init\n", __func__);
     llama_backend_init();
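
For review context, the snippet below is not part of the patch: it is a minimal, self-contained sketch of the logging path the patch sets up. The "demo_" names are illustrative stand-ins for the real ggml/llama symbols; the real flow is that LOG_TSAVORITE / LLAMA_LOG_TSAVORITE / GGML_LOG_TSAVORITE emit at GGML_LOG_LEVEL_TSAVORITE, the patched ggml_log_internal forwards only that level, and the my_logger callback registered via llama_log_set prints it to stderr while everything else is dropped (regular LOG_INF/LOG_WRN/... are also compiled out unless ENABLE_LOG is defined).

    #include <cstdarg>
    #include <cstdio>

    // Illustrative stand-ins for the real ggml/llama log level and callback.
    enum demo_log_level { DEMO_LOG_LEVEL_INFO = 2, DEMO_LOG_LEVEL_TSAVORITE = 42 };

    // Application callback: print only TSAVORITE-level text, mirroring my_logger.
    static void demo_logger(demo_log_level level, const char * text, void * /*user_data*/) {
        if (level == DEMO_LOG_LEVEL_TSAVORITE) {
            fprintf(stderr, "%s", text);
        }
    }

    // Internal dispatcher: forward only TSAVORITE messages, mirroring the
    // level check added to ggml_log_internal.
    static void demo_log_internal(demo_log_level level, const char * format, ...) {
        if (level != DEMO_LOG_LEVEL_TSAVORITE) {
            return; // all other levels are silently dropped
        }
        char buf[256];
        va_list args;
        va_start(args, format);
        vsnprintf(buf, sizeof(buf), format, args);
        va_end(args);
        demo_logger(level, buf, nullptr);
    }

    #define DEMO_LOG_INFO(...)      demo_log_internal(DEMO_LOG_LEVEL_INFO,      __VA_ARGS__)
    #define DEMO_LOG_TSAVORITE(...) demo_log_internal(DEMO_LOG_LEVEL_TSAVORITE, __VA_ARGS__)

    int main() {
        DEMO_LOG_INFO("suppressed: %s\n", "regular info");        // filtered out
        DEMO_LOG_TSAVORITE("shown: eval time = %.2f ms\n", 12.3); // reaches stderr
        return 0;
    }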