12 changes: 6 additions & 6 deletions ggml/include/ggml.h
@@ -343,13 +343,13 @@ extern "C" {
GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);

#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
enum ggml_compute_backend_type {
GGML_COMPUTE_BACKEND_CPU=0,
GGML_COMPUTE_BACKEND_TSAVORITE,
GGML_COMPUTE_BACKEND_COUNT
};
#endif /* GGML_PERF || GGML_PERF_DETAIL */
#endif /* GGML_PERF-related flags */

enum ggml_status {
GGML_STATUS_ALLOC_FAILED = -2,
@@ -660,14 +660,14 @@ extern "C" {

void * extra; // extra things e.g. for ggml-cuda.cu

#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
int64_t perf_runs;
int64_t perf_time_us;
enum ggml_compute_backend_type ggml_compute_backend;
char padding[4];
#else
char padding[8];
#endif /* GGML_PERF || GGML_PERF_DETAIL */
#endif /* GGML_PERF-related flags */
};

static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -2557,7 +2557,7 @@ extern "C" {
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);

#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
struct ggml_perf_backend_subtotals {
int64_t total_us;
int64_t runs;
@@ -2587,7 +2587,7 @@ void ggml_perf_write_detailed_csv(struct ggml_cgraph * cgraph, FILE *fp);
void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph);
const char * ggml_backend_type(enum ggml_compute_backend_type backend);

#endif /* GGML_PERF || GGML_PERF_DETAIL */
#endif /* GGML_PERF-related flags */

#ifdef __cplusplus
}
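For context, all three flags gate the new per-tensor counters (`perf_runs`, `perf_time_us`, `ggml_compute_backend`). A minimal sketch of how a caller could read them back — the helper below is hypothetical and not part of this PR:

```c
#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
// Hypothetical helper (not in this PR): average microseconds per run for a node,
// derived from the perf fields added to struct ggml_tensor above.
static double ggml_node_avg_us(const struct ggml_tensor * node) {
    return node->perf_runs > 0
        ? (double) node->perf_time_us / (double) node->perf_runs
        : 0.0;
}
#endif
```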
8 changes: 4 additions & 4 deletions ggml/src/ggml-cpu/ggml-cpu.c
@@ -2879,12 +2879,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
struct ggml_tensor * node = cgraph->nodes[node_n];

#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
int64_t t_start = ggml_time_us();
#endif /* GGML_PERF || GGML_PERF_DETAIL */
#endif /* GGML_PERF-related flags */
ggml_compute_forward(&params, node);

#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
int64_t t_end = ggml_time_us();
node->perf_runs++;
if (t_end >= t_start) {
@@ -2893,7 +2893,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
// Handle wraparound by assuming timer rolls over at max int64_t value
node->perf_time_us += (INT64_MAX - t_start + t_end + 1);
}
#endif /* GGML_PERF || GGML_PERF_DETAIL */
#endif /* GGML_PERF-related flags */
if (state->ith == 0 && cplan->abort_callback &&
cplan->abort_callback(cplan->abort_callback_data)) {
atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
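The timing logic is the same in both backends; a condensed sketch of the accumulation, assuming the collapsed branch adds the plain `t_end - t_start` difference and that `ggml_time_us()` only wraps once it exceeds `INT64_MAX`:

```c
int64_t t_start = ggml_time_us();
ggml_compute_forward(&params, node);            // run the op
int64_t t_end = ggml_time_us();

node->perf_runs++;
if (t_end >= t_start) {
    node->perf_time_us += t_end - t_start;      // normal case (assumed from the collapsed lines)
} else {
    // timer wrapped past INT64_MAX; count the elapsed time across the wrap
    node->perf_time_us += INT64_MAX - t_start + t_end + 1;
}
```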
8 changes: 4 additions & 4 deletions ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
@@ -929,9 +929,9 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,

for (int i = 0; i < cgraph->n_nodes; i++) {
int32_t kernel_sub_type=-1;
#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
int64_t t_start = ggml_time_us();
#endif /* GGML_PERF || GGML_PERF_DETAIL */
#endif /* GGML_PERF-related flags */
node = cgraph->nodes[i];
src0 = node->src[0];
src1 = node->src[1];
@@ -1279,7 +1279,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
device->stats.op_run_count[kernel_type].max_num_of_elem < max_num_of_elem)
device->stats.op_run_count[kernel_type].max_num_of_elem = max_num_of_elem;
}
#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
int64_t t_end = ggml_time_us();
node->perf_runs++;
node->ggml_compute_backend = GGML_COMPUTE_BACKEND_TSAVORITE;
@@ -1289,7 +1289,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
// Handle wraparound by assuming timer rolls over at max int64_t value
node->perf_time_us += (INT64_MAX - t_start + t_end + 1);
}
#endif /* GGML_PERF || GGML_PERF_DETAIL */
#endif /* GGML_PERF-related flags */
}

// This needs to be implemented correctly once we have a mixture of CPU and accelerator operations
16 changes: 8 additions & 8 deletions ggml/src/ggml.c
@@ -1020,12 +1020,12 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"GLU",
};

#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
static const char * GGML_BACKEND_TYPE[GGML_COMPUTE_BACKEND_COUNT] = {
"CPU",
"OPU"
};
#endif /* GGML_PERF || GGML_PERF_DETAIL */
#endif /* GGML_PERF-related flags */

static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");

@@ -1262,11 +1262,11 @@ const char * ggml_op_name(enum ggml_op op) {
return GGML_OP_NAME[op];
}

#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
const char * ggml_backend_type(enum ggml_compute_backend_type backend) {
return GGML_BACKEND_TYPE[backend];
}
#endif /* GGML_PERF || GGML_PERF_DETAIL */
#endif /* GGML_PERF-related flags */

const char * ggml_op_symbol(enum ggml_op op) {
return GGML_OP_SYMBOL[op];
@@ -1692,11 +1692,11 @@ static struct ggml_tensor * ggml_new_tensor_impl(
/*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
/*.name =*/ { 0 },
/*.extra =*/ NULL,
#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
/*.perf_runs =*/ 0,
/*.perf_time_us =*/ 0,
/*.ggml_compute_backend =*/ GGML_COMPUTE_BACKEND_CPU,
#endif /* GGML_PERF || GGML_PERF_DETAIL */
#endif /* GGML_PERF-related flags */
/*.padding =*/ { 0 },
};

@@ -7231,7 +7231,7 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons
return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
}

#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct ggml_cgraph * cgraph) {
for (int i = 0; i < cgraph->n_nodes; ++i) {
struct ggml_tensor * node = cgraph->nodes[i];
@@ -7258,7 +7258,7 @@ void ggml_perf_accumulate(struct ggml_perf_totals totals[GGML_OP_COUNT], struct
}
}
}
#endif /* GGML_PERF || GGML_PERF_DETAIL */
#endif /* GGML_PERF-related flags */

#if defined(GGML_PERF_DETAIL)
FILE * ggml_perf_log_open(const char *filename) {
14 changes: 7 additions & 7 deletions src/llama-context.cpp
@@ -1090,12 +1090,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
ggml_status status;
const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);

#if defined(GGML_PERF)
#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE)
ggml_perf_accumulate(perf_totals, res->get_gf());
#elif defined(GGML_PERF_DETAIL)
ggml_perf_accumulate(perf_totals, res->get_gf());
ggml_perf_write_detailed_csv(res->get_gf(), perf_all_shape_fp);
#endif /* GGML_PERF || GGML_PERF_DETAIL */
#endif /* GGML_PERF-related flags */


if (!res) {
@@ -2759,7 +2759,7 @@ llama_perf_context_data llama_perf_context(const llama_context * ctx) {
}


#if defined(GGML_PERF)
#if defined(GGML_PERF_RELEASE)
void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
LLAMA_LOG_TSAVORITE(" %-16s %7s %14s %16s\n", "Op", "Runs", "Total us", "Avg us");
@@ -2788,7 +2788,7 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
}
}

#elif defined(GGML_PERF_DETAIL)
#elif defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
LLAMA_LOG_TSAVORITE(" %-16s %-8s %7s %14s %16s\n", "Op", "Target", "Runs", "Total us", "Avg us");
@@ -2838,7 +2838,7 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
}
}
}
#endif /* GGML_PERF || GGML_PERF_DETAI */
#endif /* GGML_PERF-related flags */


void llama_perf_context_print(const llama_context * ctx) {
@@ -2852,7 +2852,7 @@ void llama_perf_context_print(const llama_context * ctx) {
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));

#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
LLAMA_LOG_TSAVORITE("\n%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
LLAMA_LOG_TSAVORITE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
@@ -2861,7 +2861,7 @@ void llama_perf_context_print(const llama_context * ctx) {
LLAMA_LOG_TSAVORITE("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));

ggml_perf_print_totals(const_cast<ggml_perf_totals *>(ctx->perf_totals));
#endif /* GGML_PERF || GGML_PERF_DETAIL */
#endif /* GGML_PERF-related flags */
}

void llama_perf_context_reset(llama_context * ctx) {
4 changes: 2 additions & 2 deletions src/llama-context.h
@@ -198,9 +198,9 @@ struct llama_context {

// reserve a graph with a dummy ubatch of the specified size
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);
#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
struct ggml_perf_totals perf_totals[GGML_OP_COUNT] = {}; // add this to llama_context
#endif /* GGML_PERF || GGML_PERF_DETAIL */
#endif /* GGML_PERF-related flags */

private:
llm_graph_params graph_params(
4 changes: 2 additions & 2 deletions tools/main/main.cpp
@@ -126,9 +126,9 @@ int main(int argc, char ** argv) {
LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
}

#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
#if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
llama_log_set(my_logger, nullptr);
#endif /* GGML_PERF || GGML_PERF_DETAIL */
#endif /* GGML_PERF-related flags */
LOG_INF("%s: llama backend init\n", __func__);

llama_backend_init();
25 changes: 21 additions & 4 deletions tsi-pkg-build.sh
@@ -32,10 +32,18 @@ cd ../posix-kernel/

cd ../../

#Compile for posix with build-posix as a target folder
#Compile for posix & fpga with build-posix as a target folder

echo 'building llama.cpp, ggml for tsavorite and other binaries for posix'
cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF"
if [ "$(echo "$1" | tr '[:upper:]' '[:lower:]')" = "release" ];
then
cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF_RELEASE" -DCMAKE_CXX_FLAGS="-DGGML_PERF_RELEASE"
elif [ "$(echo "$1" | tr '[:upper:]' '[:lower:]')" = "debug" ]; then
cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF_DETAIL" -DCMAKE_CXX_FLAGS="-DGGML_PERF_DETAIL"
else
cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF"
fi

cmake --build build-posix --config Release

# Fix GLIBC compatibility for TSI binaries
@@ -64,12 +72,21 @@ chmod +x build-posix/bin/llama-cli
echo 'building llama.cpp, ggml for tsavorite and other binaries for fpga'
export CC="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-gcc"
export CXX="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-g++"
cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF"

if [ "$(echo "$1" | tr '[:upper:]' '[:lower:]')" = "release" ];
then
cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga -DCMAKE_C_FLAGS="-DGGML_PERF_RELEASE" -DCMAKE_CXX_FLAGS="-DGGML_PERF_RELEASE"
elif [ "$(echo "$1" | tr '[:upper:]' '[:lower:]')" = "debug" ]; then
cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga -DCMAKE_C_FLAGS="-DGGML_PERF_DETAIL" -DCMAKE_CXX_FLAGS="-DGGML_PERF_DETAIL"
else
cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF"
fi

cmake --build build-fpga --config Release


echo 'creating tar bundle for fpga'
TSI_GGML_VERSION=0.0.8
TSI_GGML_VERSION=0.0.9
TSI_GGML_BUNDLE_INSTALL_DIR=tsi-ggml
GGML_TSI_INSTALL_DIR=ggml-tsi-kernel
TSI_GGML_RELEASE_DIR=/proj/rel/sw/ggml
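With these changes the first argument to `tsi-pkg-build.sh` (matched case-insensitively) selects the perf macro for both the posix and fpga builds: `./tsi-pkg-build.sh release` builds with `-DGGML_PERF_RELEASE`, `./tsi-pkg-build.sh debug` builds with `-DGGML_PERF_DETAIL` (per-shape CSV output), and any other or missing argument falls back to the existing `-DGGML_PERF` build.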