Merged
src/llama-context.cpp: 6 additions, 6 deletions

@@ -248,7 +248,10 @@ llama_context::llama_context(

     LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());

-    const size_t max_nodes = this->graph_max_nodes();
+    const uint32_t n_seqs = cparams.n_seq_max;
+    const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+
+    const size_t max_nodes = this->graph_max_nodes(n_tokens);

     LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);

@@ -300,9 +303,6 @@ llama_context::llama_context(

     cross.v_embd.clear();

-    const uint32_t n_seqs = cparams.n_seq_max;
-    const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
     // avoid reserving graphs with zero outputs - assume one output per sequence
     n_outputs = n_seqs;

@@ -1386,9 +1386,9 @@ void llama_context::output_reorder() {
 // graph
 //

-uint32_t llama_context::graph_max_nodes() const {
+uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) {
     if (model.arch == LLM_ARCH_QWEN3NEXT) {
-        return std::max<uint32_t>(8192u, 32u*model.n_tensors());
+        return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
     }
     return std::max<uint32_t>(1024u, 8u*model.n_tensors());
 }
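For context, a minimal sketch of the node-budget arithmetic this hunk changes: the old QWEN3NEXT budget used a fixed 8192-node floor, while the new one scales with the per-ubatch token count. The sample values for n_tokens and n_tensors below are illustrative assumptions, not taken from the diff.

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    // illustrative assumptions: a 2048-token ubatch and a model with ~1000 tensors
    const uint32_t n_tokens  = 2048;
    const uint32_t n_tensors = 1000;

    // old QWEN3NEXT budget: fixed 8192-node floor, independent of batch size
    const uint32_t old_budget = std::max<uint32_t>(8192u, 32u * n_tensors);

    // new QWEN3NEXT budget: 40 nodes per token, so larger ubatches get more room
    const uint32_t new_budget = std::max<uint32_t>(n_tokens * 40, 32u * n_tensors);

    // prints "old: 32000, new: 81920" for these sample values
    std::printf("old: %u, new: %u\n", old_budget, new_budget);
    return 0;
}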
src/llama-context.h: 1 addition, 1 deletion

@@ -197,7 +197,7 @@ struct llama_context {
     //

 public:
-    uint32_t graph_max_nodes() const;
+    uint32_t graph_max_nodes(uint32_t n_tokens);

     // can reuse the llm_graph_result instance of the context (for example to update a memory module)
     llm_graph_result * get_gf_res_reserve() const;