-
Notifications
You must be signed in to change notification settings - Fork 132
Open
Labels
bug: Something isn't working
Description
System Info
Here's a revised version of the issue description with improved wording:
System Specifications:
- CPU Architecture: x86_64
- CPU Memory: 1024GB
- GPU Model: Nvidia L40S
- Docker Image: nvcr.io/nvidia/tritonserver:nvcr.io/nvidia/tritonserver
- TensorRT-LLM Version: 0.16.0
Configuration Details:
Model Information:
- Model Name: /engines/llama3.1-8B
- Model Directory: /models/Meta-Llama-3.1-8B-Instruct
- Engine Directory: /repo/llama3/tensorrt_llm/1
Engine Configuration:
\
How to fix the above error?
Who can help?
No response
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
- quantize the llama3.1-8B model
python ../quantization/quantize.py --model_dir <model_dir> \
--output_dir <output_dir> \
--dtype bfloat16 \
--qformat int4_awq \
--awq_block_size 128 \
--batch_size 12
- build the engine from the quantized checkpoint:
trtllm-build \
--checkpoint_dir <checkpoint_dir> \
--gpt_attention_plugin bfloat16 \
--gemm_plugin bfloat16 \
--output_dir <output_dir>
- the resulting config of the engine:
{
"version": "0.17.0.dev2024121700",
"pretrained_config": {
"mlp_bias": false,
"attn_bias": false,
"rotary_base": 500000.0,
"rotary_scaling": {
"factor": 8.0,
"high_freq_factor": 4.0,
"low_freq_factor": 1.0,
"original_max_position_embeddings": 8192,
"rope_type": "llama3"
},
"residual_mlp": false,
"disable_weight_only_quant_plugin": false,
"moe": {
"num_experts": 0,
"shared_expert_intermediate_size": 0,
"top_k": 0,
"normalization_mode": 1,
"sparse_mixer_epsilon": 0.01,
"tp_mode": 0,
"device_limited_n_group": 0,
"device_limited_topk_group": 0,
"device_limited_routed_scaling_factor": 1.0
},
"remove_duplicated_kv_heads": false,
"fc_after_embed": false,
"use_input_layernorm_in_first_layer": true,
"use_last_layernorm": true,
"layer_idx_offset": 0,
"architecture": "LlamaForCausalLM",
"dtype": "bfloat16",
"vocab_size": 128256,
"hidden_size": 4096,
"num_hidden_layers": 32,
"num_attention_heads": 32,
"hidden_act": "silu",
"logits_dtype": "float16",
"norm_epsilon": 1e-05,
"runtime_defaults": null,
"position_embedding_type": "rope_gpt_neox",
"num_key_value_heads": 8,
"intermediate_size": 14336,
"max_position_embeddings": 131072,
"mapping": {
"world_size": 1,
"gpus_per_node": 8,
"cp_size": 1,
"tp_size": 1,
"pp_size": 1,
"moe_tp_size": 1,
"moe_ep_size": 1
},
"quantization": {
"quant_algo": "W4A16_AWQ",
"kv_cache_quant_algo": null,
"group_size": 128,
"smoothquant_val": 0.5,
"clamp_val": null,
"use_meta_recipe": false,
"has_zero_point": false,
"pre_quant_scale": true,
"exclude_modules": [
"transformer.layers.13.input_layernorm",
"transformer.layers.8.post_layernorm",
"transformer.layers.0.input_layernorm",
"transformer.layers.15.post_layernorm",
"transformer.layers.19.post_layernorm",
"transformer.layers.25.post_layernorm",
"transformer.layers.9.post_layernorm",
"transformer.layers.11.input_layernorm",
"transformer.layers.21.input_layernorm",
"transformer.layers.4.input_layernorm",
"transformer.layers.20.input_layernorm",
"transformer.layers.9.input_layernorm",
"transformer.layers.26.input_layernorm",
"transformer.layers.23.input_layernorm",
"transformer.layers.31.input_layernorm",
"transformer.layers.11.post_layernorm",
"transformer.layers.25.input_layernorm",
"transformer.layers.8.input_layernorm",
"transformer.layers.5.post_layernorm",
"transformer.layers.31.post_layernorm",
"transformer.layers.30.input_layernorm",
"transformer.layers.3.post_layernorm",
"transformer.vocab_embedding",
"transformer.layers.2.input_layernorm",
"transformer.layers.29.post_layernorm",
"transformer.layers.19.input_layernorm",
"transformer.layers.27.post_layernorm",
"transformer.layers.12.post_layernorm",
"transformer.ln_f",
"transformer.layers.1.input_layernorm",
"transformer.layers.6.input_layernorm",
"transformer.layers.5.input_layernorm",
"transformer.layers.0.post_layernorm",
"lm_head",
"transformer.layers.26.post_layernorm",
"transformer.layers.23.post_layernorm",
"transformer.layers.17.post_layernorm",
"transformer.layers.21.post_layernorm",
"transformer.layers.7.input_layernorm",
"transformer.layers.22.post_layernorm",
"transformer.layers.10.post_layernorm",
"transformer.layers.12.input_layernorm",
"transformer.layers.18.input_layernorm",
"transformer.layers.7.post_layernorm",
"transformer.layers.30.post_layernorm",
"transformer.layers.24.post_layernorm",
"transformer.layers.20.post_layernorm",
"transformer.layers.22.input_layernorm",
"transformer.layers.15.input_layernorm",
"transformer.layers.3.input_layernorm",
"transformer.layers.4.post_layernorm",
"transformer.layers.13.post_layernorm",
"transformer.layers.29.input_layernorm",
"transformer.layers.18.post_layernorm",
"transformer.layers.28.input_layernorm",
"transformer.layers.16.input_layernorm",
"transformer.layers.1.post_layernorm",
"transformer.layers.17.input_layernorm",
"transformer.layers.28.post_layernorm",
"transformer.layers.16.post_layernorm",
"transformer.layers.6.post_layernorm",
"transformer.layers.14.post_layernorm",
"transformer.layers.24.input_layernorm",
"transformer.layers.10.input_layernorm",
"transformer.layers.2.post_layernorm",
"transformer.layers.27.input_layernorm",
"transformer.layers.14.input_layernorm"
]
},
"use_parallel_embedding": true,
"embedding_sharding_dim": 0,
"head_size": 128,
"qk_layernorm": false,
"rotary_embedding_dim": 128,
"producer": {
"name": "modelopt",
"version": "0.19.0"
},
"share_embedding_table": false,
"bias": false,
"rotary_pct": 1.0,
"rank": 0,
"decoder": "llama",
"rmsnorm": true,
"lm_head_bias": false,
"model_type": "llama"
},
"build_config": {
"max_input_len": 1024,
"max_seq_len": 131072,
"opt_batch_size": 8,
"max_batch_size": 2048,
"max_beam_width": 1,
"max_num_tokens": 8192,
"opt_num_tokens": 2048,
"max_prompt_embedding_table_size": 0,
"kv_cache_type": "PAGED",
"gather_context_logits": false,
"gather_generation_logits": false,
"strongly_typed": true,
"force_num_profiles": null,
"profiling_verbosity": "layer_names_only",
"enable_debug_output": false,
"max_draft_len": 0,
"speculative_decoding_mode": 1,
"use_refit": false,
"input_timing_cache": null,
"output_timing_cache": "model.cache",
"lora_config": {
"lora_dir": [],
"lora_ckpt_source": "hf",
"max_lora_rank": 64,
"lora_target_modules": [],
"trtllm_modules_to_hf_modules": {}
},
"auto_parallel_config": {
"world_size": 1,
"gpus_per_node": 8,
"cluster_key": "L40S",
"cluster_info": null,
"sharding_cost_model": "alpha_beta",
"comm_cost_model": "alpha_beta",
"enable_pipeline_parallelism": false,
"enable_shard_unbalanced_shape": false,
"enable_shard_dynamic_shape": false,
"enable_reduce_scatter": true,
"builder_flags": null,
"debug_mode": false,
"infer_shape": true,
"validation_mode": false,
"same_buffer_io": {
"past_key_value_(\\d+)": "present_key_value_\\1"
},
"same_spec_io": {},
"sharded_io_allowlist": [
"past_key_value_\\d+",
"present_key_value_\\d*"
],
"fill_weights": false,
"parallel_config_cache": null,
"profile_cache": null,
"dump_path": null,
"debug_outputs": []
},
"weight_sparsity": false,
"weight_streaming": false,
"plugin_config": {
"dtype": "bfloat16",
"bert_attention_plugin": "auto",
"gpt_attention_plugin": "bfloat16",
"gemm_plugin": "bfloat16",
"gemm_swiglu_plugin": null,
"fp8_rowwise_gemm_plugin": null,
"qserve_gemm_plugin": null,
"identity_plugin": null,
"nccl_plugin": null,
"lora_plugin": null,
"weight_only_groupwise_quant_matmul_plugin": "bfloat16",
"weight_only_quant_matmul_plugin": null,
"smooth_quant_plugins": true,
"smooth_quant_gemm_plugin": null,
"layernorm_quantization_plugin": null,
"rmsnorm_quantization_plugin": null,
"quantize_per_token_plugin": false,
"quantize_tensor_plugin": false,
"moe_plugin": "auto",
"mamba_conv1d_plugin": "auto",
"low_latency_gemm_plugin": null,
"low_latency_gemm_swiglu_plugin": null,
"context_fmha": true,
"bert_context_fmha_fp32_acc": false,
"paged_kv_cache": true,
"remove_input_padding": true,
"reduce_fusion": false,
"user_buffer": false,
"tokens_per_block": 64,
"use_paged_context_fmha": false,
"use_fp8_context_fmha": false,
"multiple_profiles": false,
"paged_state": false,
"streamingllm": false,
"manage_weights": false,
"use_fused_mlp": true,
"pp_reduce_scatter": false
},
"use_strip_plan": false,
"max_encoder_input_len": 1024,
"use_fused_mlp": true,
"monitor_memory": false,
"use_mrope": false
}
- execute the following commands to populate the config.pbtxt files:
FILL_TEMPLATE_SCRIPT=/tensorrtllm_backend/tools/fill_template.py
MODEL_FOLDER=/repo/llama3
ENGINE_PATH=/repo/llama3/tensorrt_llm/1
TOKENIZER_DIR=/models/Meta-Llama-3.1-8B-Instruct
TRITON_MAX_BATCH_SIZE=1
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},preprocessing_instance_count:1
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},postprocessing_instance_count:1
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/ensemble/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},logits_datatype:TYPE_FP32
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:256,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_BF16,logits_datatype:TYPE_FP32,enable_kv_cache_reuse:true,enable_chunked_context:true
- Start Triton Server:
tritonserver --model-repository /repo/llama3/
Expected behavior
start the triton server using TensorRT-LLM
actual behavior
Upon starting the Triton server, the following error occurs:
I0114 09:00:18.017712 880 pinned_memory_manager.cc:277] "Pinned memory pool is created at '0x755bdc000000' with size 268435456"
I0114 09:00:18.029910 880 cuda_memory_manager.cc:107] "CUDA memory pool is created on device 0 with size 67108864"
I0114 09:00:18.029919 880 cuda_memory_manager.cc:107] "CUDA memory pool is created on device 1 with size 67108864"
I0114 09:00:18.200807 880 model_lifecycle.cc:473] "loading: postprocessing:1"
I0114 09:00:18.200845 880 model_lifecycle.cc:473] "loading: preprocessing:1"
I0114 09:00:18.200881 880 model_lifecycle.cc:473] "loading: tensorrt_llm:1"
I0114 09:00:18.200904 880 model_lifecycle.cc:473] "loading: tensorrt_llm_bls:1"
I0114 09:00:18.378243 880 libtensorrtllm.cc:55] "TRITONBACKEND_Initialize: tensorrtllm"
I0114 09:00:18.378292 880 libtensorrtllm.cc:62] "Triton TRITONBACKEND API version: 1.19"
I0114 09:00:18.378299 880 libtensorrtllm.cc:66] "'tensorrtllm' TRITONBACKEND API version: 1.19"
I0114 09:00:18.378306 880 libtensorrtllm.cc:86] "backend configuration:\n{\"cmdline\":{\"auto-complete-config\":\"true\",\"backend-directory\":\"/opt/tritonserver/backends\",\"min-compute-capability\":\"6.000000\",\"default-max-batch-size\":\"4\"}}"
[TensorRT-LLM][WARNING] gpu_device_ids is not specified, will be automatically set
[TensorRT-LLM][WARNING] participant_ids is not specified, will be automatically set
I0114 09:00:18.402251 880 libtensorrtllm.cc:114] "TRITONBACKEND_ModelInitialize: tensorrt_llm (version 1)"
[TensorRT-LLM][WARNING] iter_stats_max_iterations is not specified, will use default value of 1000
[TensorRT-LLM][WARNING] request_stats_max_iterations is not specified, will use default value of 0
[TensorRT-LLM][WARNING] normalize_log_probs is not specified, will be set to true
[TensorRT-LLM][WARNING] cross_kv_cache_fraction is not specified, error if it's encoder-decoder model, otherwise ok
[TensorRT-LLM][WARNING] kv_cache_host_memory_bytes not set, defaulting to 0
[TensorRT-LLM][WARNING] kv_cache_onboard_blocks not set, defaulting to true
[TensorRT-LLM][WARNING] sink_token_length is not specified, will use default value
[TensorRT-LLM][WARNING] enable_chunked_context is set to true, will use context chunking (requires building the model with use_paged_context_fmha).
[TensorRT-LLM][WARNING] batch_scheduler_policy parameter was not found or is invalid (must be max_utilization or guaranteed_no_evict)
[TensorRT-LLM][WARNING] lora_cache_max_adapter_size not set, defaulting to 64
[TensorRT-LLM][WARNING] lora_cache_optimal_adapter_size not set, defaulting to 8
[TensorRT-LLM][WARNING] lora_cache_gpu_memory_fraction not set, defaulting to 0.05
[TensorRT-LLM][WARNING] lora_cache_host_memory_bytes not set, defaulting to 1GB
[TensorRT-LLM][WARNING] multi_block_mode is not specified, will be set to true
[TensorRT-LLM][WARNING] enable_context_fmha_fp32_acc is not specified, will be set to false
[TensorRT-LLM][WARNING] cuda_graph_mode is not specified, will be set to false
[TensorRT-LLM][WARNING] cuda_graph_cache_size is not specified, will be set to 0
[TensorRT-LLM][INFO] speculative_decoding_fast_logits is not specified, will be set to false
[TensorRT-LLM][WARNING] decoding_mode parameter is invalid or not specified(must be one of the {top_k, top_p, top_k_top_p, beam_search, medusa, redrafter, lookahead, eagle}).Using default: top_k_top_p if max_beam_width == 1, beam_search otherwise
[TensorRT-LLM][WARNING] gpu_weights_percent parameter is not specified, will use default value of 1.0
[TensorRT-LLM][INFO] recv_poll_period_ms is not set, will use busy loop
[TensorRT-LLM][WARNING] encoder_model_path is not specified, will be left empty
[TensorRT-LLM][INFO] Engine version 0.17.0.dev2024121700 found in the config file, assuming engine(s) built by new builder API.
[TensorRT-LLM][INFO] Initializing MPI with thread mode 3
[TensorRT-LLM][INFO] Initialized MPI
[TensorRT-LLM][INFO] Refreshed the MPI local session
[TensorRT-LLM][INFO] MPI size: 1, MPI local size: 1, rank: 0
[TensorRT-LLM][WARNING] Chunked context is not supported for this configuration and will be disabled. Related configs: RNNBased: 0, KVCacheEnabled: 1, PagedContextFMHA: 0
[TensorRT-LLM][INFO] Rank 0 is using GPU 0
[TensorRT-LLM][WARNING] Fix optionalParams : KV cache reuse disabled because model was not built with paged context FMHA support
[TensorRT-LLM][INFO] TRTGptModel maxNumSequences: 2048
[TensorRT-LLM][INFO] TRTGptModel maxBatchSize: 2048
[TensorRT-LLM][INFO] TRTGptModel maxBeamWidth: 1
[TensorRT-LLM][INFO] TRTGptModel maxSequenceLen: 131072
[TensorRT-LLM][INFO] TRTGptModel maxDraftLen: 0
[TensorRT-LLM][INFO] TRTGptModel mMaxAttentionWindowSize: (256) * 32
[TensorRT-LLM][INFO] TRTGptModel enableTrtOverlap: 0
[TensorRT-LLM][INFO] TRTGptModel normalizeLogProbs: 1
[TensorRT-LLM][INFO] TRTGptModel maxNumTokens: 8192
[TensorRT-LLM][INFO] TRTGptModel maxInputLen: 8192 = min(maxSequenceLen - 1, maxNumTokens) since context FMHA and usePackedInput are enabled
[TensorRT-LLM][INFO] TRTGptModel If model type is encoder, maxInputLen would be reset in trtEncoderModel to maxInputLen: min(maxSequenceLen, maxNumTokens).
[TensorRT-LLM][INFO] Capacity Scheduler Policy: GUARANTEED_NO_EVICT
[TensorRT-LLM][INFO] Context Chunking Scheduler Policy: None
I0114 09:00:21.592649 880 python_be.cc:2249] "TRITONBACKEND_ModelInstanceInitialize: tensorrt_llm_bls_0_0 (CPU device 0)"
I0114 09:00:21.818542 880 python_be.cc:2249] "TRITONBACKEND_ModelInstanceInitialize: postprocessing_0_0 (CPU device 0)"
I0114 09:00:22.615258 880 python_be.cc:2249] "TRITONBACKEND_ModelInstanceInitialize: preprocessing_0_0 (CPU device 0)"
I0114 09:00:23.110505 880 model_lifecycle.cc:849] "successfully loaded 'tensorrt_llm_bls'"
[TensorRT-LLM][INFO] Loaded engine size: 5510 MiB
[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is ${skip_special_tokens}). Set it as True by default.
I0114 09:00:24.115872 880 model_lifecycle.cc:849] "successfully loaded 'postprocessing'"
[TensorRT-LLM][ERROR] tensorrt_llm::common::TllmException: [TensorRT-LLM][ERROR] Assertion failed: sizeof(T) <= remaining_buffer_size (/workspace/tensorrt_llm/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/serializationUtils.h:32)
1 0x755b190d47df tensorrt_llm::common::throwRuntimeError(char const*, int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 95
2 0x755b192eb1c2 tensorrt_llm::kernels::jit::CubinObj::CubinObj(void const*, unsigned long) + 274
3 0x755b193017d4 tensorrt_llm::kernels::jit::CubinObjRegistryTemplate<tensorrt_llm::kernels::XQAKernelFullHashKey, tensorrt_llm::kernels::XQAKernelFullHasher>::CubinObjRegistryTemplate(void const*, unsigned long) + 292
4 0x755b19301132 tensorrt_llm::kernels::DecoderXQARunner::Resource::Resource(void const*, unsigned long) + 50
5 0x755b0cc5e149 tensorrt_llm::plugins::GPTAttentionPluginCommon::GPTAttentionPluginCommon(void const*, unsigned long) + 1193
6 0x755b0cc95232 tensorrt_llm::plugins::GPTAttentionPlugin::GPTAttentionPlugin(void const*, unsigned long) + 18
7 0x755b0cc952b2 tensorrt_llm::plugins::GPTAttentionPluginCreator::deserializePlugin(char const*, void const*, unsigned long) + 50
8 0x755ad6b53b5b /usr/local/tensorrt/lib/libnvinfer.so.10(+0x11deb5b) [0x755ad6b53b5b]
9 0x755ad6b5045e /usr/local/tensorrt/lib/libnvinfer.so.10(+0x11db45e) [0x755ad6b5045e]
10 0x755ad6a832b7 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x110e2b7) [0x755ad6a832b7]
11 0x755ad6a81e6a /usr/local/tensorrt/lib/libnvinfer.so.10(+0x110ce6a) [0x755ad6a81e6a]
12 0x755ad6a99a77 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x1124a77) [0x755ad6a99a77]
13 0x755ad6a9d5b6 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x11285b6) [0x755ad6a9d5b6]
14 0x755ad6a9db06 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x1128b06) [0x755ad6a9db06]
15 0x755ad6ad4fc7 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x115ffc7) [0x755ad6ad4fc7]
16 0x755ad6ad5bd8 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x1160bd8) [0x755ad6ad5bd8]
17 0x755ad6ad5cdb /usr/local/tensorrt/lib/libnvinfer.so.10(+0x1160cdb) [0x755ad6ad5cdb]
18 0x755b1b12f275 tensorrt_llm::runtime::TllmRuntime::TllmRuntime(tensorrt_llm::runtime::RawEngine const&, nvinfer1::ILogger*, float, bool) + 1413
19 0x755b1b58d428 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptr<nvinfer1::ILogger>, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, tensorrt_llm::runtime::RawEngine const&, bool, tensorrt_llm::batch_manager::TrtGptModelOptionalParams const&) + 1304
20 0x755b1b51151e tensorrt_llm::batch_manager::TrtGptModelFactory::create(tensorrt_llm::runtime::RawEngine const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, tensorrt_llm::batch_manager::TrtGptModelType, tensorrt_llm::batch_manager::TrtGptModelOptionalParams const&) + 526
21 0x755b1b628029 tensorrt_llm::executor::Executor::Impl::createModel(tensorrt_llm::runtime::RawEngine const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, tensorrt_llm::executor::ExecutorConfig const&) + 185
22 0x755b1b6286bd tensorrt_llm::executor::Executor::Impl::loadModel(std::optional<std::filesystem::__cxx11::path> const&, std::optional<std::basic_string_view<unsigned char, std::char_traits<unsigned char> > > const&, tensorrt_llm::runtime::GptJsonConfig const&, tensorrt_llm::executor::ExecutorConfig const&, bool, std::optional<std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorrt_llm::executor::Tensor, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, tensorrt_llm::executor::Tensor> > > > const&) + 1229
23 0x755b1b62990a tensorrt_llm::executor::Executor::Impl::Impl(std::filesystem::__cxx11::path const&, std::optional<std::filesystem::__cxx11::path> const&, tensorrt_llm::executor::ModelType, tensorrt_llm::executor::ExecutorConfig const&) + 2474
24 0x755b1b60f757 tensorrt_llm::executor::Executor::Executor(std::filesystem::__cxx11::path const&, tensorrt_llm::executor::ModelType, tensorrt_llm::executor::ExecutorConfig const&) + 87
25 0x755c3a2af38e /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm.so(+0x3238e) [0x755c3a2af38e]
26 0x755c3a2abc39 triton::backend::inflight_batcher_llm::ModelInstanceState::ModelInstanceState(triton::backend::inflight_batcher_llm::ModelState*, TRITONBACKEND_ModelInstance*) + 2185
27 0x755c3a2ac182 triton::backend::inflight_batcher_llm::ModelInstanceState::Create(triton::backend::inflight_batcher_llm::ModelState*, TRITONBACKEND_ModelInstance*, triton::backend::inflight_batcher_llm::ModelInstanceState**) + 66
28 0x755c3a299319 TRITONBACKEND_ModelInstanceInitialize + 153
29 0x755c43dd8619 /opt/tritonserver/bin/../lib/libtritonserver.so(+0x1a1619) [0x755c43dd8619]
30 0x755c43dd90a2 /opt/tritonserver/bin/../lib/libtritonserver.so(+0x1a20a2) [0x755c43dd90a2]
31 0x755c43dbecc3 /opt/tritonserver/bin/../lib/libtritonserver.so(+0x187cc3) [0x755c43dbecc3]
32 0x755c43dbf074 /opt/tritonserver/bin/../lib/libtritonserver.so(+0x188074) [0x755c43dbf074]
33 0x755c43dc865d /opt/tritonserver/bin/../lib/libtritonserver.so(+0x19165d) [0x755c43dc865d]
34 0x755c45578ec3 /usr/lib/x86_64-linux-gnu/libc.so.6(+0xa1ec3) [0x755c45578ec3]
35 0x755c43db5ee2 /opt/tritonserver/bin/../lib/libtritonserver.so(+0x17eee2) [0x755c43db5ee2]
36 0x755c43dc3dac /opt/tritonserver/bin/../lib/libtritonserver.so(+0x18cdac) [0x755c43dc3dac]
37 0x755c43dc7de2 /opt/tritonserver/bin/../lib/libtritonserver.so(+0x190de2) [0x755c43dc7de2]
38 0x755c43ec7ca1 /opt/tritonserver/bin/../lib/libtritonserver.so(+0x290ca1) [0x755c43ec7ca1]
39 0x755c43ecaffc /opt/tritonserver/bin/../lib/libtritonserver.so(+0x293ffc) [0x755c43ecaffc]
40 0x755c440276f5 /opt/tritonserver/bin/../lib/libtritonserver.so(+0x3f06f5) [0x755c440276f5]
41 0x755c45af5db4 /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xecdb4) [0x755c45af5db4]
42 0x755c45573a94 /usr/lib/x86_64-linux-gnu/libc.so.6(+0x9ca94) [0x755c45573a94]
43 0x755c45600a34 __clone + 68
additional notes
N/A
Metadata
Metadata
Assignees
Labels
bug: Something isn't working