
Assertion failed: sizeof(T) <= remaining_buffer_size #679

Description

@gawain000000

System Info

System Specifications:

  • CPU Architecture: x86_64
  • CPU Memory: 1024GB
  • GPU Model: Nvidia L40S
  • Docker Image: nvcr.io/nvidia/tritonserver
  • TensorRT-LLM Version: 0.16.0

Configuration Details:

Model Information:

  • Model Name: /engines/llama3.1-8B
  • Model Directory: /models/Meta-Llama-3.1-8B-Instruct
  • Engine Directory: /repo/llama3/tensorrt_llm/1

Engine Configuration: see the engine config JSON under Reproduction below.

How can the assertion failure in the title be fixed?

Who can help?

No response

Information

  • The official example scripts
  • My own modified scripts

Tasks

  • An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
  • My own task or dataset (give details below)

Reproduction

  1. Quantize the llama3.1-8B model (a verification sketch follows the command):
python ../quantization/quantize.py --model_dir <model_dir> \
  --output_dir <output_dir> \
  --dtype bfloat16 \
  --qformat int4_awq \
  --awq_block_size 128 \
  --batch_size 12
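
As a quick sanity check (a sketch; <output_dir> is the directory passed above, and the 'quantization' key is assumed to be present in the exported config.json, mirroring the pretrained_config shown further below):

# Print the quantization settings recorded in the exported checkpoint
python3 -c "import json; print(json.load(open('<output_dir>/config.json'))['quantization'])"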
  2. Build the engine from the quantized checkpoint (see the note after the command):
trtllm-build \
  --checkpoint_dir <checkpoint_dir> \
  --gpt_attention_plugin bfloat16 \
  --gemm_plugin bfloat16 \
  --output_dir <output_dir>
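
Note: the server log under "actual behavior" warns that chunked context and KV-cache reuse require an engine built with paged context FMHA, which the command above does not enable. A hedged variant (flag spelling as in recent trtllm-build releases; verify against the installed version):

# Rebuild with paged context FMHA so enable_chunked_context / enable_kv_cache_reuse can take effect
trtllm-build \
  --checkpoint_dir <checkpoint_dir> \
  --gpt_attention_plugin bfloat16 \
  --gemm_plugin bfloat16 \
  --use_paged_context_fmha enable \
  --output_dir <output_dir>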

The resulting engine config:

{
    "version": "0.17.0.dev2024121700",
    "pretrained_config": {
        "mlp_bias": false,
        "attn_bias": false,
        "rotary_base": 500000.0,
        "rotary_scaling": {
            "factor": 8.0,
            "high_freq_factor": 4.0,
            "low_freq_factor": 1.0,
            "original_max_position_embeddings": 8192,
            "rope_type": "llama3"
        },
        "residual_mlp": false,
        "disable_weight_only_quant_plugin": false,
        "moe": {
            "num_experts": 0,
            "shared_expert_intermediate_size": 0,
            "top_k": 0,
            "normalization_mode": 1,
            "sparse_mixer_epsilon": 0.01,
            "tp_mode": 0,
            "device_limited_n_group": 0,
            "device_limited_topk_group": 0,
            "device_limited_routed_scaling_factor": 1.0
        },
        "remove_duplicated_kv_heads": false,
        "fc_after_embed": false,
        "use_input_layernorm_in_first_layer": true,
        "use_last_layernorm": true,
        "layer_idx_offset": 0,
        "architecture": "LlamaForCausalLM",
        "dtype": "bfloat16",
        "vocab_size": 128256,
        "hidden_size": 4096,
        "num_hidden_layers": 32,
        "num_attention_heads": 32,
        "hidden_act": "silu",
        "logits_dtype": "float16",
        "norm_epsilon": 1e-05,
        "runtime_defaults": null,
        "position_embedding_type": "rope_gpt_neox",
        "num_key_value_heads": 8,
        "intermediate_size": 14336,
        "max_position_embeddings": 131072,
        "mapping": {
            "world_size": 1,
            "gpus_per_node": 8,
            "cp_size": 1,
            "tp_size": 1,
            "pp_size": 1,
            "moe_tp_size": 1,
            "moe_ep_size": 1
        },
        "quantization": {
            "quant_algo": "W4A16_AWQ",
            "kv_cache_quant_algo": null,
            "group_size": 128,
            "smoothquant_val": 0.5,
            "clamp_val": null,
            "use_meta_recipe": false,
            "has_zero_point": false,
            "pre_quant_scale": true,
            "exclude_modules": [
                "transformer.layers.13.input_layernorm",
                "transformer.layers.8.post_layernorm",
                "transformer.layers.0.input_layernorm",
                "transformer.layers.15.post_layernorm",
                "transformer.layers.19.post_layernorm",
                "transformer.layers.25.post_layernorm",
                "transformer.layers.9.post_layernorm",
                "transformer.layers.11.input_layernorm",
                "transformer.layers.21.input_layernorm",
                "transformer.layers.4.input_layernorm",
                "transformer.layers.20.input_layernorm",
                "transformer.layers.9.input_layernorm",
                "transformer.layers.26.input_layernorm",
                "transformer.layers.23.input_layernorm",
                "transformer.layers.31.input_layernorm",
                "transformer.layers.11.post_layernorm",
                "transformer.layers.25.input_layernorm",
                "transformer.layers.8.input_layernorm",
                "transformer.layers.5.post_layernorm",
                "transformer.layers.31.post_layernorm",
                "transformer.layers.30.input_layernorm",
                "transformer.layers.3.post_layernorm",
                "transformer.vocab_embedding",
                "transformer.layers.2.input_layernorm",
                "transformer.layers.29.post_layernorm",
                "transformer.layers.19.input_layernorm",
                "transformer.layers.27.post_layernorm",
                "transformer.layers.12.post_layernorm",
                "transformer.ln_f",
                "transformer.layers.1.input_layernorm",
                "transformer.layers.6.input_layernorm",
                "transformer.layers.5.input_layernorm",
                "transformer.layers.0.post_layernorm",
                "lm_head",
                "transformer.layers.26.post_layernorm",
                "transformer.layers.23.post_layernorm",
                "transformer.layers.17.post_layernorm",
                "transformer.layers.21.post_layernorm",
                "transformer.layers.7.input_layernorm",
                "transformer.layers.22.post_layernorm",
                "transformer.layers.10.post_layernorm",
                "transformer.layers.12.input_layernorm",
                "transformer.layers.18.input_layernorm",
                "transformer.layers.7.post_layernorm",
                "transformer.layers.30.post_layernorm",
                "transformer.layers.24.post_layernorm",
                "transformer.layers.20.post_layernorm",
                "transformer.layers.22.input_layernorm",
                "transformer.layers.15.input_layernorm",
                "transformer.layers.3.input_layernorm",
                "transformer.layers.4.post_layernorm",
                "transformer.layers.13.post_layernorm",
                "transformer.layers.29.input_layernorm",
                "transformer.layers.18.post_layernorm",
                "transformer.layers.28.input_layernorm",
                "transformer.layers.16.input_layernorm",
                "transformer.layers.1.post_layernorm",
                "transformer.layers.17.input_layernorm",
                "transformer.layers.28.post_layernorm",
                "transformer.layers.16.post_layernorm",
                "transformer.layers.6.post_layernorm",
                "transformer.layers.14.post_layernorm",
                "transformer.layers.24.input_layernorm",
                "transformer.layers.10.input_layernorm",
                "transformer.layers.2.post_layernorm",
                "transformer.layers.27.input_layernorm",
                "transformer.layers.14.input_layernorm"
            ]
        },
        "use_parallel_embedding": true,
        "embedding_sharding_dim": 0,
        "head_size": 128,
        "qk_layernorm": false,
        "rotary_embedding_dim": 128,
        "producer": {
            "name": "modelopt",
            "version": "0.19.0"
        },
        "share_embedding_table": false,
        "bias": false,
        "rotary_pct": 1.0,
        "rank": 0,
        "decoder": "llama",
        "rmsnorm": true,
        "lm_head_bias": false,
        "model_type": "llama"
    },
    "build_config": {
        "max_input_len": 1024,
        "max_seq_len": 131072,
        "opt_batch_size": 8,
        "max_batch_size": 2048,
        "max_beam_width": 1,
        "max_num_tokens": 8192,
        "opt_num_tokens": 2048,
        "max_prompt_embedding_table_size": 0,
        "kv_cache_type": "PAGED",
        "gather_context_logits": false,
        "gather_generation_logits": false,
        "strongly_typed": true,
        "force_num_profiles": null,
        "profiling_verbosity": "layer_names_only",
        "enable_debug_output": false,
        "max_draft_len": 0,
        "speculative_decoding_mode": 1,
        "use_refit": false,
        "input_timing_cache": null,
        "output_timing_cache": "model.cache",
        "lora_config": {
            "lora_dir": [],
            "lora_ckpt_source": "hf",
            "max_lora_rank": 64,
            "lora_target_modules": [],
            "trtllm_modules_to_hf_modules": {}
        },
        "auto_parallel_config": {
            "world_size": 1,
            "gpus_per_node": 8,
            "cluster_key": "L40S",
            "cluster_info": null,
            "sharding_cost_model": "alpha_beta",
            "comm_cost_model": "alpha_beta",
            "enable_pipeline_parallelism": false,
            "enable_shard_unbalanced_shape": false,
            "enable_shard_dynamic_shape": false,
            "enable_reduce_scatter": true,
            "builder_flags": null,
            "debug_mode": false,
            "infer_shape": true,
            "validation_mode": false,
            "same_buffer_io": {
                "past_key_value_(\\d+)": "present_key_value_\\1"
            },
            "same_spec_io": {},
            "sharded_io_allowlist": [
                "past_key_value_\\d+",
                "present_key_value_\\d*"
            ],
            "fill_weights": false,
            "parallel_config_cache": null,
            "profile_cache": null,
            "dump_path": null,
            "debug_outputs": []
        },
        "weight_sparsity": false,
        "weight_streaming": false,
        "plugin_config": {
            "dtype": "bfloat16",
            "bert_attention_plugin": "auto",
            "gpt_attention_plugin": "bfloat16",
            "gemm_plugin": "bfloat16",
            "gemm_swiglu_plugin": null,
            "fp8_rowwise_gemm_plugin": null,
            "qserve_gemm_plugin": null,
            "identity_plugin": null,
            "nccl_plugin": null,
            "lora_plugin": null,
            "weight_only_groupwise_quant_matmul_plugin": "bfloat16",
            "weight_only_quant_matmul_plugin": null,
            "smooth_quant_plugins": true,
            "smooth_quant_gemm_plugin": null,
            "layernorm_quantization_plugin": null,
            "rmsnorm_quantization_plugin": null,
            "quantize_per_token_plugin": false,
            "quantize_tensor_plugin": false,
            "moe_plugin": "auto",
            "mamba_conv1d_plugin": "auto",
            "low_latency_gemm_plugin": null,
            "low_latency_gemm_swiglu_plugin": null,
            "context_fmha": true,
            "bert_context_fmha_fp32_acc": false,
            "paged_kv_cache": true,
            "remove_input_padding": true,
            "reduce_fusion": false,
            "user_buffer": false,
            "tokens_per_block": 64,
            "use_paged_context_fmha": false,
            "use_fp8_context_fmha": false,
            "multiple_profiles": false,
            "paged_state": false,
            "streamingllm": false,
            "manage_weights": false,
            "use_fused_mlp": true,
            "pp_reduce_scatter": false
        },
        "use_strip_plan": false,
        "max_encoder_input_len": 1024,
        "use_fused_mlp": true,
        "monitor_memory": false,
        "use_mrope": false
    }
}
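
Note the version fields: the engine records builder version 0.17.0.dev2024121700, while the System Info above lists TensorRT-LLM 0.16.0. Since a builder/runtime mismatch can surface as plugin deserialization assertions like the one reported here, a quick consistency check may help (a sketch; the config.json path follows the engine directory given above):

# Compare the installed TensorRT-LLM version with the version recorded in the engine config
python3 -c "import tensorrt_llm; print(tensorrt_llm.__version__)"
grep '"version"' /repo/llama3/tensorrt_llm/1/config.json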
  3. Execute the following commands to populate the config.pbtxt files (a placeholder check follows):
FILL_TEMPLATE_SCRIPT=/tensorrtllm_backend/tools/fill_template.py
MODEL_FOLDER=/repo/llama3
ENGINE_PATH=/repo/llama3/tensorrt_llm/1
TOKENIZER_DIR=/models/Meta-Llama-3.1-8B-Instruct
TRITON_MAX_BATCH_SIZE=1

python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},preprocessing_instance_count:1
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},postprocessing_instance_count:1
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/ensemble/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},logits_datatype:TYPE_FP32
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:256,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_BF16,logits_datatype:TYPE_FP32,enable_kv_cache_reuse:true,enable_chunked_context:true
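
Note that the last command passes enable_kv_cache_reuse twice (False, then true). Separately, the server log below warns that ${skip_special_tokens} was left at its placeholder value, so a scan for unfilled template variables may help (a sketch, reusing the variable defined above):

# List any ${...} placeholders remaining in the populated config.pbtxt files
grep -rn '\${' ${MODEL_FOLDER}/*/config.pbtxt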
  4. Start the Triton server (a readiness check follows the command):
tritonserver --model-repository /repo/llama3/
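
Once the server comes up (it does not here), readiness can be confirmed via Triton's standard health endpoint (default HTTP port 8000 assumed):

# Returns HTTP 200 once all models are loaded
curl -sf localhost:8000/v2/health/ready && echo ready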

Expected behavior

The Triton server starts successfully with the TensorRT-LLM backend.

Actual behavior

Upon starting the Triton server, the following error occurs:

I0114 09:00:18.017712 880 pinned_memory_manager.cc:277] "Pinned memory pool is created at '0x755bdc000000' with size 268435456"
I0114 09:00:18.029910 880 cuda_memory_manager.cc:107] "CUDA memory pool is created on device 0 with size 67108864"
I0114 09:00:18.029919 880 cuda_memory_manager.cc:107] "CUDA memory pool is created on device 1 with size 67108864"
I0114 09:00:18.200807 880 model_lifecycle.cc:473] "loading: postprocessing:1"
I0114 09:00:18.200845 880 model_lifecycle.cc:473] "loading: preprocessing:1"
I0114 09:00:18.200881 880 model_lifecycle.cc:473] "loading: tensorrt_llm:1"
I0114 09:00:18.200904 880 model_lifecycle.cc:473] "loading: tensorrt_llm_bls:1"
I0114 09:00:18.378243 880 libtensorrtllm.cc:55] "TRITONBACKEND_Initialize: tensorrtllm"
I0114 09:00:18.378292 880 libtensorrtllm.cc:62] "Triton TRITONBACKEND API version: 1.19"
I0114 09:00:18.378299 880 libtensorrtllm.cc:66] "'tensorrtllm' TRITONBACKEND API version: 1.19"
I0114 09:00:18.378306 880 libtensorrtllm.cc:86] "backend configuration:\n{\"cmdline\":{\"auto-complete-config\":\"true\",\"backend-directory\":\"/opt/tritonserver/backends\",\"min-compute-capability\":\"6.000000\",\"default-max-batch-size\":\"4\"}}"
[TensorRT-LLM][WARNING] gpu_device_ids is not specified, will be automatically set
[TensorRT-LLM][WARNING] participant_ids is not specified, will be automatically set
I0114 09:00:18.402251 880 libtensorrtllm.cc:114] "TRITONBACKEND_ModelInitialize: tensorrt_llm (version 1)"
[TensorRT-LLM][WARNING] iter_stats_max_iterations is not specified, will use default value of 1000
[TensorRT-LLM][WARNING] request_stats_max_iterations is not specified, will use default value of 0
[TensorRT-LLM][WARNING] normalize_log_probs is not specified, will be set to true
[TensorRT-LLM][WARNING] cross_kv_cache_fraction is not specified, error if it's encoder-decoder model, otherwise ok
[TensorRT-LLM][WARNING] kv_cache_host_memory_bytes not set, defaulting to 0
[TensorRT-LLM][WARNING] kv_cache_onboard_blocks not set, defaulting to true
[TensorRT-LLM][WARNING] sink_token_length is not specified, will use default value
[TensorRT-LLM][WARNING] enable_chunked_context is set to true, will use context chunking (requires building the model with use_paged_context_fmha).
[TensorRT-LLM][WARNING] batch_scheduler_policy parameter was not found or is invalid (must be max_utilization or guaranteed_no_evict)
[TensorRT-LLM][WARNING] lora_cache_max_adapter_size not set, defaulting to 64
[TensorRT-LLM][WARNING] lora_cache_optimal_adapter_size not set, defaulting to 8
[TensorRT-LLM][WARNING] lora_cache_gpu_memory_fraction not set, defaulting to 0.05
[TensorRT-LLM][WARNING] lora_cache_host_memory_bytes not set, defaulting to 1GB
[TensorRT-LLM][WARNING] multi_block_mode is not specified, will be set to true
[TensorRT-LLM][WARNING] enable_context_fmha_fp32_acc is not specified, will be set to false
[TensorRT-LLM][WARNING] cuda_graph_mode is not specified, will be set to false
[TensorRT-LLM][WARNING] cuda_graph_cache_size is not specified, will be set to 0
[TensorRT-LLM][INFO] speculative_decoding_fast_logits is not specified, will be set to false
[TensorRT-LLM][WARNING] decoding_mode parameter is invalid or not specified(must be one of the {top_k, top_p, top_k_top_p, beam_search, medusa, redrafter, lookahead, eagle}).Using default: top_k_top_p if max_beam_width == 1, beam_search otherwise
[TensorRT-LLM][WARNING] gpu_weights_percent parameter is not specified, will use default value of 1.0
[TensorRT-LLM][INFO] recv_poll_period_ms is not set, will use busy loop
[TensorRT-LLM][WARNING] encoder_model_path is not specified, will be left empty
[TensorRT-LLM][INFO] Engine version 0.17.0.dev2024121700 found in the config file, assuming engine(s) built by new builder API.
[TensorRT-LLM][INFO] Initializing MPI with thread mode 3
[TensorRT-LLM][INFO] Initialized MPI
[TensorRT-LLM][INFO] Refreshed the MPI local session
[TensorRT-LLM][INFO] MPI size: 1, MPI local size: 1, rank: 0
[TensorRT-LLM][WARNING] Chunked context is not supported for this configuration and will be disabled. Related configs: RNNBased: 0, KVCacheEnabled: 1, PagedContextFMHA: 0
[TensorRT-LLM][INFO] Rank 0 is using GPU 0
[TensorRT-LLM][WARNING] Fix optionalParams : KV cache reuse disabled because model was not built with paged context FMHA support
[TensorRT-LLM][INFO] TRTGptModel maxNumSequences: 2048
[TensorRT-LLM][INFO] TRTGptModel maxBatchSize: 2048
[TensorRT-LLM][INFO] TRTGptModel maxBeamWidth: 1
[TensorRT-LLM][INFO] TRTGptModel maxSequenceLen: 131072
[TensorRT-LLM][INFO] TRTGptModel maxDraftLen: 0
[TensorRT-LLM][INFO] TRTGptModel mMaxAttentionWindowSize: (256) * 32
[TensorRT-LLM][INFO] TRTGptModel enableTrtOverlap: 0
[TensorRT-LLM][INFO] TRTGptModel normalizeLogProbs: 1
[TensorRT-LLM][INFO] TRTGptModel maxNumTokens: 8192
[TensorRT-LLM][INFO] TRTGptModel maxInputLen: 8192 = min(maxSequenceLen - 1, maxNumTokens) since context FMHA and usePackedInput are enabled
[TensorRT-LLM][INFO] TRTGptModel If model type is encoder, maxInputLen would be reset in trtEncoderModel to maxInputLen: min(maxSequenceLen, maxNumTokens).
[TensorRT-LLM][INFO] Capacity Scheduler Policy: GUARANTEED_NO_EVICT
[TensorRT-LLM][INFO] Context Chunking Scheduler Policy: None
I0114 09:00:21.592649 880 python_be.cc:2249] "TRITONBACKEND_ModelInstanceInitialize: tensorrt_llm_bls_0_0 (CPU device 0)"
I0114 09:00:21.818542 880 python_be.cc:2249] "TRITONBACKEND_ModelInstanceInitialize: postprocessing_0_0 (CPU device 0)"
I0114 09:00:22.615258 880 python_be.cc:2249] "TRITONBACKEND_ModelInstanceInitialize: preprocessing_0_0 (CPU device 0)"
I0114 09:00:23.110505 880 model_lifecycle.cc:849] "successfully loaded 'tensorrt_llm_bls'"
[TensorRT-LLM][INFO] Loaded engine size: 5510 MiB
[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is ${skip_special_tokens}). Set it as True by default.
I0114 09:00:24.115872 880 model_lifecycle.cc:849] "successfully loaded 'postprocessing'"
[TensorRT-LLM][ERROR] tensorrt_llm::common::TllmException: [TensorRT-LLM][ERROR] Assertion failed: sizeof(T) <= remaining_buffer_size (/workspace/tensorrt_llm/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/serializationUtils.h:32)
1       0x755b190d47df tensorrt_llm::common::throwRuntimeError(char const*, int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 95
2       0x755b192eb1c2 tensorrt_llm::kernels::jit::CubinObj::CubinObj(void const*, unsigned long) + 274
3       0x755b193017d4 tensorrt_llm::kernels::jit::CubinObjRegistryTemplate<tensorrt_llm::kernels::XQAKernelFullHashKey, tensorrt_llm::kernels::XQAKernelFullHasher>::CubinObjRegistryTemplate(void const*, unsigned long) + 292
4       0x755b19301132 tensorrt_llm::kernels::DecoderXQARunner::Resource::Resource(void const*, unsigned long) + 50
5       0x755b0cc5e149 tensorrt_llm::plugins::GPTAttentionPluginCommon::GPTAttentionPluginCommon(void const*, unsigned long) + 1193
6       0x755b0cc95232 tensorrt_llm::plugins::GPTAttentionPlugin::GPTAttentionPlugin(void const*, unsigned long) + 18
7       0x755b0cc952b2 tensorrt_llm::plugins::GPTAttentionPluginCreator::deserializePlugin(char const*, void const*, unsigned long) + 50
8       0x755ad6b53b5b /usr/local/tensorrt/lib/libnvinfer.so.10(+0x11deb5b) [0x755ad6b53b5b]
9       0x755ad6b5045e /usr/local/tensorrt/lib/libnvinfer.so.10(+0x11db45e) [0x755ad6b5045e]
10      0x755ad6a832b7 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x110e2b7) [0x755ad6a832b7]
11      0x755ad6a81e6a /usr/local/tensorrt/lib/libnvinfer.so.10(+0x110ce6a) [0x755ad6a81e6a]
12      0x755ad6a99a77 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x1124a77) [0x755ad6a99a77]
13      0x755ad6a9d5b6 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x11285b6) [0x755ad6a9d5b6]
14      0x755ad6a9db06 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x1128b06) [0x755ad6a9db06]
15      0x755ad6ad4fc7 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x115ffc7) [0x755ad6ad4fc7]
16      0x755ad6ad5bd8 /usr/local/tensorrt/lib/libnvinfer.so.10(+0x1160bd8) [0x755ad6ad5bd8]
17      0x755ad6ad5cdb /usr/local/tensorrt/lib/libnvinfer.so.10(+0x1160cdb) [0x755ad6ad5cdb]
18      0x755b1b12f275 tensorrt_llm::runtime::TllmRuntime::TllmRuntime(tensorrt_llm::runtime::RawEngine const&, nvinfer1::ILogger*, float, bool) + 1413
19      0x755b1b58d428 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptr<nvinfer1::ILogger>, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, tensorrt_llm::runtime::RawEngine const&, bool, tensorrt_llm::batch_manager::TrtGptModelOptionalParams const&) + 1304
20      0x755b1b51151e tensorrt_llm::batch_manager::TrtGptModelFactory::create(tensorrt_llm::runtime::RawEngine const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, tensorrt_llm::batch_manager::TrtGptModelType, tensorrt_llm::batch_manager::TrtGptModelOptionalParams const&) + 526
21      0x755b1b628029 tensorrt_llm::executor::Executor::Impl::createModel(tensorrt_llm::runtime::RawEngine const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, tensorrt_llm::executor::ExecutorConfig const&) + 185
22      0x755b1b6286bd tensorrt_llm::executor::Executor::Impl::loadModel(std::optional<std::filesystem::__cxx11::path> const&, std::optional<std::basic_string_view<unsigned char, std::char_traits<unsigned char> > > const&, tensorrt_llm::runtime::GptJsonConfig const&, tensorrt_llm::executor::ExecutorConfig const&, bool, std::optional<std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorrt_llm::executor::Tensor, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, tensorrt_llm::executor::Tensor> > > > const&) + 1229
23      0x755b1b62990a tensorrt_llm::executor::Executor::Impl::Impl(std::filesystem::__cxx11::path const&, std::optional<std::filesystem::__cxx11::path> const&, tensorrt_llm::executor::ModelType, tensorrt_llm::executor::ExecutorConfig const&) + 2474
24      0x755b1b60f757 tensorrt_llm::executor::Executor::Executor(std::filesystem::__cxx11::path const&, tensorrt_llm::executor::ModelType, tensorrt_llm::executor::ExecutorConfig const&) + 87
25      0x755c3a2af38e /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm.so(+0x3238e) [0x755c3a2af38e]
26      0x755c3a2abc39 triton::backend::inflight_batcher_llm::ModelInstanceState::ModelInstanceState(triton::backend::inflight_batcher_llm::ModelState*, TRITONBACKEND_ModelInstance*) + 2185
27      0x755c3a2ac182 triton::backend::inflight_batcher_llm::ModelInstanceState::Create(triton::backend::inflight_batcher_llm::ModelState*, TRITONBACKEND_ModelInstance*, triton::backend::inflight_batcher_llm::ModelInstanceState**) + 66
28      0x755c3a299319 TRITONBACKEND_ModelInstanceInitialize + 153
29      0x755c43dd8619 /opt/tritonserver/bin/../lib/libtritonserver.so(+0x1a1619) [0x755c43dd8619]
30      0x755c43dd90a2 /opt/tritonserver/bin/../lib/libtritonserver.so(+0x1a20a2) [0x755c43dd90a2]
31      0x755c43dbecc3 /opt/tritonserver/bin/../lib/libtritonserver.so(+0x187cc3) [0x755c43dbecc3]
32      0x755c43dbf074 /opt/tritonserver/bin/../lib/libtritonserver.so(+0x188074) [0x755c43dbf074]
33      0x755c43dc865d /opt/tritonserver/bin/../lib/libtritonserver.so(+0x19165d) [0x755c43dc865d]
34      0x755c45578ec3 /usr/lib/x86_64-linux-gnu/libc.so.6(+0xa1ec3) [0x755c45578ec3]
35      0x755c43db5ee2 /opt/tritonserver/bin/../lib/libtritonserver.so(+0x17eee2) [0x755c43db5ee2]
36      0x755c43dc3dac /opt/tritonserver/bin/../lib/libtritonserver.so(+0x18cdac) [0x755c43dc3dac]
37      0x755c43dc7de2 /opt/tritonserver/bin/../lib/libtritonserver.so(+0x190de2) [0x755c43dc7de2]
38      0x755c43ec7ca1 /opt/tritonserver/bin/../lib/libtritonserver.so(+0x290ca1) [0x755c43ec7ca1]
39      0x755c43ecaffc /opt/tritonserver/bin/../lib/libtritonserver.so(+0x293ffc) [0x755c43ecaffc]
40      0x755c440276f5 /opt/tritonserver/bin/../lib/libtritonserver.so(+0x3f06f5) [0x755c440276f5]
41      0x755c45af5db4 /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xecdb4) [0x755c45af5db4]
42      0x755c45573a94 /usr/lib/x86_64-linux-gnu/libc.so.6(+0x9ca94) [0x755c45573a94]
43      0x755c45600a34 __clone + 68

Additional notes

N/A
