Merged
30 changes: 5 additions & 25 deletions vllm/model_executor/models/apertus.py
@@ -49,7 +49,6 @@
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead,
VocabParallelEmbedding,
)
@@ -346,24 +345,18 @@ def __init__(
config = vllm_config.model_config.hf_config
cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config

self.config = config
self.quant_config = quant_config
lora_vocab = (
(lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
if lora_config
else 0
)
self.vocab_size = config.vocab_size + lora_vocab
self.org_vocab_size = config.vocab_size

self.vocab_size = config.vocab_size

if get_pp_group().is_first_rank or (
config.tie_word_embeddings and get_pp_group().is_last_rank
):
self.embed_tokens = VocabParallelEmbedding(
self.vocab_size,
config.hidden_size,
org_num_embeddings=config.vocab_size,
quant_config=quant_config,
)
else:
@@ -518,9 +511,7 @@ def __init__(
super().__init__()
config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
self.config = config
self.lora_config = lora_config

self.model = self._init_model(
vllm_config=vllm_config,
@@ -529,20 +520,9 @@
)

if get_pp_group().is_last_rank:
self.unpadded_vocab_size = config.vocab_size
if lora_config:
self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
self.lm_head = ParallelLMHead(
self.unpadded_vocab_size,
config.vocab_size,
config.hidden_size,
org_num_embeddings=config.vocab_size,
padding_size=(
DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if not lora_config
else lora_config.lora_vocab_padding_size
),
quant_config=quant_config,
prefix=maybe_prefix(prefix, "lm_head"),
)
@@ -551,7 +531,7 @@ def __init__(

logit_scale = getattr(config, "logit_scale", 1.0)
self.logits_processor = LogitsProcessor(
self.unpadded_vocab_size, config.vocab_size, logit_scale
config.vocab_size, scale=logit_scale
)
else:
self.lm_head = PPMissingLayer()
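For reference, every file in this PR drops the same LoRA vocabulary bookkeeping shown in the removed lines above. Below is a minimal, self-contained sketch of the old arithmetic versus the new behaviour; the dataclass and helper are hypothetical and only mirror the field names used in the diff.

# Hypothetical illustration of the vocab-size arithmetic removed by this PR.
# Field names (lora_extra_vocab_size, max_loras) mirror the removed lines above.
from dataclasses import dataclass
from typing import Optional


@dataclass
class LoRAConfig:
    lora_extra_vocab_size: int = 256
    max_loras: Optional[int] = None


def effective_vocab_size(vocab_size: int, lora_config: Optional[LoRAConfig]) -> int:
    # Old behaviour: reserve extra embedding rows for tokens added by LoRA adapters.
    lora_vocab = (
        lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)
        if lora_config
        else 0
    )
    return vocab_size + lora_vocab


# Old: 32000 + 256 * 4 = 33024 rows; new: always config.vocab_size (32000).
print(effective_vocab_size(32000, LoRAConfig(max_loras=4)))  # 33024
print(effective_vocab_size(32000, None))                     # 32000

After the change, self.vocab_size is simply config.vocab_size, so the extra rows and the matching org_num_embeddings argument disappear.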
10 changes: 2 additions & 8 deletions vllm/model_executor/models/arcee.py
@@ -23,7 +23,6 @@
from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead,
VocabParallelEmbedding,
)
@@ -200,7 +199,6 @@ def __init__(
self.quant_config = quant_config
self.config = config
self.vocab_size = config.vocab_size
self.org_vocab_size = config.vocab_size

# Word embeddings (parallelized if using pipeline parallel)
if get_pp_group().is_first_rank or (
@@ -209,7 +207,6 @@
self.embed_tokens = VocabParallelEmbedding(
self.vocab_size,
config.hidden_size,
org_num_embeddings=config.vocab_size,
quant_config=quant_config,
)
else:
@@ -383,13 +380,10 @@ def __init__(self, *, vllm_config, prefix: str = "") -> None:
if get_pp_group().is_last_rank:
# Determine vocabulary size (including any LoRA extra tokens
# for padded LM head)
self.unpadded_vocab_size = config.vocab_size

self.lm_head = ParallelLMHead(
self.unpadded_vocab_size,
config.vocab_size,
config.hidden_size,
org_num_embeddings=config.vocab_size,
padding_size=DEFAULT_VOCAB_PADDING_SIZE,
quant_config=vllm_config.quant_config,
bias=getattr(config, "lm_head_bias", False),
prefix=f"{prefix}.lm_head",
@@ -399,7 +393,7 @@ def __init__(self, *, vllm_config, prefix: str = "") -> None:
self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
logit_scale = getattr(config, "logit_scale", 1.0)
self.logits_processor = LogitsProcessor(
self.unpadded_vocab_size, config.vocab_size, logit_scale
config.vocab_size, scale=logit_scale
)
else:
# Placeholder for lm_head on non-last ranks
6 changes: 2 additions & 4 deletions vllm/model_executor/models/arctic.py
@@ -490,10 +490,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self.lm_head.weight = self.model.embed_tokens.weight
self.num_experts = config.num_local_experts
self.num_experts_per_tok = config.num_experts_per_tok
self.unpadded_vocab_size = config.vocab_size
self.logits_processor = LogitsProcessor(
self.unpadded_vocab_size, config.vocab_size
)

self.logits_processor = LogitsProcessor(config.vocab_size)
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors
)
8 changes: 2 additions & 6 deletions vllm/model_executor/models/aria.py
@@ -547,18 +547,14 @@ def __init__(
self.pad_token_id = (
self.config.pad_token_id if self.config.pad_token_id is not None else -1
)
self.unpadded_vocab_size = config.text_config.vocab_size
self.lm_head = ParallelLMHead(
self.unpadded_vocab_size,
self.vocab_size,
config.text_config.hidden_size,
org_num_embeddings=self.language_model.org_vocab_size,
quant_config=quant_config,
prefix=maybe_prefix(prefix, "lm_head"),
)
logit_scale = getattr(config, "logit_scale", 1.0)
self.logits_processor = LogitsProcessor(
self.unpadded_vocab_size, self.vocab_size, logit_scale
)
self.logits_processor = LogitsProcessor(self.vocab_size, scale=logit_scale)

def _parse_and_validate_image_input(
self, **kwargs: object
4 changes: 2 additions & 2 deletions vllm/model_executor/models/baichuan.py
@@ -402,9 +402,9 @@ def __init__(
super().__init__()
config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config

self.config = config
self.lora_config = lora_config

self.tp_size = get_tensor_model_parallel_world_size()
self.quant_config = quant_config
self.model = BaiChuanModel(
2 changes: 0 additions & 2 deletions vllm/model_executor/models/bailing_moe.py
@@ -581,10 +581,8 @@ def __init__(
config = vllm_config.model_config.hf_config.get_text_config()
vllm_config.model_config.hf_config = config
quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config

self.config = config
self.lora_config = lora_config
self.quant_config = quant_config
self.max_position_embeddings = config.max_position_embeddings
self.model = BailingMoeModel(
30 changes: 6 additions & 24 deletions vllm/model_executor/models/bamba.py
@@ -30,7 +30,6 @@
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead,
VocabParallelEmbedding,
)
@@ -284,21 +283,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
model_config = vllm_config.model_config
cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config

self.config = config
lora_vocab = (
(lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
if lora_config
else 0
)
self.vocab_size = config.vocab_size + lora_vocab
self.org_vocab_size = config.vocab_size

self.vocab_size = config.vocab_size

self.embed_tokens = VocabParallelEmbedding(
self.vocab_size,
config.hidden_size,
org_num_embeddings=config.vocab_size,
)

def get_layer(prefix: str):
@@ -478,7 +470,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
config = vllm_config.model_config.hf_config
self.vllm_config = vllm_config
self.model_config = vllm_config.model_config
lora_config = vllm_config.lora_config

scheduler_config = vllm_config.scheduler_config
self.quant_config = vllm_config.quant_config

@@ -488,24 +480,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self.model = BambaModel(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
)
self.unpadded_vocab_size = config.vocab_size
if lora_config:
self.unpadded_vocab_size += lora_config.lora_extra_vocab_size

self.lm_head = ParallelLMHead(
self.unpadded_vocab_size,
config.vocab_size,
config.hidden_size,
org_num_embeddings=config.vocab_size,
padding_size=DEFAULT_VOCAB_PADDING_SIZE
# We need bigger padding if using lora for kernel
# compatibility
if not lora_config
else lora_config.lora_vocab_padding_size,
prefix=maybe_prefix(prefix, "lm_head"),
)

self.logits_processor = LogitsProcessor(
self.unpadded_vocab_size, config.vocab_size
)
self.logits_processor = LogitsProcessor(config.vocab_size)

self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors
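The last-rank head construction converges on the same simplified shape across these files. The sketch below illustrates that pattern; it assumes the vLLM classes already imported in the diffs above (ParallelLMHead, LogitsProcessor) plus the maybe_prefix helper used in the new lines, and is illustrative rather than a drop-in replacement for any single model.

# Sketch of the simplified last-rank head setup these diffs converge on.
# Assumes the ParallelLMHead / LogitsProcessor imports shown above and the
# maybe_prefix helper from the models' shared utils module.
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.models.utils import maybe_prefix


def build_lm_head(config, quant_config, prefix: str = ""):
    # No unpadded_vocab_size, org_num_embeddings, or LoRA-aware padding_size:
    # both the head and the logits processor are sized from config.vocab_size.
    lm_head = ParallelLMHead(
        config.vocab_size,
        config.hidden_size,
        quant_config=quant_config,
        prefix=maybe_prefix(prefix, "lm_head"),
    )
    logit_scale = getattr(config, "logit_scale", 1.0)
    logits_processor = LogitsProcessor(config.vocab_size, scale=logit_scale)
    return lm_head, logits_processor

Models that tie word embeddings (arcee, chameleon) still point or tie the head weights at model.embed_tokens exactly as before.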
8 changes: 3 additions & 5 deletions vllm/model_executor/models/chameleon.py
@@ -963,19 +963,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self.model = ChameleonModel(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
)
self.unpadded_vocab_size = config.vocab_size

self.lm_head = ParallelLMHead(
self.unpadded_vocab_size,
config.vocab_size,
config.hidden_size,
prefix=maybe_prefix(prefix, "lm_head"),
)
if config.tie_word_embeddings:
self.lm_head.weight = self.model.embed_tokens.weight

logit_scale = getattr(config, "logit_scale", 1.0)
self.logits_processor = LogitsProcessor(
self.unpadded_vocab_size, config.vocab_size, logit_scale
)
self.logits_processor = LogitsProcessor(config.vocab_size, scale=logit_scale)
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors
)
3 changes: 1 addition & 2 deletions vllm/model_executor/models/chatglm.py
@@ -433,10 +433,9 @@ def __init__(
super().__init__()
config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config

multimodal_config = vllm_config.model_config.multimodal_config
self.config = config
self.lora_config = lora_config
self.multimodal_config = multimodal_config

self.quant_config = quant_config
19 changes: 6 additions & 13 deletions vllm/model_executor/models/commandr.py
@@ -288,17 +288,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
config = vllm_config.model_config.hf_config
cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
self.quant_config = quant_config

self.config = config
lora_vocab = (
(lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
if lora_config
else 0
)
self.vocab_size = config.vocab_size + lora_vocab
self.org_vocab_size = config.vocab_size

self.vocab_size = config.vocab_size

self.embed_tokens = VocabParallelEmbedding(
config.vocab_size, config.hidden_size
)
@@ -424,17 +419,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config

self.config = config
# currently all existing command R models have `tie_word_embeddings`
# enabled
assert config.tie_word_embeddings
self.unpadded_vocab_size = config.vocab_size
if lora_config:
self.unpadded_vocab_size += lora_config.lora_extra_vocab_size

self.quant_config = quant_config
self.logits_processor = LogitsProcessor(
self.unpadded_vocab_size, config.vocab_size, scale=config.logit_scale
config.vocab_size, scale=config.logit_scale
)
self.model = CohereModel(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
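The model-side embedding table gets the same treatment: with no LoRA extra rows to reserve, org_num_embeddings is dropped and the table is sized directly from config.vocab_size. A sketch under the same assumptions as above:

# Sketch of the simplified embedding construction used across these diffs.
# Assumes the VocabParallelEmbedding import shown in the files above.
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding


def build_embed_tokens(config, quant_config=None):
    # org_num_embeddings is omitted here; leaving it unset makes the original
    # vocabulary size equal the embedding count, which is exactly what the
    # removed `org_num_embeddings=config.vocab_size` argument expressed once
    # the LoRA extra-vocab rows were gone.
    return VocabParallelEmbedding(
        config.vocab_size,
        config.hidden_size,
        quant_config=quant_config,
    )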
9 changes: 2 additions & 7 deletions vllm/model_executor/models/dbrx.py
@@ -25,7 +25,6 @@
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE,
ParallelLMHead,
VocabParallelEmbedding,
)
@@ -441,21 +440,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
if config.tie_word_embeddings:
raise ValueError("tie_word_embeddings is not supported for Dbrx models.")
self.quant_config = quant_config
self.unpadded_vocab_size = config.vocab_size

self.transformer = DbrxModel(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "transformer")
)
self.lm_head = ParallelLMHead(
config.vocab_size,
config.d_model,
org_num_embeddings=config.vocab_size,
padding_size=DEFAULT_VOCAB_PADDING_SIZE,
quant_config=quant_config,
prefix=maybe_prefix(prefix, "lm_head"),
)
self.logits_processor = LogitsProcessor(
self.unpadded_vocab_size, config.vocab_size
)
self.logits_processor = LogitsProcessor(config.vocab_size)
self.make_empty_intermediate_tensors = (
self.transformer.make_empty_intermediate_tensors
)