.buildkite/nightly-benchmarks/nightly-descriptions.md (1 addition, 1 deletion)

@@ -8,7 +8,7 @@ This benchmark aims to:

 Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.

-Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)

 ## Setup
vllm/model_executor/layers/quantization/moe_wna16.py (1 addition, 1 deletion)

@@ -190,7 +190,7 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
         group_size = self.quant_config.group_size
         group_size_div_factor = 1

-        # make intermediate_size and hidden_size diviable by group_size
+        # make intermediate_size and hidden_size divisible by group_size
         # we reduce the group size to ensure that
         # and we would repeat the loaded_weight later
         while intermediate_size_per_partition % group_size or \
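The comment this hunk fixes describes a shrink-and-repeat trick: if the quantization group size does not divide both weight dimensions, it is reduced until it does, and the loaded weights are later repeated by the same factor. A minimal sketch of that idea, assuming the (unshown) loop body halves `group_size` each iteration:

```python
def reduce_group_size(intermediate_size_per_partition: int,
                      hidden_size: int,
                      group_size: int) -> tuple[int, int]:
    """Shrink group_size until it divides both dimensions.

    Returns the reduced group size and the division factor, which is
    later used to repeat the loaded weights so tensor shapes line up.
    """
    group_size_div_factor = 1
    while (intermediate_size_per_partition % group_size
           or hidden_size % group_size):
        group_size //= 2           # assumed halving step; not shown in the hunk
        group_size_div_factor *= 2
    return group_size, group_size_div_factor

# Example: reduce_group_size(1408, 2048, 256) -> (128, 2), since 1408 is
# not divisible by 256 but both 1408 and 2048 are divisible by 128.
```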
@@ -19,7 +19,7 @@ class MarlinWorkspace:

     def __init__(self, out_features, min_thread_n, max_parallel):
         assert (out_features % min_thread_n == 0), (
-            "out_features = {} is undivisible by min_thread_n = {}".format(
+            "out_features = {} is indivisible by min_thread_n = {}".format(
                 out_features, min_thread_n))

         max_workspace_size = ((out_features // min_thread_n) * max_parallel)
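The assertion guards the workspace-size computation on the last context line; a quick check with made-up numbers shows why the divisibility requirement matters:

```python
# Hypothetical values, only to illustrate the arithmetic.
out_features, min_thread_n, max_parallel = 4096, 64, 16

assert out_features % min_thread_n == 0        # 4096 = 64 * 64, so this holds
max_workspace_size = (out_features // min_thread_n) * max_parallel
print(max_workspace_size)                      # (4096 // 64) * 16 = 1024
```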
vllm/model_executor/layers/sampler.py (1 addition, 1 deletion)

@@ -649,7 +649,7 @@ def _sample_with_torch(
     else:
         sampled_token_ids_tensor = None

-    # Counterintiutively, having two loops here is actually faster.
+    # Counterintuitively, having two loops here is actually faster.
     # The first loop can run without waiting on GPU<->CPU sync.
    for sampling_type in SamplingType:
         sample_indices = categorized_sample_indices[sampling_type]
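The fixed comment points at a general CUDA scheduling pattern: enqueue all device work in a first loop, and only read results back in a second loop, so the one host-side sync happens after everything is in flight. A standalone sketch of that pattern (not vLLM's actual sampler code):

```python
import torch

def two_loop_sample(logit_chunks: list[torch.Tensor]) -> list[list[int]]:
    # Loop 1: enqueue GPU kernels asynchronously; no .cpu()/.item() calls
    # here, so the host never blocks waiting for the device.
    sampled: list[torch.Tensor] = []
    for logits in logit_chunks:
        probs = torch.softmax(logits, dim=-1)
        sampled.append(torch.multinomial(probs, num_samples=1))

    # Loop 2: copy results to the CPU; the host waits on the GPU only now,
    # after all of the kernels above have been launched.
    return [ids.flatten().tolist() for ids in sampled]
```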
vllm/model_executor/models/glm4_1v.py (1 addition, 1 deletion)

@@ -1524,7 +1524,7 @@ def get_multimodal_embeddings(
             return None

         # The result multimodal_embeddings is tuple of tensors, with each
-        # tensor correspoending to a multimodal data item (image or video).
+        # tensor corresponding to a multimodal data item (image or video).
         multimodal_embeddings: tuple[torch.Tensor, ...] = ()

         # NOTE: It is important to iterate over the keys in this dictionary
vllm/model_executor/models/interns1.py (1 addition, 1 deletion)

@@ -738,7 +738,7 @@ def get_multimodal_embeddings(self,
             return []

         # The result multimodal_embeddings is tuple of tensors, with each
-        # tensor correspoending to a multimodal data item (image or video).
+        # tensor corresponding to a multimodal data item (image or video).
         multimodal_embeddings: tuple[torch.Tensor, ...] = ()

         # NOTE: It is important to iterate over the keys in this dictionary
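The two hunks above fix the same comment in two models. A sketch of the pattern it describes, with hypothetical encoder names; the NOTE about iteration order relies on Python dicts preserving insertion order:

```python
from typing import Callable

import torch

def gather_multimodal_embeddings(
    encoders: dict[str, Callable[[torch.Tensor], torch.Tensor]],
    mm_input_by_modality: dict[str, torch.Tensor],
) -> tuple[torch.Tensor, ...]:
    # One tensor per multimodal data item (image or video), accumulated in
    # the dict's insertion order so embeddings match placeholder order.
    multimodal_embeddings: tuple[torch.Tensor, ...] = ()
    for modality, data in mm_input_by_modality.items():
        multimodal_embeddings += (encoders[modality](data),)
    return multimodal_embeddings
```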
vllm/model_executor/models/ultravox.py (1 addition, 1 deletion)

@@ -662,7 +662,7 @@ def pad_and_concat_to_dim3(
     max_len = max(f.shape[-1] for f in features)
     # Ensure all features have dim=3
     features = [f.view(-1, *f.shape[-2:]) for f in features]
-    # Pad and oncatenate:
+    # Pad and concatenate:
     # [[B1, 80, M1], [B2, 80, M2]] -> [B1+B2, 80, max(M1, M2)]
     features = [F.pad(f, (0, max_len - f.shape[-1])) for f in features]
     return torch.cat(features)
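This hunk happens to show the whole helper, so it can be exercised directly; a self-contained check of the shape contract stated in the comment (the test shapes are made up):

```python
import torch
import torch.nn.functional as F

def pad_and_concat_to_dim3(features: list[torch.Tensor]) -> torch.Tensor:
    max_len = max(f.shape[-1] for f in features)
    # Ensure all features have dim=3
    features = [f.view(-1, *f.shape[-2:]) for f in features]
    # Pad the last dim up to max_len, then concatenate along the batch dim:
    # [[B1, 80, M1], [B2, 80, M2]] -> [B1+B2, 80, max(M1, M2)]
    features = [F.pad(f, (0, max_len - f.shape[-1])) for f in features]
    return torch.cat(features)

# Two mel-spectrogram batches with different time lengths.
a, b = torch.randn(2, 80, 100), torch.randn(3, 80, 120)
assert pad_and_concat_to_dim3([a, b]).shape == (5, 80, 120)
```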