From ed6236e781fbb526e552b9b4730c1b891efdf8ea Mon Sep 17 00:00:00 2001 From: wangruohui <12756472+wangruohui@users.noreply.github.com> Date: Fri, 11 Aug 2023 18:18:40 +0800 Subject: [PATCH 1/8] add output to throughput benchmark --- benchmarks/benchmark_throughput.py | 98 ++++++++++++++++++++---------- 1 file changed, 67 insertions(+), 31 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index b2bea8520565..1b82f194888b 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -12,6 +12,17 @@ from vllm import LLM, SamplingParams from vllm.transformers_utils.tokenizer import get_tokenizer +import logging + +import time + +logging.basicConfig( + level=logging.DEBUG, + filename=f"benchmark_throughput_{time.time()}.log", + filemode="w", + format="%(asctime)s %(levelname)s %(message)s", +) + def sample_requests( dataset_path: str, @@ -22,15 +33,10 @@ def sample_requests( with open(dataset_path) as f: dataset = json.load(f) # Filter out the conversations with less than 2 turns. - dataset = [ - data for data in dataset - if len(data["conversations"]) >= 2 - ] + dataset = [data for data in dataset if len(data["conversations"]) >= 2] # Only keep the first two turns of each conversation. - dataset = [ - (data["conversations"][0]["value"], data["conversations"][1]["value"]) - for data in dataset - ] + dataset = [(data["conversations"][0]["value"], + data["conversations"][1]["value"]) for data in dataset] # Tokenize the prompts and completions. prompts = [prompt for prompt, _ in dataset] @@ -111,8 +117,8 @@ def run_hf( trust_remote_code: bool, ) -> float: assert not use_beam_search - llm = AutoModelForCausalLM.from_pretrained(model, - torch_dtype=torch.float16, trust_remote_code=trust_remote_code) + llm = AutoModelForCausalLM.from_pretrained( + model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code) if llm.config.model_type == "llama": # To enable padding in the HF backend. tokenizer.pad_token = tokenizer.eos_token @@ -132,13 +138,14 @@ def run_hf( if len(batch) < max_batch_size and i != len(requests) - 1: # Check if we can add more requests to the batch. _, next_prompt_len, next_output_len = requests[i + 1] - if (max(max_prompt_len, next_prompt_len) + max( - max_output_len, next_output_len)) <= 2048: + if (max(max_prompt_len, next_prompt_len) + + max(max_output_len, next_output_len)) <= 2048: # We can add more requests to the batch. continue # Generate the sequences. - input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids + input_ids = tokenizer(batch, return_tensors="pt", + padding=True).input_ids llm_outputs = llm.generate( input_ids=input_ids.cuda(), do_sample=not use_beam_search, @@ -165,48 +172,77 @@ def main(args: argparse.Namespace): random.seed(args.seed) # Sample the requests. 
- tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code) + tokenizer = get_tokenizer(args.tokenizer, + trust_remote_code=args.trust_remote_code) requests = sample_requests(args.dataset, args.num_prompts, tokenizer) if args.backend == "vllm": elapsed_time = run_vllm( - requests, args.model, args.tokenizer, args.tensor_parallel_size, - args.seed, args.n, args.use_beam_search, args.trust_remote_code) + requests, + args.model, + args.tokenizer, + args.tensor_parallel_size, + args.seed, + args.n, + args.use_beam_search, + args.trust_remote_code, + ) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf( - requests, args.model, tokenizer, args.n, args.use_beam_search, - args.hf_max_batch_size, args.trust_remote_code) + requests, + args.model, + tokenizer, + args.n, + args.use_beam_search, + args.hf_max_batch_size, + args.trust_remote_code, + ) else: raise ValueError(f"Unknown backend: {args.backend}") - total_num_tokens = sum( - prompt_len + output_len - for _, prompt_len, output_len in requests - ) + total_num_tokens = sum(prompt_len + output_len + for _, prompt_len, output_len in requests) + total_new_tokens = sum(output_len for _, _, output_len in requests) print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} tokens/s") + print(f"Throughput (output): " + f"{total_new_tokens / elapsed_time:.2f} tokens/s") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Benchmark the throughput.") - parser.add_argument("--backend", type=str, choices=["vllm", "hf"], + parser.add_argument("--backend", + type=str, + choices=["vllm", "hf"], default="vllm") - parser.add_argument("--dataset", type=str, required=True, + parser.add_argument("--dataset", + type=str, + required=True, help="Path to the dataset.") parser.add_argument("--model", type=str, default="facebook/opt-125m") parser.add_argument("--tokenizer", type=str, default=None) parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) - parser.add_argument("--n", type=int, default=1, + parser.add_argument("--n", + type=int, + default=1, help="Number of generated sequences per prompt.") parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument("--num-prompts", type=int, default=1000, + parser.add_argument("--num-prompts", + type=int, + default=1000, help="Number of prompts to process.") parser.add_argument("--seed", type=int, default=0) - parser.add_argument("--hf-max-batch-size", type=int, default=None, - help="Maximum batch size for HF backend.") - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') + parser.add_argument( + "--hf-max-batch-size", + type=int, + default=None, + help="Maximum batch size for HF backend.", + ) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="trust remote code from huggingface", + ) args = parser.parse_args() if args.backend == "vllm": From 377783b818121630751ffab4bdaee29994d7a9ef Mon Sep 17 00:00:00 2001 From: wangruohui <12756472+wangruohui@users.noreply.github.com> Date: Tue, 18 Jul 2023 19:30:37 +0800 Subject: [PATCH 2/8] add support for internlm --- vllm/model_executor/model_loader.py | 1 + vllm/model_executor/models/internlm.py | 32 ++++++++++++++++++-------- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py index 85d917e6d3b5..0d39eeaf1a4c 100644 --- 
a/vllm/model_executor/model_loader.py +++ b/vllm/model_executor/model_loader.py @@ -26,6 +26,7 @@ "OPTForCausalLM": OPTForCausalLM, "QWenLMHeadModel": QWenLMHeadModel, "RWForCausalLM": FalconForCausalLM, + "InternLMForCausalLM": InternLMForCausalLM, } diff --git a/vllm/model_executor/models/internlm.py b/vllm/model_executor/models/internlm.py index e2fb3f2ff064..bf17ab7236e8 100644 --- a/vllm/model_executor/models/internlm.py +++ b/vllm/model_executor/models/internlm.py @@ -1,25 +1,38 @@ # -*- coding: utf-8 -*- +import sys from typing import Dict, List, Optional, Tuple import torch from torch import nn -from transformers import LlamaConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.attention import PagedAttentionWithRoPE +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.weight_utils import (hf_model_weights_iterator, - load_tensor_parallel_weights) from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.parallel_utils.tensor_parallel import ( - VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear) + ColumnParallelLinear, RowParallelLinear, VocabParallelEmbedding) +from vllm.model_executor.weight_utils import (hf_model_weights_iterator, + load_tensor_parallel_weights) from vllm.sequence import SequenceOutputs KVCache = Tuple[torch.Tensor, torch.Tensor] +from transformers.utils import (HF_MODULES_CACHE, + TRANSFORMERS_DYNAMIC_MODULE_NAME) + +path = sys.path.append('/'.join((HF_MODULES_CACHE, TRANSFORMERS_DYNAMIC_MODULE_NAME))) + +try: + from internlm.configuration_internlm import InternLMConfig + from internlm.modeling_internlm import InternLMModel +except ImportError as e: + print(f"InternLM is not ported to transformers' local module cache at {path}." 
+ f"Try running `AutoModelForCausalLM.from_pretrained('internlm/internlm-chat-7b')` once " + f"to make transformers copy the codes to its hub.") + raise e class InternLMMLP(nn.Module): @@ -32,12 +45,12 @@ def __init__( super().__init__() self.gate_up_proj = ColumnParallelLinear(hidden_size, 2 * intermediate_size, - bias=True, + bias=False, gather_output=False, perform_initialization=False) self.down_proj = RowParallelLinear(intermediate_size, hidden_size, - bias=True, + bias=False, input_is_parallel=True, perform_initialization=False) if hidden_act != "silu": @@ -108,7 +121,7 @@ def forward( class InternLMDecoderLayer(nn.Module): - def __init__(self, config: LlamaConfig): + def __init__(self, config: InternLMConfig): super().__init__() self.hidden_size = config.hidden_size self.self_attn = InternLMAttention( @@ -155,7 +168,7 @@ def forward( class InternLMModel(nn.Module): - def __init__(self, config: LlamaConfig): + def __init__(self, config: InternLMConfig): super().__init__() self.config = config self.padding_idx = config.pad_token_id @@ -273,6 +286,7 @@ def load_weights(self, break if is_attention_weight: continue + # TODO: current implementation works without TP, bias is not handled is_gate_up_weight = False for stride_id, weight_name in enumerate(["gate_proj", "up_proj"]): From 290b29e36d2266f50191425a457fa5d232d3eda6 Mon Sep 17 00:00:00 2001 From: wangruohui <12756472+wangruohui@users.noreply.github.com> Date: Fri, 11 Aug 2023 18:55:40 +0800 Subject: [PATCH 3/8] recover benchmark --- benchmarks/benchmark_throughput.py | 98 ++++++++++-------------------- 1 file changed, 31 insertions(+), 67 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 1b82f194888b..b2bea8520565 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -12,17 +12,6 @@ from vllm import LLM, SamplingParams from vllm.transformers_utils.tokenizer import get_tokenizer -import logging - -import time - -logging.basicConfig( - level=logging.DEBUG, - filename=f"benchmark_throughput_{time.time()}.log", - filemode="w", - format="%(asctime)s %(levelname)s %(message)s", -) - def sample_requests( dataset_path: str, @@ -33,10 +22,15 @@ def sample_requests( with open(dataset_path) as f: dataset = json.load(f) # Filter out the conversations with less than 2 turns. - dataset = [data for data in dataset if len(data["conversations"]) >= 2] + dataset = [ + data for data in dataset + if len(data["conversations"]) >= 2 + ] # Only keep the first two turns of each conversation. - dataset = [(data["conversations"][0]["value"], - data["conversations"][1]["value"]) for data in dataset] + dataset = [ + (data["conversations"][0]["value"], data["conversations"][1]["value"]) + for data in dataset + ] # Tokenize the prompts and completions. prompts = [prompt for prompt, _ in dataset] @@ -117,8 +111,8 @@ def run_hf( trust_remote_code: bool, ) -> float: assert not use_beam_search - llm = AutoModelForCausalLM.from_pretrained( - model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code) + llm = AutoModelForCausalLM.from_pretrained(model, + torch_dtype=torch.float16, trust_remote_code=trust_remote_code) if llm.config.model_type == "llama": # To enable padding in the HF backend. tokenizer.pad_token = tokenizer.eos_token @@ -138,14 +132,13 @@ def run_hf( if len(batch) < max_batch_size and i != len(requests) - 1: # Check if we can add more requests to the batch. 
_, next_prompt_len, next_output_len = requests[i + 1] - if (max(max_prompt_len, next_prompt_len) + - max(max_output_len, next_output_len)) <= 2048: + if (max(max_prompt_len, next_prompt_len) + max( + max_output_len, next_output_len)) <= 2048: # We can add more requests to the batch. continue # Generate the sequences. - input_ids = tokenizer(batch, return_tensors="pt", - padding=True).input_ids + input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids llm_outputs = llm.generate( input_ids=input_ids.cuda(), do_sample=not use_beam_search, @@ -172,77 +165,48 @@ def main(args: argparse.Namespace): random.seed(args.seed) # Sample the requests. - tokenizer = get_tokenizer(args.tokenizer, - trust_remote_code=args.trust_remote_code) + tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code) requests = sample_requests(args.dataset, args.num_prompts, tokenizer) if args.backend == "vllm": elapsed_time = run_vllm( - requests, - args.model, - args.tokenizer, - args.tensor_parallel_size, - args.seed, - args.n, - args.use_beam_search, - args.trust_remote_code, - ) + requests, args.model, args.tokenizer, args.tensor_parallel_size, + args.seed, args.n, args.use_beam_search, args.trust_remote_code) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf( - requests, - args.model, - tokenizer, - args.n, - args.use_beam_search, - args.hf_max_batch_size, - args.trust_remote_code, - ) + requests, args.model, tokenizer, args.n, args.use_beam_search, + args.hf_max_batch_size, args.trust_remote_code) else: raise ValueError(f"Unknown backend: {args.backend}") - total_num_tokens = sum(prompt_len + output_len - for _, prompt_len, output_len in requests) - total_new_tokens = sum(output_len for _, _, output_len in requests) + total_num_tokens = sum( + prompt_len + output_len + for _, prompt_len, output_len in requests + ) print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} tokens/s") - print(f"Throughput (output): " - f"{total_new_tokens / elapsed_time:.2f} tokens/s") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Benchmark the throughput.") - parser.add_argument("--backend", - type=str, - choices=["vllm", "hf"], + parser.add_argument("--backend", type=str, choices=["vllm", "hf"], default="vllm") - parser.add_argument("--dataset", - type=str, - required=True, + parser.add_argument("--dataset", type=str, required=True, help="Path to the dataset.") parser.add_argument("--model", type=str, default="facebook/opt-125m") parser.add_argument("--tokenizer", type=str, default=None) parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) - parser.add_argument("--n", - type=int, - default=1, + parser.add_argument("--n", type=int, default=1, help="Number of generated sequences per prompt.") parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument("--num-prompts", - type=int, - default=1000, + parser.add_argument("--num-prompts", type=int, default=1000, help="Number of prompts to process.") parser.add_argument("--seed", type=int, default=0) - parser.add_argument( - "--hf-max-batch-size", - type=int, - default=None, - help="Maximum batch size for HF backend.", - ) - parser.add_argument( - "--trust-remote-code", - action="store_true", - help="trust remote code from huggingface", - ) + parser.add_argument("--hf-max-batch-size", type=int, default=None, + help="Maximum batch size for HF backend.") + 
parser.add_argument('--trust-remote-code', + action='store_true', + help='trust remote code from huggingface') args = parser.parse_args() if args.backend == "vllm": From 0b28ba48e62586f6a6c295b7a58c04435f5c8c6f Mon Sep 17 00:00:00 2001 From: wangruohui <12756472+wangruohui@users.noreply.github.com> Date: Fri, 11 Aug 2023 19:23:21 +0800 Subject: [PATCH 4/8] remove dynamic imports --- vllm/model_executor/models/internlm.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/models/internlm.py b/vllm/model_executor/models/internlm.py index bf17ab7236e8..00bc55357b25 100644 --- a/vllm/model_executor/models/internlm.py +++ b/vllm/model_executor/models/internlm.py @@ -4,6 +4,7 @@ import torch from torch import nn +from transformers import LlamaConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul @@ -20,20 +21,6 @@ KVCache = Tuple[torch.Tensor, torch.Tensor] -from transformers.utils import (HF_MODULES_CACHE, - TRANSFORMERS_DYNAMIC_MODULE_NAME) - -path = sys.path.append('/'.join((HF_MODULES_CACHE, TRANSFORMERS_DYNAMIC_MODULE_NAME))) - -try: - from internlm.configuration_internlm import InternLMConfig - from internlm.modeling_internlm import InternLMModel -except ImportError as e: - print(f"InternLM is not ported to transformers' local module cache at {path}." - f"Try running `AutoModelForCausalLM.from_pretrained('internlm/internlm-chat-7b')` once " - f"to make transformers copy the codes to its hub.") - raise e - class InternLMMLP(nn.Module): def __init__( @@ -121,7 +108,7 @@ def forward( class InternLMDecoderLayer(nn.Module): - def __init__(self, config: InternLMConfig): + def __init__(self, config: LlamaConfig): super().__init__() self.hidden_size = config.hidden_size self.self_attn = InternLMAttention( @@ -168,7 +155,7 @@ def forward( class InternLMModel(nn.Module): - def __init__(self, config: InternLMConfig): + def __init__(self, config: LlamaConfig): super().__init__() self.config = config self.padding_idx = config.pad_token_id From b46c724ea10789f0db1f500b1014093aa07ffcdd Mon Sep 17 00:00:00 2001 From: wangruohui <12756472+wangruohui@users.noreply.github.com> Date: Fri, 11 Aug 2023 19:31:23 +0800 Subject: [PATCH 5/8] recover imports --- vllm/model_executor/models/internlm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/internlm.py b/vllm/model_executor/models/internlm.py index 00bc55357b25..ebb25b41d02f 100644 --- a/vllm/model_executor/models/internlm.py +++ b/vllm/model_executor/models/internlm.py @@ -1,10 +1,9 @@ # -*- coding: utf-8 -*- -import sys from typing import Dict, List, Optional, Tuple import torch from torch import nn -from transformers import LlamaConfig +from transformers import LlamaConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul From c7bf18cb3587c0674c119a1cc1c7ad25cda075c1 Mon Sep 17 00:00:00 2001 From: wangruohui <12756472+wangruohui@users.noreply.github.com> Date: Fri, 11 Aug 2023 19:33:08 +0800 Subject: [PATCH 6/8] remove a comment --- vllm/model_executor/models/internlm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/internlm.py b/vllm/model_executor/models/internlm.py index ebb25b41d02f..dfd7d56c48d9 100644 --- a/vllm/model_executor/models/internlm.py +++ b/vllm/model_executor/models/internlm.py @@ -272,7 +272,6 @@ def load_weights(self, break if is_attention_weight: continue - # 
TODO: current implementation works without TP, bias is not handled is_gate_up_weight = False for stride_id, weight_name in enumerate(["gate_proj", "up_proj"]): From 28a838e08073f1432e85616cac52f0edb1a44a05 Mon Sep 17 00:00:00 2001 From: WRH <12756472+wangruohui@users.noreply.github.com> Date: Fri, 11 Aug 2023 19:36:12 +0800 Subject: [PATCH 7/8] Update model_loader.py --- vllm/model_executor/model_loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py index 0d39eeaf1a4c..85d917e6d3b5 100644 --- a/vllm/model_executor/model_loader.py +++ b/vllm/model_executor/model_loader.py @@ -26,7 +26,6 @@ "OPTForCausalLM": OPTForCausalLM, "QWenLMHeadModel": QWenLMHeadModel, "RWForCausalLM": FalconForCausalLM, - "InternLMForCausalLM": InternLMForCausalLM, } From 73b1f35bbb821f6c58b9c94b2cb4d9670dd7898e Mon Sep 17 00:00:00 2001 From: wangruohui <12756472+wangruohui@users.noreply.github.com> Date: Fri, 11 Aug 2023 19:37:44 +0800 Subject: [PATCH 8/8] yapf format --- vllm/model_executor/models/internlm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/internlm.py b/vllm/model_executor/models/internlm.py index dfd7d56c48d9..1998323352be 100644 --- a/vllm/model_executor/models/internlm.py +++ b/vllm/model_executor/models/internlm.py @@ -20,6 +20,7 @@ KVCache = Tuple[torch.Tensor, torch.Tensor] + class InternLMMLP(nn.Module): def __init__(
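
As a quick sanity check after applying this series, the new InternLM path can be exercised through vLLM's offline LLM API. The snippet below is a minimal sketch and is not part of the patches themselves: the checkpoint name is taken from the error message in PATCH 2, while the prompt and sampling settings are illustrative assumptions rather than anything prescribed by this series.

# Minimal sketch (assumed usage, not part of the patch series): load an InternLM
# checkpoint through vLLM once InternLMForCausalLM is registered in model_loader.py.
from vllm import LLM, SamplingParams

llm = LLM(
    model="internlm/internlm-chat-7b",  # checkpoint referenced in PATCH 2's import hint
    trust_remote_code=True,             # InternLM ships custom modeling code on the HF Hub
    tensor_parallel_size=1,
)

# Illustrative sampling settings; adjust as needed.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=128)

outputs = llm.generate(["Please introduce yourself."], sampling_params)
for output in outputs:
    print(output.outputs[0].text)

The same checkpoint can also be passed to benchmarks/benchmark_throughput.py with --model internlm/internlm-chat-7b --trust-remote-code, since the benchmark already exposes the --trust-remote-code flag used above.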