In [16]:
import json
import subprocess
import sys
from pathlib import Path

import pandas as pd
RESULT_DIR = Path("./simulator_output/")


## Run simulator

Reference command line for a single simulator run. `run_vidur` below uses these same values as its defaults.

```bash
python -m vidur.main  \
--replica_config_device a100 \
--replica_config_model_name meta-llama/Llama-2-7b-hf  \
--cluster_config_num_replicas 1 \
--replica_config_tensor_parallel_size 1 \
--replica_config_num_pipeline_stages 8 \
--request_generator_config_type synthetic \
--length_generator_config_type trace \
--interval_generator_config_type static \
--trace_request_length_generator_config_max_tokens 4096 \
--trace_request_length_generator_config_trace_file ./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv \
--synthetic_request_generator_config_num_requests 2048  \
--replica_scheduler_config_type vllm  \
--vllm_scheduler_config_batch_size_cap 256  \
--vllm_scheduler_config_max_tokens_in_batch 4096 \
--early_exit_type 1 \
--metrics_config_output_data_folder ee05
```

In [12]:
def run_vidur(config: dict):
    """Run one `vidur.main` simulation in a child process and wait for it.

    Every setting is passed to the simulator as a ``--<flag>=<value>``
    command-line argument. A setting missing from ``config`` falls back to
    the default listed below.

    Args:
        config: Overrides keyed by simulator flag name (without the leading
            ``--``). The output folder is the one exception: it is supplied
            under the short key ``"name"`` and forwarded as
            ``--metrics_config_output_data_folder``. Keys that are not listed
            in the defaults table are ignored.

    Raises:
        subprocess.CalledProcessError: If the simulator exits with a non-zero
            status, so a failed run cannot be mistaken for a finished one.
    """
    # Flag name -> default value. Dict order is the order of the flags on the
    # command line.
    defaults = {
        "replica_config_device": "a100",
        "replica_config_model_name": "meta-llama/Llama-2-7b-hf",
        "cluster_config_num_replicas": 1,
        "replica_config_tensor_parallel_size": 1,
        "replica_config_num_pipeline_stages": 8,
        "request_generator_config_type": "synthetic",
        "length_generator_config_type": "trace",
        "interval_generator_config_type": "static",
        "trace_request_length_generator_config_max_tokens": 4096,
        "trace_request_length_generator_config_trace_file": "./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv",
        "synthetic_request_generator_config_num_requests": 2048,
        "replica_scheduler_config_type": "vllm",
        "vllm_scheduler_config_batch_size_cap": 256,
        "vllm_scheduler_config_max_tokens_in_batch": 4096,
        "early_exit_type": 1,
    }

    # sys.executable keeps the simulator in the same interpreter/environment
    # as this notebook instead of whatever "python" resolves to on PATH.
    command = [sys.executable, "-m", "vidur.main"]
    command.extend(
        f"--{flag}={config.get(flag, default)}" for flag, default in defaults.items()
    )
    # The output folder is configured through the short "name" key.
    command.append(f"--metrics_config_output_data_folder={config.get('name', 'ee05')}")

    # check=True turns a non-zero exit status into an exception rather than
    # letting later cells silently analyze stale or missing results.
    subprocess.run(command, check=True)

In [17]:
def analyze_trace(trace_name: str):
    """Summarize the request timing CSVs of one simulator run.

    Reads ``request_e2e_time.csv`` and ``request_execution_time.csv`` from
    ``RESULT_DIR / trace_name / "plots/"`` and returns the mean, 95th and
    99th percentile of the ``request_e2e_time`` and ``request_execution_time``
    columns.

    Args:
        trace_name: Name of the run's folder under ``RESULT_DIR``.

    Returns:
        Dict with keys ``e2e_mean``, ``e2e_95th``, ``e2e_99th``,
        ``exec_mean``, ``exec_95th`` and ``exec_99th``.
    """
    plots_dir = RESULT_DIR / trace_name / "plots/"

    # Load both files before touching any column, so a missing file is
    # reported ahead of a missing column.
    frames = {
        "e2e": pd.read_csv(plots_dir / "request_e2e_time.csv"),
        "exec": pd.read_csv(plots_dir / "request_execution_time.csv"),
    }
    columns = {
        "e2e": frames["e2e"]["request_e2e_time"],
        "exec": frames["exec"]["request_execution_time"],
    }

    summary = {}
    for prefix, series in columns.items():
        summary[f"{prefix}_mean"] = series.mean()
        summary[f"{prefix}_95th"] = series.quantile(0.95)
        summary[f"{prefix}_99th"] = series.quantile(0.99)
    return summary
def compute_gap(oroginal, ee, skip_chance: float=0.5):
    """Relative deviation of ``ee`` from a scaled-down copy of ``oroginal``.

    For each key, the expected value is ``oroginal[key] * (1 - skip_chance)``
    and the gap is ``(ee[key] - expected) / expected``.

    Args:
        oroginal: Mapping of metric name to baseline value. The misspelled
            parameter name is kept so keyword callers continue to work.
        ee: Mapping holding a value for every key of ``oroginal``.
        skip_chance: Fraction by which each baseline value is reduced to
            obtain the expected value.

    Returns:
        Dict mapping each key of ``oroginal`` to its relative gap.
    """
    def _relative_gap(metric, baseline):
        target = baseline * (1 - skip_chance)
        return (ee[metric] - target) / target

    return {metric: _relative_gap(metric, baseline) for metric, baseline in oroginal.items()}
def get_e2e_over_exec(trace: str):
    """Ratio of end-to-end time to execution time for one run.

    Args:
        trace: Run name, passed straight to ``analyze_trace``.

    Returns:
        Dict with keys ``mean``, ``95th`` and ``99th``; each value is the
        matching ``e2e_*`` statistic divided by the ``exec_*`` statistic.
    """
    stats = analyze_trace(trace)
    return {
        label: stats[f"e2e_{label}"] / stats[f"exec_{label}"]
        for label in ("mean", "95th", "99th")
    }
def get_queue_length(trace: str):
    """Load the parsed contents of a run's ``plots/queue_length.json``.

    Args:
        trace: Name of the run's folder under ``RESULT_DIR``.

    Returns:
        Whatever JSON value the file contains, as decoded by ``json``.
    """
    json_path = RESULT_DIR / trace / "plots/queue_length.json"
    return json.loads(json_path.read_text())

## Batch Size experiments

In [21]:
# Batch-size caps to sweep; one simulator run is launched per value, in order.
BATCH_SIZES = [64, 128, 256, 512, 1024]
for batch_size in BATCH_SIZES:
    # Per-run name; run_vidur forwards it as the metrics output data folder.
    name = f"ee_batch{batch_size}"
    run_vidur({
        "name": name,
        # Only the batch-size cap is overridden; every other setting uses
        # run_vidur's default.
        "vllm_scheduler_config_batch_size_cap": batch_size,
    })


INFO 10-23 11:50:13 trace_request_length_generator.py:78] Loaded request length trace file ./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv with 28257 requests
----------
Execution Time Predictor initialized with Early-Exit type=EarlyExitType.EE
----------
INFO 10-23 11:50:14 simulator.py:61] Starting simulation with cluster: Cluster({'id': 0, 'num_replicas': 1}) and 2048 requests


  total_flops_per_second = total_flops / batch_stage.execution_time
  self._numer_sum += self._last_data_y * x_diff


-----Queue length-----
{0: 1.32434209780378, 1: 0.4435298252775038, 2: 0.4034604351621937, 3: 0.40178237355012075, 4: 0.40258738959375034, 5: 0.40317697880880304, 6: 0.4116466546481173, 7: 0.3213941517285168}
INFO 10-23 11:50:45 simulator.py:88] Simulation ended at: 479.9737835667563s
INFO 10-23 11:50:45 simulator.py:91] Writing output


Error importing optional module IPython.core.display
Traceback (most recent call last):
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/site-packages/_plotly_utils/optional_imports.py", line 28, in get_module
    return import_module(name)
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_

INFO 10-23 11:50:53 simulator.py:94] Metrics written
INFO 10-23 11:51:02 simulator.py:102] Chrome event trace written
INFO 10-23 11:51:04 trace_request_length_generator.py:78] Loaded request length trace file ./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv with 28257 requests
----------
Execution Time Predictor initialized with Early-Exit type=EarlyExitType.EE
----------
INFO 10-23 11:51:05 simulator.py:61] Starting simulation with cluster: Cluster({'id': 0, 'num_replicas': 1}) and 2048 requests


  total_flops_per_second = total_flops / batch_stage.execution_time
  self._numer_sum += self._last_data_y * x_diff


-----Queue length-----
{0: 1.3596411907259378, 1: 0.477248887763591, 2: 0.447571165045032, 3: 0.4482584005497884, 4: 0.4642094983180815, 5: 0.48048612869389085, 6: 0.49486381885918906, 7: 0.3815965565884183}
INFO 10-23 11:51:26 simulator.py:88] Simulation ended at: 378.28410941834477s
INFO 10-23 11:51:26 simulator.py:91] Writing output


Error importing optional module IPython.core.display
Traceback (most recent call last):
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/site-packages/_plotly_utils/optional_imports.py", line 28, in get_module
    return import_module(name)
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_

INFO 10-23 11:51:34 simulator.py:94] Metrics written
INFO 10-23 11:51:40 simulator.py:102] Chrome event trace written
INFO 10-23 11:51:42 trace_request_length_generator.py:78] Loaded request length trace file ./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv with 28257 requests
----------
Execution Time Predictor initialized with Early-Exit type=EarlyExitType.EE
----------
INFO 10-23 11:51:43 simulator.py:61] Starting simulation with cluster: Cluster({'id': 0, 'num_replicas': 1}) and 2048 requests


  total_flops_per_second = total_flops / batch_stage.execution_time
  self._numer_sum += self._last_data_y * x_diff


-----Queue length-----
{0: 1.2584892404569228, 1: 0.4026176784366787, 2: 0.38060419973458925, 3: 0.40017173635866876, 4: 0.42231531836278, 5: 0.4502094663162551, 6: 0.47599594077697693, 7: 0.32067861882334575}
INFO 10-23 11:52:00 simulator.py:88] Simulation ended at: 299.6331056283327s
INFO 10-23 11:52:00 simulator.py:91] Writing output


Error importing optional module IPython.core.display
Traceback (most recent call last):
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/site-packages/_plotly_utils/optional_imports.py", line 28, in get_module
    return import_module(name)
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_

INFO 10-23 11:52:07 simulator.py:94] Metrics written
INFO 10-23 11:52:12 simulator.py:102] Chrome event trace written
INFO 10-23 11:52:22 trace_request_length_generator.py:78] Loaded request length trace file ./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv with 28257 requests
----------
Execution Time Predictor initialized with Early-Exit type=EarlyExitType.EE
----------
INFO 10-23 11:52:24 simulator.py:61] Starting simulation with cluster: Cluster({'id': 0, 'num_replicas': 1}) and 2048 requests


  total_flops_per_second = total_flops / batch_stage.execution_time
  self._numer_sum += self._last_data_y * x_diff


-----Queue length-----
{0: 0.9527474877057943, 1: 0.2488163963468646, 2: 0.23837013958886955, 3: 0.2526039280368979, 4: 0.2657991997312074, 5: 0.28174348636183144, 6: 0.31430404105195636, 7: 0.2080698860686032}
INFO 10-23 11:52:38 simulator.py:88] Simulation ended at: 233.13236881007342s
INFO 10-23 11:52:38 simulator.py:91] Writing output


Error importing optional module IPython.core.display
Traceback (most recent call last):
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/site-packages/_plotly_utils/optional_imports.py", line 28, in get_module
    return import_module(name)
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_

INFO 10-23 11:52:49 simulator.py:94] Metrics written
INFO 10-23 11:52:54 simulator.py:102] Chrome event trace written
INFO 10-23 11:52:56 trace_request_length_generator.py:78] Loaded request length trace file ./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv with 28257 requests
----------
Execution Time Predictor initialized with Early-Exit type=EarlyExitType.EE
----------
INFO 10-23 11:52:57 simulator.py:61] Starting simulation with cluster: Cluster({'id': 0, 'num_replicas': 1}) and 2048 requests


  total_flops_per_second = total_flops / batch_stage.execution_time
  self._numer_sum += self._last_data_y * x_diff


-----Queue length-----
{0: 0.7386723342678846, 1: 0.35506918694039974, 2: 0.36972053902505203, 3: 0.39196888848693134, 4: 0.41390069639142624, 5: 0.45758343131048207, 6: 0.5236953965813512, 7: 0.3867685629013295}
INFO 10-23 11:53:09 simulator.py:88] Simulation ended at: 247.3667589906704s
INFO 10-23 11:53:09 simulator.py:91] Writing output


Error importing optional module IPython.core.display
Traceback (most recent call last):
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/site-packages/_plotly_utils/optional_imports.py", line 28, in get_module
    return import_module(name)
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_

INFO 10-23 11:53:16 simulator.py:94] Metrics written
INFO 10-23 11:53:20 simulator.py:102] Chrome event trace written


In [22]:
# Print, for each swept batch size, the e2e/execution time ratios and the
# contents of that run's queue_length.json.
for batch_size in BATCH_SIZES:
    # Same naming pattern as the sweep that launched the runs.
    trace = f"ee_batch{batch_size}"
    print(f"batch size = {batch_size}")
    print("E2E / Exec")
    print(get_e2e_over_exec(trace))
    print("Queue Length")
    print(get_queue_length(trace))
    # Blank line between batch sizes.
    print()

batch size = 64
E2E / Exec
{'mean': 73.89105626012383, '95th': 59.24477736663204, '99th': 14.037986886170522}
Queue Length
{'0': 1.32434209780378, '1': 0.4435298252775038, '2': 0.4034604351621937, '3': 0.40178237355012075, '4': 0.40258738959375034, '5': 0.40317697880880304, '6': 0.4116466546481173, '7': 0.3213941517285168}

batch size = 128
E2E / Exec
{'mean': 47.030220006386415, '95th': 36.301619040961384, '99th': 9.3922261560815}
Queue Length
{'0': 1.3596411907259378, '1': 0.477248887763591, '2': 0.447571165045032, '3': 0.4482584005497884, '4': 0.4642094983180815, '5': 0.48048612869389085, '6': 0.49486381885918906, '7': 0.3815965565884183}

batch size = 256
E2E / Exec
{'mean': 27.2069468831687, '95th': 18.30399983175588, '99th': 6.478501609392427}
Queue Length
{'0': 1.2584892404569228, '1': 0.4026176784366787, '2': 0.38060419973458925, '3': 0.40017173635866876, '4': 0.42231531836278, '5': 0.4502094663162551, '6': 0.47599594077697693, '7': 0.32067861882334575}

batch size = 512
E2E / 

**No EE (batch size = 256)**

E2E / Exec

{'mean': 22.341935400591645, '95th': 14.987146524971076, '99th': 5.273660358307038}

Queue Length

{'0': 0.40735343863027246, '1': 0.31297650334365484, '2': 0.3305144284561942, '3': 0.3488850147016731, '4': 0.36899898519424423, '5': 0.397231401732976, '6': 0.4397751814941063, '7': 0.43327001639301604}

In [23]:
# Baseline run: early_exit_type is set to 0 instead of run_vidur's default of 1;
# all other settings (including the batch-size cap of 256) stay at their defaults.
run_vidur({
        "name": "default",
        "early_exit_type": 0
    })
# Report the same two metrics as for the batch-size sweep, read from the
# "default" run's results.
print("Default (No EE)")
print("E2E / Exec")
print(get_e2e_over_exec("default"))
print("Queue Length")
print(get_queue_length("default"))


INFO 10-23 11:56:08 trace_request_length_generator.py:78] Loaded request length trace file ./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv with 28257 requests
----------
Execution Time Predictor initialized with Early-Exit type=EarlyExitType.NO_EE
----------
INFO 10-23 11:56:09 simulator.py:61] Starting simulation with cluster: Cluster({'id': 0, 'num_replicas': 1}) and 2048 requests
-----Queue length-----
{0: 0.40735343863027246, 1: 0.31297650334365484, 2: 0.3305144284561942, 3: 0.3488850147016731, 4: 0.36899898519424423, 5: 0.397231401732976, 6: 0.4397751814941063, 7: 0.43327001639301604}
INFO 10-23 11:56:24 simulator.py:88] Simulation ended at: 409.49599237395756s
INFO 10-23 11:56:24 simulator.py:91] Writing output


Error importing optional module IPython.core.display
Traceback (most recent call last):
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/site-packages/_plotly_utils/optional_imports.py", line 28, in get_module
    return import_module(name)
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_

INFO 10-23 11:56:32 simulator.py:94] Metrics written
INFO 10-23 11:56:37 simulator.py:102] Chrome event trace written
Default (No EE)
E2E / Exec
{'mean': 22.341935400591645, '95th': 14.987146524971076, '99th': 5.273660358307038}
Queue Length
{'0': 0.40735343863027246, '1': 0.31297650334365484, '2': 0.3305144284561942, '3': 0.3488850147016731, '4': 0.36899898519424423, '5': 0.397231401732976, '6': 0.4397751814941063, '7': 0.43327001639301604}
