In [27]:
import subprocess
from pathlib import Path
import pandas as pd
import json
RESULT_DIR = Path("./simulator_output/")


## Run simulator
```bash
python -m vidur.main  \
--replica_config_device a100 \
--replica_config_model_name meta-llama/Llama-2-7b-hf  \
--cluster_config_num_replicas 1 \
--replica_config_tensor_parallel_size 1 \
--replica_config_num_pipeline_stages 8 \
--request_generator_config_type synthetic \
--length_generator_config_type trace \
--interval_generator_config_type static \
--trace_request_length_generator_config_max_tokens 4096 \
--trace_request_length_generator_config_trace_file ./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv \
--synthetic_request_generator_config_num_requests 2048  \
--replica_scheduler_config_type vllm  \
--vllm_scheduler_config_batch_size_cap 256  \
--vllm_scheduler_config_max_tokens_in_batch 4096 \
--early_exit_type 1 \
--metrics_config_output_data_folder ee05
```

In [10]:
def run_vidur(config: dict):
    """Run one Vidur simulation (``python -m vidur.main``) in a subprocess.

    Every simulator flag has a default; ``config`` overrides individual values
    by key. Keys are the CLI flag names listed in ``flag_defaults`` below, plus
    two special keys:

    * ``"name"`` sets ``--metrics_config_output_data_folder`` (default
      ``"ee05"``).
    * ``"enable_priority_queue"`` (default ``False``) selects between
      ``--vllm_scheduler_config_enable_priority_queue`` and its ``--no-`` form.

    Any other key in ``config`` is ignored.

    Args:
        config: Overrides for the default simulator settings.

    Raises:
        subprocess.CalledProcessError: If the simulator exits with a non-zero
            status, so that later cells do not silently analyse stale or
            missing output files.
    """
    # Flags whose config key is identical to the CLI flag name, in the order
    # they are passed on the command line, with their default values.
    flag_defaults = {
        "replica_config_device": "a100",
        "replica_config_model_name": "meta-llama/Llama-2-7b-hf",
        "cluster_config_num_replicas": 1,
        "replica_config_tensor_parallel_size": 1,
        "replica_config_num_pipeline_stages": 8,
        "request_generator_config_type": "synthetic",
        "length_generator_config_type": "trace",
        "interval_generator_config_type": "static",
        "trace_request_length_generator_config_max_tokens": 4096,
        "trace_request_length_generator_config_trace_file": "./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv",
        "synthetic_request_generator_config_num_requests": 2048,
        "replica_scheduler_config_type": "vllm",
        "vllm_scheduler_config_batch_size_cap": 256,
        "vllm_scheduler_config_max_tokens_in_batch": 4096,
        "early_exit_type": 1,
    }

    # NOTE(review): "python" is resolved via PATH; confirm it is the
    # environment that has vidur installed.
    command = ["python", "-m", "vidur.main"]
    for flag, default in flag_defaults.items():
        command.append(f"--{flag}={config.get(flag, default)}")

    # The output folder is exposed under the shorter config key "name".
    command.append(f"--metrics_config_output_data_folder={config.get('name', 'ee05')}")

    # The priority queue is an on/off switch rather than a key=value flag.
    if config.get("enable_priority_queue", False):
        command.append("--vllm_scheduler_config_enable_priority_queue")
    else:
        command.append("--no-vllm_scheduler_config_enable_priority_queue")

    # check=True turns a failed simulation into an exception instead of
    # letting the notebook carry on as if the run had succeeded.
    subprocess.run(command, check=True)

In [4]:
def analyze_trace(trace_name: str):
    """Summarise the request timing CSVs of one simulator run.

    Reads ``request_e2e_time.csv`` and ``request_execution_time.csv`` from
    ``RESULT_DIR / trace_name / "plots/"`` and reports the mean, 0.95 quantile
    and 0.99 quantile of the ``request_e2e_time`` and
    ``request_execution_time`` columns.

    Args:
        trace_name: Name of the run's folder under ``RESULT_DIR``.

    Returns:
        Dict with keys ``e2e_mean``, ``e2e_95th``, ``e2e_99th``,
        ``exec_mean``, ``exec_95th`` and ``exec_99th``.
    """
    plots_dir = RESULT_DIR / trace_name / "plots/"

    # Result-key prefix -> the timing column it summarises.
    timings = {
        "e2e": pd.read_csv(plots_dir / "request_e2e_time.csv")["request_e2e_time"],
        "exec": pd.read_csv(plots_dir / "request_execution_time.csv")["request_execution_time"],
    }

    summary = {}
    for prefix, series in timings.items():
        summary[f"{prefix}_mean"] = series.mean()
        summary[f"{prefix}_95th"] = series.quantile(0.95)
        summary[f"{prefix}_99th"] = series.quantile(0.99)
    return summary
def compute_gap(oroginal, ee, skip_chance: float=0.5):
    """Relative gap between measured values and a scaled-down baseline.

    Each baseline value is multiplied by ``1 - skip_chance`` to obtain the
    expected value; the gap for a key is
    ``(ee[key] - expected) / expected``.

    Args:
        oroginal: Mapping of metric name to baseline value.
        ee: Mapping holding a measured value for every key of ``oroginal``;
            extra keys are ignored.
        skip_chance: Fraction by which the baseline is expected to shrink.

    Returns:
        Dict mapping each key of ``oroginal`` to its relative gap.
    """
    keep_fraction = 1 - skip_chance
    expected = {key: value * keep_fraction for key, value in oroginal.items()}
    return {key: (ee[key] - target) / target for key, target in expected.items()}
def get_e2e_over_exec(trace: str):
    """Ratio of end-to-end time to execution time for one run.

    Uses the statistics returned by ``analyze_trace`` and divides each
    ``e2e_*`` value by the matching ``exec_*`` value.

    Args:
        trace: Name of the run's folder, passed through to ``analyze_trace``.

    Returns:
        Dict with keys ``'mean'``, ``'95th'`` and ``'99th'``.
    """
    stats = analyze_trace(trace)
    return {
        label: stats[f"e2e_{label}"] / stats[f"exec_{label}"]
        for label in ("mean", "95th", "99th")
    }
def get_queue_length(trace: str):
    """Load the ``plots/queue_length.json`` file written for one run.

    Args:
        trace: Name of the run's folder under ``RESULT_DIR``.

    Returns:
        The parsed JSON content of the file.
    """
    json_path = RESULT_DIR / trace / "plots/queue_length.json"
    with open(json_path, "r") as handle:
        return json.load(handle)

In [30]:
# Smoke test: default settings with the priority queue switched on,
# written to the "test" output folder.
run_vidur({"name": "test", "enable_priority_queue": True})

INFO 11-11 15:24:15 trace_request_length_generator.py:78] Loaded request length trace file ./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv with 28257 requests
----------
Execution Time Predictor initialized with Early-Exit type=EarlyExitType.EE
----------
INFO 11-11 15:24:17 simulator.py:62] Starting simulation with cluster: Cluster({'id': 0, 'num_replicas': 1}) and 2048 requests
Time to tokens file created: simulator_output/test/time_to_tokens.csv


  total_flops_per_second = total_flops / batch_stage.execution_time
  self._numer_sum += self._last_data_y * x_diff


-----Queue length-----
{0: 1.2097030953885028, 1: 0.37253316487681615, 2: 0.36525584333543903, 3: 0.37379658875552746, 4: 0.4013139608338598, 5: 0.42142766898294376, 6: 0.4587492103600758, 7: 0.31681617182564753}
INFO 11-11 15:24:43 simulator.py:97] Simulation ended at: 296.24695084102177s
INFO 11-11 15:24:43 simulator.py:100] Writing output


Error importing optional module IPython.core.display
Traceback (most recent call last):
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/site-packages/_plotly_utils/optional_imports.py", line 28, in get_module
    return import_module(name)
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_

INFO 11-11 15:24:55 simulator.py:103] Metrics written
INFO 11-11 15:25:00 simulator.py:111] Chrome event trace written


## Batch Size experiments

In [21]:
# Sweep the scheduler's batch-size cap; every other setting keeps the
# run_vidur default. Each run is written to its own "ee_batch<N>" folder.
BATCH_SIZES = [64, 128, 256, 512, 1024]
for batch_size in BATCH_SIZES:
    name = "ee_batch" + str(batch_size)
    run_vidur({"name": name, "vllm_scheduler_config_batch_size_cap": batch_size})


INFO 10-23 11:50:13 trace_request_length_generator.py:78] Loaded request length trace file ./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv with 28257 requests
----------
Execution Time Predictor initialized with Early-Exit type=EarlyExitType.EE
----------
INFO 10-23 11:50:14 simulator.py:61] Starting simulation with cluster: Cluster({'id': 0, 'num_replicas': 1}) and 2048 requests


  total_flops_per_second = total_flops / batch_stage.execution_time
  self._numer_sum += self._last_data_y * x_diff


-----Queue length-----
{0: 1.32434209780378, 1: 0.4435298252775038, 2: 0.4034604351621937, 3: 0.40178237355012075, 4: 0.40258738959375034, 5: 0.40317697880880304, 6: 0.4116466546481173, 7: 0.3213941517285168}
INFO 10-23 11:50:45 simulator.py:88] Simulation ended at: 479.9737835667563s
INFO 10-23 11:50:45 simulator.py:91] Writing output


Error importing optional module IPython.core.display
Traceback (most recent call last):
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/site-packages/_plotly_utils/optional_imports.py", line 28, in get_module
    return import_module(name)
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_

INFO 10-23 11:50:53 simulator.py:94] Metrics written
INFO 10-23 11:51:02 simulator.py:102] Chrome event trace written
INFO 10-23 11:51:04 trace_request_length_generator.py:78] Loaded request length trace file ./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv with 28257 requests
----------
Execution Time Predictor initialized with Early-Exit type=EarlyExitType.EE
----------
INFO 10-23 11:51:05 simulator.py:61] Starting simulation with cluster: Cluster({'id': 0, 'num_replicas': 1}) and 2048 requests


  total_flops_per_second = total_flops / batch_stage.execution_time
  self._numer_sum += self._last_data_y * x_diff


-----Queue length-----
{0: 1.3596411907259378, 1: 0.477248887763591, 2: 0.447571165045032, 3: 0.4482584005497884, 4: 0.4642094983180815, 5: 0.48048612869389085, 6: 0.49486381885918906, 7: 0.3815965565884183}
INFO 10-23 11:51:26 simulator.py:88] Simulation ended at: 378.28410941834477s
INFO 10-23 11:51:26 simulator.py:91] Writing output


Error importing optional module IPython.core.display
Traceback (most recent call last):
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/site-packages/_plotly_utils/optional_imports.py", line 28, in get_module
    return import_module(name)
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_

INFO 10-23 11:51:34 simulator.py:94] Metrics written
INFO 10-23 11:51:40 simulator.py:102] Chrome event trace written
INFO 10-23 11:51:42 trace_request_length_generator.py:78] Loaded request length trace file ./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv with 28257 requests
----------
Execution Time Predictor initialized with Early-Exit type=EarlyExitType.EE
----------
INFO 10-23 11:51:43 simulator.py:61] Starting simulation with cluster: Cluster({'id': 0, 'num_replicas': 1}) and 2048 requests


  total_flops_per_second = total_flops / batch_stage.execution_time
  self._numer_sum += self._last_data_y * x_diff


-----Queue length-----
{0: 1.2584892404569228, 1: 0.4026176784366787, 2: 0.38060419973458925, 3: 0.40017173635866876, 4: 0.42231531836278, 5: 0.4502094663162551, 6: 0.47599594077697693, 7: 0.32067861882334575}
INFO 10-23 11:52:00 simulator.py:88] Simulation ended at: 299.6331056283327s
INFO 10-23 11:52:00 simulator.py:91] Writing output


Error importing optional module IPython.core.display
Traceback (most recent call last):
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/site-packages/_plotly_utils/optional_imports.py", line 28, in get_module
    return import_module(name)
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_

INFO 10-23 11:52:07 simulator.py:94] Metrics written
INFO 10-23 11:52:12 simulator.py:102] Chrome event trace written
INFO 10-23 11:52:22 trace_request_length_generator.py:78] Loaded request length trace file ./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv with 28257 requests
----------
Execution Time Predictor initialized with Early-Exit type=EarlyExitType.EE
----------
INFO 10-23 11:52:24 simulator.py:61] Starting simulation with cluster: Cluster({'id': 0, 'num_replicas': 1}) and 2048 requests


  total_flops_per_second = total_flops / batch_stage.execution_time
  self._numer_sum += self._last_data_y * x_diff


-----Queue length-----
{0: 0.9527474877057943, 1: 0.2488163963468646, 2: 0.23837013958886955, 3: 0.2526039280368979, 4: 0.2657991997312074, 5: 0.28174348636183144, 6: 0.31430404105195636, 7: 0.2080698860686032}
INFO 10-23 11:52:38 simulator.py:88] Simulation ended at: 233.13236881007342s
INFO 10-23 11:52:38 simulator.py:91] Writing output


Error importing optional module IPython.core.display
Traceback (most recent call last):
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/site-packages/_plotly_utils/optional_imports.py", line 28, in get_module
    return import_module(name)
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_

INFO 10-23 11:52:49 simulator.py:94] Metrics written
INFO 10-23 11:52:54 simulator.py:102] Chrome event trace written
INFO 10-23 11:52:56 trace_request_length_generator.py:78] Loaded request length trace file ./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv with 28257 requests
----------
Execution Time Predictor initialized with Early-Exit type=EarlyExitType.EE
----------
INFO 10-23 11:52:57 simulator.py:61] Starting simulation with cluster: Cluster({'id': 0, 'num_replicas': 1}) and 2048 requests


  total_flops_per_second = total_flops / batch_stage.execution_time
  self._numer_sum += self._last_data_y * x_diff


-----Queue length-----
{0: 0.7386723342678846, 1: 0.35506918694039974, 2: 0.36972053902505203, 3: 0.39196888848693134, 4: 0.41390069639142624, 5: 0.45758343131048207, 6: 0.5236953965813512, 7: 0.3867685629013295}
INFO 10-23 11:53:09 simulator.py:88] Simulation ended at: 247.3667589906704s
INFO 10-23 11:53:09 simulator.py:91] Writing output


Error importing optional module IPython.core.display
Traceback (most recent call last):
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/site-packages/_plotly_utils/optional_imports.py", line 28, in get_module
    return import_module(name)
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_

INFO 10-23 11:53:16 simulator.py:94] Metrics written
INFO 10-23 11:53:20 simulator.py:102] Chrome event trace written


In [22]:
# Report, per batch size, the E2E/execution-time ratios and the queue lengths
# stored by the corresponding "ee_batch<N>" run.
for batch_size in BATCH_SIZES:
    trace = f"ee_batch{batch_size}"
    print(f"batch size = {batch_size}")
    for heading, compute in (("E2E / Exec", get_e2e_over_exec), ("Queue Length", get_queue_length)):
        print(heading)
        print(compute(trace))
    print()

batch size = 64
E2E / Exec
{'mean': 73.89105626012383, '95th': 59.24477736663204, '99th': 14.037986886170522}
Queue Length
{'0': 1.32434209780378, '1': 0.4435298252775038, '2': 0.4034604351621937, '3': 0.40178237355012075, '4': 0.40258738959375034, '5': 0.40317697880880304, '6': 0.4116466546481173, '7': 0.3213941517285168}

batch size = 128
E2E / Exec
{'mean': 47.030220006386415, '95th': 36.301619040961384, '99th': 9.3922261560815}
Queue Length
{'0': 1.3596411907259378, '1': 0.477248887763591, '2': 0.447571165045032, '3': 0.4482584005497884, '4': 0.4642094983180815, '5': 0.48048612869389085, '6': 0.49486381885918906, '7': 0.3815965565884183}

batch size = 256
E2E / Exec
{'mean': 27.2069468831687, '95th': 18.30399983175588, '99th': 6.478501609392427}
Queue Length
{'0': 1.2584892404569228, '1': 0.4026176784366787, '2': 0.38060419973458925, '3': 0.40017173635866876, '4': 0.42231531836278, '5': 0.4502094663162551, '6': 0.47599594077697693, '7': 0.32067861882334575}

batch size = 512
E2E / 

**No EE (batch size = 256)**

E2E / Exec

{'mean': 22.341935400591645, '95th': 14.987146524971076, '99th': 5.273660358307038}

Queue Length

{'0': 0.40735343863027246, '1': 0.31297650334365484, '2': 0.3305144284561942, '3': 0.3488850147016731, '4': 0.36899898519424423, '5': 0.397231401732976, '6': 0.4397751814941063, '7': 0.43327001639301604}

In [23]:
# Baseline run with early_exit_type=0 (all other settings at their defaults),
# followed by the same two reports used for the batch-size sweep.
run_vidur({"name": "default", "early_exit_type": 0})
print("Default (No EE)")
for heading, compute in (("E2E / Exec", get_e2e_over_exec), ("Queue Length", get_queue_length)):
    print(heading)
    print(compute("default"))


INFO 10-23 11:56:08 trace_request_length_generator.py:78] Loaded request length trace file ./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv with 28257 requests
----------
Execution Time Predictor initialized with Early-Exit type=EarlyExitType.NO_EE
----------
INFO 10-23 11:56:09 simulator.py:61] Starting simulation with cluster: Cluster({'id': 0, 'num_replicas': 1}) and 2048 requests
-----Queue length-----
{0: 0.40735343863027246, 1: 0.31297650334365484, 2: 0.3305144284561942, 3: 0.3488850147016731, 4: 0.36899898519424423, 5: 0.397231401732976, 6: 0.4397751814941063, 7: 0.43327001639301604}
INFO 10-23 11:56:24 simulator.py:88] Simulation ended at: 409.49599237395756s
INFO 10-23 11:56:24 simulator.py:91] Writing output


Error importing optional module IPython.core.display
Traceback (most recent call last):
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/site-packages/_plotly_utils/optional_imports.py", line 28, in get_module
    return import_module(name)
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_

INFO 10-23 11:56:32 simulator.py:94] Metrics written
INFO 10-23 11:56:37 simulator.py:102] Chrome event trace written
Default (No EE)
E2E / Exec
{'mean': 22.341935400591645, '95th': 14.987146524971076, '99th': 5.273660358307038}
Queue Length
{'0': 0.40735343863027246, '1': 0.31297650334365484, '2': 0.3305144284561942, '3': 0.3488850147016731, '4': 0.36899898519424423, '5': 0.397231401732976, '6': 0.4397751814941063, '7': 0.43327001639301604}


## Output token times experiments

In [9]:
# Early-exit (type 1) and no-early-exit (type 0) runs; the priority queue
# keeps its default (disabled).
for run_name, early_exit in (("EE_NoPriority", 1), ("NoEE_NoPriority", 0)):
    run_vidur({"name": run_name, "early_exit_type": early_exit})

INFO 10-30 11:22:02 trace_request_length_generator.py:78] Loaded request length trace file ./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv with 28257 requests
----------
Execution Time Predictor initialized with Early-Exit type=EarlyExitType.EE
----------
INFO 10-30 11:22:03 simulator.py:62] Starting simulation with cluster: Cluster({'id': 0, 'num_replicas': 1}) and 2048 requests
Time to tokens file created: simulator_output/EE_NoPriority/time_to_tokens.csv


  total_flops_per_second = total_flops / batch_stage.execution_time
  self._numer_sum += self._last_data_y * x_diff


-----Queue length-----
{0: 1.2584892404569228, 1: 0.4026176784366787, 2: 0.38060419973458925, 3: 0.40017173635866876, 4: 0.42231531836278, 5: 0.4502094663162551, 6: 0.47599594077697693, 7: 0.32067861882334575}
INFO 10-30 11:22:25 simulator.py:97] Simulation ended at: 299.6331056283327s
INFO 10-30 11:22:25 simulator.py:100] Writing output


Error importing optional module IPython.core.display
Traceback (most recent call last):
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/site-packages/_plotly_utils/optional_imports.py", line 28, in get_module
    return import_module(name)
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_

INFO 10-30 11:22:36 simulator.py:103] Metrics written
INFO 10-30 11:22:41 simulator.py:111] Chrome event trace written
INFO 10-30 11:22:43 trace_request_length_generator.py:78] Loaded request length trace file ./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv with 28257 requests
----------
Execution Time Predictor initialized with Early-Exit type=EarlyExitType.NO_EE
----------
INFO 10-30 11:22:44 simulator.py:62] Starting simulation with cluster: Cluster({'id': 0, 'num_replicas': 1}) and 2048 requests
Time to tokens file created: simulator_output/NoEE_NoPriority/time_to_tokens.csv
-----Queue length-----
{0: 0.40735343863027246, 1: 0.31297650334365484, 2: 0.3305144284561942, 3: 0.3488850147016731, 4: 0.36899898519424423, 5: 0.397231401732976, 6: 0.4397751814941063, 7: 0.43327001639301604}
INFO 10-30 11:23:08 simulator.py:97] Simulation ended at: 409.49599237395756s
INFO 10-30 11:23:08 simulator.py:100] Writing output


Error importing optional module IPython.core.display
Traceback (most recent call last):
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/site-packages/_plotly_utils/optional_imports.py", line 28, in get_module
    return import_module(name)
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_

INFO 10-30 11:23:15 simulator.py:103] Metrics written
INFO 10-30 11:23:21 simulator.py:111] Chrome event trace written


In [33]:
# Same pair of runs as above, this time with the priority queue enabled.
for run_name, early_exit in (("EE_Priority", 1), ("NoEE_Priority", 0)):
    run_vidur({
        "name": run_name,
        "early_exit_type": early_exit,
        "enable_priority_queue": True,
    })

INFO 11-11 15:55:20 trace_request_length_generator.py:78] Loaded request length trace file ./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv with 28257 requests
----------
Execution Time Predictor initialized with Early-Exit type=EarlyExitType.EE
----------
INFO 11-11 15:55:21 simulator.py:62] Starting simulation with cluster: Cluster({'id': 0, 'num_replicas': 1}) and 2048 requests
Time to tokens file created: simulator_output/EE_Priority/time_to_tokens.csv


  total_flops_per_second = total_flops / batch_stage.execution_time
  self._numer_sum += self._last_data_y * x_diff


-----Queue length-----
{0: 1.2097030953885028, 1: 0.37253316487681615, 2: 0.36525584333543903, 3: 0.37379658875552746, 4: 0.4013139608338598, 5: 0.42142766898294376, 6: 0.4587492103600758, 7: 0.31681617182564753}
INFO 11-11 15:55:47 simulator.py:97] Simulation ended at: 296.24695084102177s
INFO 11-11 15:55:47 simulator.py:100] Writing output


Error importing optional module IPython.core.display
Traceback (most recent call last):
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/site-packages/_plotly_utils/optional_imports.py", line 28, in get_module
    return import_module(name)
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_

INFO 11-11 15:55:55 simulator.py:103] Metrics written
INFO 11-11 15:56:00 simulator.py:111] Chrome event trace written
INFO 11-11 15:56:03 trace_request_length_generator.py:78] Loaded request length trace file ./data/processed_traces/arxiv_summarization_stats_llama2_tokenizer_filtered_v2.csv with 28257 requests
----------
Execution Time Predictor initialized with Early-Exit type=EarlyExitType.NO_EE
----------
INFO 11-11 15:56:04 simulator.py:62] Starting simulation with cluster: Cluster({'id': 0, 'num_replicas': 1}) and 2048 requests
Time to tokens file created: simulator_output/NoEE_Priority/time_to_tokens.csv
-----Queue length-----
{0: 0.39220731960906635, 1: 0.3068205448118112, 2: 0.32454772301933876, 3: 0.343860469952173, 4: 0.3654086088583905, 5: 0.3891661468080682, 6: 0.42753171137450613, 7: 0.42698585984612186}
INFO 11-11 15:56:30 simulator.py:97] Simulation ended at: 407.70418183050515s
INFO 11-11 15:56:30 simulator.py:100] Writing output


Error importing optional module IPython.core.display
Traceback (most recent call last):
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/site-packages/_plotly_utils/optional_imports.py", line 28, in get_module
    return import_module(name)
  File "/Users/xutingl/anaconda3/envs/vidur_env/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 992, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 241, in _call_with_

INFO 11-11 15:56:38 simulator.py:103] Metrics written
INFO 11-11 15:56:43 simulator.py:111] Chrome event trace written


In [22]:
def analyze_request(trace_name: str):
    """Average the per-request token-time and request metrics of one run.

    Reads ``time_to_tokens.csv`` and ``request_metrics.csv`` from
    ``RESULT_DIR / trace_name`` and takes the mean of selected columns.

    Args:
        trace_name: Name of the run's folder under ``RESULT_DIR``.

    Returns:
        Dict with the column means, keyed ``TPOP``, ``token_times_90``,
        ``token_times_95``, ``token_times_99``, ``request_e2e_time``,
        ``request_e2e_time_normalized``, ``request_num_prefill_tokens`` and
        ``request_num_decode_tokens``.
    """
    run_dir = RESULT_DIR / trace_name
    token_df = pd.read_csv(run_dir / "time_to_tokens.csv")
    request_df = pd.read_csv(run_dir / "request_metrics.csv")

    summary = {"TPOP": token_df["TPOP"].mean()}

    # Column names in the CSV are spelled "precentile"; keep them as written.
    for pct in (90, 95, 99):
        summary[f"token_times_{pct}"] = token_df[f"time_to_token_{pct}_precentile"].mean()

    # These request metrics are reported under their own column names.
    for column in (
        "request_e2e_time",
        "request_e2e_time_normalized",
        "request_num_prefill_tokens",
        "request_num_decode_tokens",
    ):
        summary[column] = request_df[column].mean()

    return summary
    

In [26]:
# Side-by-side table of the four EE / priority-queue combinations,
# with every metric rounded to three decimals.
trace_names = ["NoEE_NoPriority", "EE_NoPriority", "NoEE_Priority", "EE_Priority"]
results = [analyze_request(trace_name) for trace_name in trace_names]

TPOPs = [result["TPOP"].round(3) for result in results]
token_times_90 = [result["token_times_90"].round(3) for result in results]
token_times_95 = [result["token_times_95"].round(3) for result in results]
token_times_99 = [result["token_times_99"].round(3) for result in results]
request_e2e_time = [result["request_e2e_time"].round(3) for result in results]

df = pd.DataFrame({
    "trace_name": trace_names,
    "TPOP": TPOPs,
    "token_times_90": token_times_90,
    "token_times_95": token_times_95,
    "token_times_99": token_times_99,
    "request_e2e_time": request_e2e_time,
})
df

Unnamed: 0,trace_name,TPOP,token_times_90,token_times_95,token_times_99,request_e2e_time
0,NoEE_NoPriority,0.191,0.407,0.445,0.559,200.944
1,EE_NoPriority,0.14,0.298,0.332,0.432,147.41
2,NoEE_Priority,0.189,0.406,0.446,0.574,198.799
3,EE_Priority,0.138,0.296,0.333,0.432,146.585


In [15]:
# Collect and print the request-level summary of the two runs that do not use
# the priority queue; the results are kept in `trace_reults` for later cells.
trace_names = ["EE_NoPriority", "NoEE_NoPriority"]
trace_reults = {}
for trace_name in trace_names:
    print(trace_name)
    summary = analyze_request(trace_name)
    trace_reults[trace_name] = summary
    print(summary)
    print()

EE_NoPriority
{'TPOP': 0.13979405298500291, 'token_times_90': 0.2976953406584648, 'token_times_95': 0.3324351733657154, 'token_times_99': 0.4321192981652931, 'request_e2e_time': 147.40964765446782, 'request_e2e_time_normalized': 0.9608448568341269, 'request_num_prefill_tokens': 2567.39599609375, 'request_num_decode_tokens': 289.83447265625}

NoEE_NoPriority
{'TPOP': 0.1909032555448989, 'token_times_90': 0.40732838053553655, 'token_times_95': 0.4445233739717671, 'token_times_99': 0.5588676652061576, 'request_e2e_time': 200.94381609418116, 'request_e2e_time_normalized': 1.3097251067321645, 'request_num_prefill_tokens': 2567.39599609375, 'request_num_decode_tokens': 289.83447265625}



In [16]:
# Relative reduction of each EE metric versus the no-EE run:
# (no_ee - ee) / no_ee.
ee_summary = trace_reults["EE_NoPriority"]
for metric in ee_summary:
    value = ee_summary[metric]
    no_ee_value = trace_reults["NoEE_NoPriority"][metric]
    print(f"Improvement for {metric}: {(no_ee_value - value) / no_ee_value}")

Improvement for TPOP: 0.26772305382647343
Improvement for token_times_90: 0.2691514883714493
Improvement for token_times_95: 0.2521536710309653
Improvement for token_times_99: 0.22679495510642758
Improvement for request_e2e_time: 0.2664136149112556
Improvement for request_e2e_time_normalized: 0.26637669851844925
Improvement for request_num_prefill_tokens: 0.0
Improvement for request_num_decode_tokens: 0.0
