test_litellm_overhead.py
import json
import os
import sys
import time
from datetime import datetime
from unittest.mock import AsyncMock, patch, MagicMock

import pytest

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path

import litellm

@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model",
    [
        "bedrock/mistral.mistral-7b-instruct-v0:2",
        "openai/gpt-4o",
        "openai/self_hosted",
        "bedrock/anthropic.claude-3-5-haiku-20241022-v1:0",
    ],
)
async def test_litellm_overhead(model):
    """Assert that LiteLLM's own overhead on a non-streaming completion stays small relative to total request time."""
    litellm._turn_on_debug()
    start_time = datetime.now()
    if model == "openai/self_hosted":
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
            api_base="https://exampleopenaiendpoint-production.up.railway.app/",
        )
    else:
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
        )
    end_time = datetime.now()
    total_time_ms = (end_time - start_time).total_seconds() * 1000
    print(response)
    print(response._hidden_params)
    litellm_overhead_ms = response._hidden_params["litellm_overhead_time_ms"]

    # calculate the percentage of total request time spent inside litellm
    overhead_percent = litellm_overhead_ms * 100 / total_time_ms

    print("##########################\n")
    print("total_time_ms", total_time_ms)
    print("response litellm_overhead_ms", litellm_overhead_ms)
    print("litellm overhead_percent {}%".format(overhead_percent))
    print("##########################\n")

    assert litellm_overhead_ms > 0
    assert litellm_overhead_ms < 1000

    # latency overhead should be less than the total request time
    assert litellm_overhead_ms < total_time_ms

    # latency overhead should be under 40% of the total request time
    assert overhead_percent < 40


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model",
    [
        "bedrock/mistral.mistral-7b-instruct-v0:2",
        "openai/gpt-4o",
        "bedrock/anthropic.claude-3-5-haiku-20241022-v1:0",
        "openai/self_hosted",
    ],
)
async def test_litellm_overhead_stream(model):
    """Assert that LiteLLM's own overhead on a streaming completion stays small relative to total request time."""
    litellm._turn_on_debug()
    start_time = datetime.now()
    if model == "openai/self_hosted":
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
            api_base="https://exampleopenaiendpoint-production.up.railway.app/",
            stream=True,
        )
    else:
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
            stream=True,
        )

    # consume the full stream so the timing covers the whole request
    async for chunk in response:
        print(chunk)

    end_time = datetime.now()
    total_time_ms = (end_time - start_time).total_seconds() * 1000
    print(response)
    print(response._hidden_params)
    litellm_overhead_ms = response._hidden_params["litellm_overhead_time_ms"]

    # calculate the percentage of total request time spent inside litellm
    overhead_percent = litellm_overhead_ms * 100 / total_time_ms

    print("##########################\n")
    print("total_time_ms", total_time_ms)
    print("response litellm_overhead_ms", litellm_overhead_ms)
    print("litellm overhead_percent {}%".format(overhead_percent))
    print("##########################\n")

    assert litellm_overhead_ms > 0
    assert litellm_overhead_ms < 1000

    # latency overhead should be less than the total request time
    assert litellm_overhead_ms < total_time_ms

    # latency overhead should be under 40% of the total request time
    assert overhead_percent < 40