<a href="https://colab.research.google.com/github/wesslen/llm-experiments/blob/main/notebooks/loadtest/openai_compatible_endpoint_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
from google.colab import userdata
api_key = userdata.get('DSBA_LLAMA3_KEY')
custom_base_url = userdata.get('MODAL_BASE_URL')

In [24]:
!uv pip install --system langchain-community langchain<0.3.0 #llama-index llama-index-llms-openai llama-index-llms-openai-like

/bin/bash: line 1: 0.3.0: No such file or directory


In [18]:
import asyncio
import time
import aiohttp
import statistics
from dataclasses import dataclass
from typing import List, Optional, Literal, Union
import numpy as np
import json
import logging
from concurrent.futures import ThreadPoolExecutor
import openai
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
import httpx
import os
from datetime import datetime

@dataclass
class TestConfig:
    model_name: str
    base_url: str
    api_key: str
    client_type: Literal["requests", "openai", "langchain"]
    system_prompt: Optional[str] = None
    temperature: float = 0.7
    top_p: float = 1.0
    max_tokens: int = 1000
    verify_ssl: bool = True
    output_path: Optional[str] = None

class LoadTester:
    def __init__(self, config: TestConfig):
        self.config = config
        self.setup_client()
        self.results = []
        if self.config.output_path:
            os.makedirs(self.config.output_path, exist_ok=True)

    def setup_client(self):
        if self.config.client_type == "openai":
            openai.api_key = self.config.api_key
            openai.base_url = self.config.base_url
            openai.http_client = httpx.Client(verify=self.config.verify_ssl)
        elif self.config.client_type == "langchain":
            self.client = ChatOpenAI(
                model_name=self.config.model_name,
                openai_api_key=self.config.api_key,
                base_url=self.config.base_url,
                temperature=self.config.temperature,
                top_p=self.config.top_p,
                max_tokens=self.config.max_tokens,
                client=httpx.Client(verify=self.config.verify_ssl),
                streaming=True  # Enable streaming
            )

    def count_tokens(self, text: str) -> int:
        # Simple approximation: 4 chars ~ 1 token
        return len(text) // 4

    # async def _make_request(self, prompt: str) -> dict:
    #     start_time = time.time()
    #     try:
    #         if self.config.client_type == "requests":
    #             async with aiohttp.ClientSession() as session:
    #                 async with session.post(
    #                     f"{self.config.base_url}/v1/chat/completions",
    #                     headers={"Authorization": f"Bearer {self.config.api_key}"},
    #                     json={
    #                         "model": self.config.model_name,
    #                         "messages": [
    #                             {"role": "system", "content": self.config.system_prompt or ""},
    #                             {"role": "user", "content": prompt}
    #                         ],
    #                         "temperature": self.config.temperature,
    #                         "top_p": self.config.top_p,
    #                         "max_tokens": self.config.max_tokens
    #                     },
    #                     ssl=self.config.verify_ssl
    #                 ) as response:
    #                     result = await response.json()
    #                     completion = result['choices'][0]['message']['content']

    #         elif self.config.client_type == "openai":
    #             # Fix: Use sync client for OpenAI
    #             result = openai.chat.completions.create(
    #                 model=self.config.model_name,
    #                 messages=[
    #                     {"role": "system", "content": self.config.system_prompt or ""},
    #                     {"role": "user", "content": prompt}
    #                 ],
    #                 temperature=self.config.temperature,
    #                 top_p=self.config.top_p,
    #                 max_tokens=min(self.config.max_tokens, 500)  # Reduce max tokens
    #             )
    #             completion = result.choices[0].message.content

    #         elif self.config.client_type == "langchain":
    #             messages = []
    #             if self.config.system_prompt:
    #                 messages.append(SystemMessage(content=self.config.system_prompt))
    #             messages.append(HumanMessage(content=prompt))
    #             result = await self.client.agenerate([messages])
    #             completion = result.generations[0][0].text

    #         latency = time.time() - start_time
    #         output_tokens = self.count_tokens(completion)
    #         throughput = output_tokens / latency if latency > 0 else 0

    #         return {
    #             "success": True,
    #             "latency": latency,
    #             "prompt_length": len(prompt),
    #             "output_tokens": output_tokens,
    #             "throughput": throughput,
    #             "completion": completion
    #         }

    #     except Exception as e:
    #         logging.error(f"Request failed: {str(e)}")
    #         return {
    #             "success": False,
    #             "error": str(e),
    #             "prompt_length": len(prompt),
    #             "output_tokens": 0,
    #             "throughput": 0
    #         }
    async def _make_request(self, prompt: str) -> dict:
        start_time = time.time()
        first_token_time = None
        try:
            if self.config.client_type == "requests":
                async with aiohttp.ClientSession() as session:
                    async with session.post(
                        f"{self.config.base_url}/v1/chat/completions",
                        headers={"Authorization": f"Bearer {self.config.api_key}"},
                        json={
                            "model": self.config.model_name,
                            "messages": [
                                {"role": "system", "content": self.config.system_prompt or ""},
                                {"role": "user", "content": prompt}
                            ],
                            "temperature": self.config.temperature,
                            "top_p": self.config.top_p,
                            "max_tokens": self.config.max_tokens,
                            "stream": True
                        },
                        ssl=self.config.verify_ssl
                    ) as response:
                        completion = ""
                        token_times = []

                        async for line in response.content:
                            if line:
                                if not first_token_time:
                                    first_token_time = time.time()

                                json_response = json.loads(line.decode('utf-8').split('data: ')[1])
                                if json_response['choices'][0]['delta'].get('content'):
                                    completion += json_response['choices'][0]['delta']['content']
                                    token_times.append(time.time())

            elif self.config.client_type == "openai":
                stream = openai.chat.completions.create(
                    model=self.config.model_name,
                    messages=[
                        {"role": "system", "content": self.config.system_prompt or ""},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=self.config.temperature,
                    stream=True,
                    max_tokens=min(self.config.max_tokens, 500)
                )

                completion = ""
                token_times = []

                for chunk in stream:
                    if not first_token_time:
                        first_token_time = time.time()

                    if chunk.choices[0].delta.content:
                        completion += chunk.choices[0].delta.content
                        token_times.append(time.time())

            elif self.config.client_type == "langchain":
                messages = []
                if self.config.system_prompt:
                    messages.append(SystemMessage(content=self.config.system_prompt))
                messages.append(HumanMessage(content=prompt))

                completion = ""
                token_times = []

                async for chunk in self.client.astream([messages]):
                    if not first_token_time:
                        first_token_time = time.time()
                    if chunk.content:
                        completion += chunk.content
                        token_times.append(time.time())

            end_time = time.time()
            total_latency = end_time - start_time
            ttft = first_token_time - start_time if first_token_time else total_latency

            output_tokens = self.count_tokens(completion)
            avg_throughput = output_tokens / total_latency if total_latency > 0 else 0

            # Calculate token generation rate over time
            if len(token_times) > 1:
                token_intervals = np.diff(token_times)
                instant_throughputs = 1 / token_intervals  # tokens per second
            else:
                instant_throughputs = [avg_throughput]

            return {
                "success": True,
                "total_latency": total_latency,
                "time_to_first_token": ttft,
                "avg_throughput": avg_throughput,
                "peak_throughput": max(instant_throughputs),
                "min_throughput": min(instant_throughputs),
                "prompt_length": len(prompt),
                "output_tokens": output_tokens,
                "completion": completion
            }

        except Exception as e:
            logging.error(f"Request failed: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "prompt_length": len(prompt),
                "output_tokens": 0,
                "throughput": 0
            }

    def save_results(self, results: dict, test_name: str):
        if self.config.output_path:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{test_name}_{timestamp}.json"
            filepath = os.path.join(self.config.output_path, filename)
            with open(filepath, 'w') as f:
                json.dump(results, f, indent=2)

    def analyze_results(self, results: List[dict]) -> dict:
        successful_requests = [r for r in results if r["success"]]
        failed_requests = [r for r in results if not r["success"]]

        if not successful_requests:
            return {"error": "All requests failed"}

        # Updated key names to match _make_request
        latencies = [r["total_latency"] for r in successful_requests]
        ttft = [r["time_to_first_token"] for r in successful_requests]
        avg_throughputs = [r["avg_throughput"] for r in successful_requests]
        peak_throughputs = [r["peak_throughput"] for r in successful_requests]
        min_throughputs = [r.get("min_throughput", r["avg_throughput"]) for r in successful_requests]
        output_tokens = [r["output_tokens"] for r in successful_requests]

        analysis = {
            "total_requests": len(results),
            "successful_requests": len(successful_requests),
            "failed_requests": len(failed_requests),
            "latency": {
                "avg": statistics.mean(latencies),
                "p50": np.percentile(latencies, 50),
                "p95": np.percentile(latencies, 95),
                "p99": np.percentile(latencies, 99),
                "min": min(latencies),
                "max": max(latencies)
            },
            "time_to_first_token": {
                "avg": statistics.mean(ttft),
                "p50": np.percentile(ttft, 50),
                "p95": np.percentile(ttft, 95),
                "p99": np.percentile(ttft, 99),
                "min": min(ttft),
                "max": max(ttft)
            },
            "throughput": {
                "avg": statistics.mean(avg_throughputs),
                "peak": statistics.mean(peak_throughputs),
                "min": statistics.mean(min_throughputs),
                "p50": np.percentile(avg_throughputs, 50),
                "p95": np.percentile(avg_throughputs, 95),
                "p99": np.percentile(avg_throughputs, 99)
            },
            "output_tokens": {
                "total": sum(output_tokens),
                "avg": statistics.mean(output_tokens),
                "p50": np.percentile(output_tokens, 50),
                "p95": np.percentile(output_tokens, 95),
                "p99": np.percentile(output_tokens, 99)
            }
        }

        return analysis

    async def run_latency_test(self, prompts: List[str], concurrency: int = 1):
        async def _batch_requests(batch: List[str]):
            tasks = [self._make_request(prompt) for prompt in batch]
            return await asyncio.gather(*tasks)

        results = []
        for i in range(0, len(prompts), concurrency):
            batch = prompts[i:i + concurrency]
            batch_results = await _batch_requests(batch)
            results.extend(batch_results)

        analysis = self.analyze_results(results)
        self.save_results(analysis, "latency_test")
        return analysis

    async def run_sustained_load_test(self,
                                    prompt: str,
                                    requests_per_second: float,
                                    duration_seconds: int):
        start_time = time.time()
        results = []

        while time.time() - start_time < duration_seconds:
            before_request = time.time()
            result = await self._make_request(prompt)
            results.append(result)

            elapsed = time.time() - before_request
            wait_time = max(0, (1 / requests_per_second) - elapsed)
            await asyncio.sleep(wait_time)

        analysis = self.analyze_results(results)
        self.save_results(analysis, "sustained_test")
        return analysis

    def generate_variable_length_prompts(self,
                                       base_prompt: str,
                                       n_prompts: int,
                                       min_length: int = 100,
                                       max_length: int = 1000) -> List[str]:
        lengths = np.linspace(min_length, max_length, n_prompts, dtype=int)
        prompts = []

        for length in lengths:
            padding_length = max(0, length - len(base_prompt))
            padding = "X" * padding_length
            prompts.append(base_prompt + padding)

        return prompts

In [12]:
import nest_asyncio
nest_asyncio.apply()

async def run_notebook_tests():
    config = TestConfig(
        model_name="/models/NousResearch/Meta-Llama-3.1-8B-Instruct",
        base_url=custom_base_url,
        api_key=api_key,
        client_type="openai",
        system_prompt="You are a helpful AI assistant.",
        verify_ssl=False,
        output_path="./test_results"
    )

    tester = LoadTester(config)
    results = {}

    # Basic test
    results['basic'] = await tester.run_latency_test(
        prompts=["What is artificial intelligence?"],
        concurrency=1
    )

    # Long prompt test
    long_prompt = "Explain the complete history of artificial intelligence, " * 50
    results['long_prompt'] = await tester.run_latency_test(
        prompts=[long_prompt],
        concurrency=1
    )

    # High temperature test
    config.temperature = 0.9
    tester = LoadTester(config)
    results['high_temp'] = await tester.run_latency_test(
        prompts=["Write a creative story about a robot."] * 5,
        concurrency=1
    )

    # Concurrent requests test
    config.temperature = 0.7
    tester = LoadTester(config)
    results['concurrent'] = await tester.run_latency_test(
        prompts=["Summarize the benefits of exercise."] * 10,
        concurrency=5
    )

    # Sustained load test
    results['sustained'] = await tester.run_sustained_load_test(
        prompt="What are the benefits of meditation?",
        requests_per_second=2,
        duration_seconds=30
    )

    return results

# Run tests
results = await run_notebook_tests()
print(json.dumps(results, indent=2))

{
  "basic": {
    "total_requests": 1,
    "successful_requests": 1,
    "failed_requests": 0,
    "latency": {
      "avg": 13.740294694900513,
      "p50": 13.740294694900513,
      "p95": 13.740294694900513,
      "p99": 13.740294694900513,
      "min": 13.740294694900513,
      "max": 13.740294694900513
    },
    "time_to_first_token": {
      "avg": 0.3716566562652588,
      "p50": 0.3716566562652588,
      "p95": 0.3716566562652588,
      "p99": 0.3716566562652588,
      "min": 0.3716566562652588,
      "max": 0.3716566562652588
    },
    "throughput": {
      "avg": 32.532053345689654,
      "peak": 84.83109843658355,
      "min": 15.076090277453282,
      "p50": 32.532053345689654,
      "p95": 32.532053345689654,
      "p99": 32.532053345689654
    },
    "output_tokens": {
      "total": 447,
      "avg": 447,
      "p50": 447.0,
      "p95": 447.0,
      "p99": 447.0
    }
  },
  "long_prompt": {
    "total_requests": 1,
    "successful_requests": 1,
    "failed_requests"

In [19]:
import nest_asyncio
nest_asyncio.apply()
import asyncio
import json
from datetime import datetime
import pandas as pd

async def run_comprehensive_tests():
    config = TestConfig(
        model_name="/models/NousResearch/Meta-Llama-3.1-8B-Instruct",
        base_url=custom_base_url,
        api_key=api_key,
        client_type="langchain",
        system_prompt="You are a helpful AI assistant.",
        verify_ssl=False,
        output_path="./test_results"
    )

    tester = LoadTester(config)
    results = {}

    # 1. Basic Baseline Test
    print("\nRunning Basic Baseline Test...")
    results['baseline'] = await tester.run_latency_test(
        prompts=["Explain what is machine learning in simple terms."],
        concurrency=1
    )

    # 2. System Prompt Impact Test
    print("\nRunning System Prompt Impact Test...")
    # Without system prompt
    config.system_prompt = None
    tester = LoadTester(config)
    results['no_system_prompt'] = await tester.run_latency_test(
        prompts=["Explain what is machine learning in simple terms."],
        concurrency=1
    )

    # With detailed system prompt
    config.system_prompt = """You are an AI assistant with expertise in technical topics.
    Always provide detailed, well-structured explanations with examples.
    Break down complex concepts into simpler terms."""
    tester = LoadTester(config)
    results['with_system_prompt'] = await tester.run_latency_test(
        prompts=["Explain what is machine learning in simple terms."],
        concurrency=1
    )

    # 3. Variable Length Prompts (Short)
    print("\nRunning Short Variable Length Test...")
    base_prompt = "Explain the concept of machine learning"
    short_prompts = tester.generate_variable_length_prompts(
        base_prompt=base_prompt,
        n_prompts=5,
        min_length=100,
        max_length=1000
    )
    results['short_variable'] = await tester.run_latency_test(
        prompts=short_prompts,
        concurrency=1
    )

    # 4. Variable Length Prompts (Long)
    print("\nRunning Long Variable Length Test...")
    long_prompts = tester.generate_variable_length_prompts(
        base_prompt=base_prompt,
        n_prompts=3,
        min_length=1000,
        max_length=3000
    )
    results['long_variable'] = await tester.run_latency_test(
        prompts=long_prompts,
        concurrency=1
    )

    # 5. Chain-of-Thought vs Direct
    print("\nRunning Chain-of-Thought Test...")
    cot_prompt = """Question: A ball costs $6. A bat costs $12 more than the ball.
    How much do the ball and bat cost together?
    Let's solve this step by step:
    1) First, let's find the cost of the bat
    2) Then, add the costs together"""

    direct_prompt = "A ball costs $6. A bat costs $12 more than the ball. How much do the ball and bat cost together?"

    results['reasoning_comparison'] = await tester.run_latency_test(
        prompts=[cot_prompt, direct_prompt],
        concurrency=1
    )

    # 6. Temperature Sweep
    print("\nRunning Temperature Sweep Test...")
    temps = [0.0, 0.3, 0.7, 1.0]
    temp_results = {}

    for temp in temps:
        config.temperature = temp
        tester = LoadTester(config)
        temp_results[f'temp_{temp}'] = await tester.run_latency_test(
            prompts=["Write a creative story about a robot discovering emotions."],
            concurrency=1
        )
    results['temperature_sweep'] = temp_results

    # 7. Concurrent Load (Light)
    print("\nRunning Light Concurrent Load Test...")
    results['light_concurrent'] = await tester.run_latency_test(
        prompts=["Explain a complex topic simply."] * 3,
        concurrency=3
    )

    # 8. Concurrent Load (Heavy)
    print("\nRunning Heavy Concurrent Load Test...")
    results['heavy_concurrent'] = await tester.run_latency_test(
        prompts=["Explain a complex topic simply."] * 10,
        concurrency=10
    )

    # 9. Burst Load
    print("\nRunning Burst Load Test...")
    results['burst'] = await tester.run_latency_test(
        prompts=["Give me a quick explanation."] * 5,
        concurrency=5
    )

    # 10. Mixed Workload
    print("\nRunning Mixed Workload Test...")
    mixed_prompts = [
        "Short simple response.",
        "A" * 1000 + " Please explain this.",
        "Write a creative story about robots.",
        "Explain quantum computing step by step.",
        "Summarize the theory of relativity briefly."
    ]
    results['mixed_workload'] = await tester.run_latency_test(
        prompts=mixed_prompts,
        concurrency=2  # Process in small batches
    )

    # Save all results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    with open(f'comprehensive_results_{timestamp}.json', 'w') as f:
        json.dump(results, f, indent=2)

    return results

# Run all tests
results = await run_comprehensive_tests()

# Create summary DataFrame
def extract_metrics(result):
    if isinstance(result, dict) and 'latency' in result:
        return pd.Series({
            'avg_latency': result['latency']['avg'],
            'ttft': result['time_to_first_token']['avg'],
            'avg_throughput': result['throughput']['avg'],
            'peak_throughput': result['throughput']['peak'],
            'total_tokens': result['output_tokens']['total']
        })
    return pd.Series()

summary_data = {}
for test_name, result in results.items():
    if test_name != 'temperature_sweep':
        summary_data[test_name] = extract_metrics(result)
    else:
        for temp_name, temp_result in result.items():
            summary_data[temp_name] = extract_metrics(temp_result)

summary_df = pd.DataFrame(summary_data).T
print("\nTest Summary:")
print(summary_df.round(2))

                    top_p was transferred to model_kwargs.
                    Please confirm that top_p is what you intended.
ERROR:root:Request failed: Unexpected message type: 'content='You are a helpful AI assistant.' additional_kwargs={} response_metadata={}'. Use one of 'human', 'user', 'ai', 'assistant', 'function', 'tool', or 'system'.
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/MESSAGE_COERCION_FAILURE 
                    top_p was transferred to model_kwargs.
                    Please confirm that top_p is what you intended.
ERROR:root:Request failed: Unsupported message type: <class 'list'>
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/MESSAGE_COERCION_FAILURE 
                    top_p was transferred to model_kwargs.
                    Please confirm that top_p is what you intended.
ERROR:root:Request failed: Unexpected message type: 'content='You are an AI assistant with expertise in techni


Running Basic Baseline Test...

Running System Prompt Impact Test...

Running Short Variable Length Test...

Running Long Variable Length Test...

Running Chain-of-Thought Test...

Running Temperature Sweep Test...


ERROR:root:Request failed: Unexpected message type: 'content='You are an AI assistant with expertise in technical topics.\n    Always provide detailed, well-structured explanations with examples.\n    Break down complex concepts into simpler terms.' additional_kwargs={} response_metadata={}'. Use one of 'human', 'user', 'ai', 'assistant', 'function', 'tool', or 'system'.
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/MESSAGE_COERCION_FAILURE 
                    top_p was transferred to model_kwargs.
                    Please confirm that top_p is what you intended.
ERROR:root:Request failed: Unexpected message type: 'content='You are an AI assistant with expertise in technical topics.\n    Always provide detailed, well-structured explanations with examples.\n    Break down complex concepts into simpler terms.' additional_kwargs={} response_metadata={}'. Use one of 'human', 'user', 'ai', 'assistant', 'function', 'tool', or 'system'.
For troublesho


Running Light Concurrent Load Test...

Running Heavy Concurrent Load Test...

Running Burst Load Test...

Running Mixed Workload Test...

Test Summary:
Empty DataFrame
Columns: []
Index: [baseline, no_system_prompt, with_system_prompt, short_variable, long_variable, reasoning_comparison, temp_0.0, temp_0.3, temp_0.7, temp_1.0, light_concurrent, heavy_concurrent, burst, mixed_workload]


## Performance Metrics Overview

### Time to First Token (TTFT)
* Range: 0.15-0.42 seconds
* Notable patterns:
  * Baseline test: Highest at 0.42s
  * Concurrent loads: Lowest at 0.15-0.16s
  * System prompt presence: Minimal impact

### Average Latency
* Range: 2.35-20.00 seconds
* Key observations:
  * Temperature tests: Consistent at ~19.5-20.0s
  * Burst test: Exceptionally low at 2.35s
  * System prompt impact: Increases from 14.21s to 17.40s

### Throughput Characteristics
* Average throughput: Consistent at 26-34 tokens/sec
* Peak throughput variations:
  * Highest: Temperature 1.0 (4,593 tokens/sec)
  * Lowest: Temperature 0.0 (62.88 tokens/sec)
  * System prompt effect: Increases from 117 to 2,438 tokens/sec

### Token Generation
* Heavy concurrent load: 5,711 tokens
* Short variable length test: 3,238 tokens
* Baseline tests: ~280-300 tokens
* Temperature variations: Consistent at ~585-632 tokens

## Key Performance Insights

### System Prompt Impact
* Token generation: Increases from 448 to 605
* Latency: Increases from 14.21s to 17.40s
* Peak throughput: Dramatic improvement from 117 to 2,438 tokens/sec

### Concurrency Performance
* Excellent scaling with concurrent requests
* Consistent TTFT under load
* Efficient token generation in heavy concurrent scenarios

### Temperature Effects
* Average throughput: Minimal impact
* Peak throughput: Significantly higher at higher temperatures
* Token generation: Consistent across temperature settings

### Load Pattern Response
* Burst handling: Highly efficient (2.35s latency)
* Mixed workload: Well-balanced performance
* Variable length prompts: Good handling without degradation
