<a href="https://colab.research.google.com/github/wesslen/llm-experiments/blob/main/notebooks/loadtest/openai_compatible_endpoint_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from google.colab import userdata
# Credentials for the endpoint under test, looked up by name through
# google.colab's `userdata` helper (presumably the notebook's user secrets —
# confirm both names exist in the Colab session before running).
# `api_key` is later passed to TestConfig as `api_key`.
api_key = userdata.get('DSBA_LLAMA3_KEY')
# `custom_base_url` is later passed to TestConfig as `base_url`.
custom_base_url = userdata.get('MODAL_BASE_URL')

In [None]:
!pip install langchain<=0.3.0 langchain-community

In [None]:
# Cell 1: Imports and Setup
import os
import time
import statistics
from dataclasses import dataclass
from typing import List, Optional, Literal, Union
import numpy as np
import json
import logging
from concurrent.futures import ThreadPoolExecutor
import httpx
from datetime import datetime

# For LangChain 0.2.0
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
import openai

In [None]:
@dataclass
class TestConfig:
    """Settings for one LoadTester instance targeting a chat endpoint."""

    # Model identifier sent with each request.
    model_name: str
    # Base URL of the endpoint; LoadTester hands it to the selected client.
    base_url: str
    # API key used to authenticate against the endpoint.
    api_key: str
    # Selects which client implementation LoadTester drives.
    client_type: Literal["requests", "openai", "langchain"]
    # Optional system prompt placed ahead of the user prompt.
    system_prompt: Optional[str] = None
    # Sampling temperature.
    temperature: float = 0.7
    # Nucleus-sampling parameter (presumably the usual `top_p` request field —
    # confirm how LoadTester forwards it).
    top_p: float = 1.0
    # Upper bound on generated tokens per request.
    max_tokens: int = 1000
    # NOTE(review): presumably toggles TLS certificate verification — confirm
    # how LoadTester consumes it.
    verify_ssl: bool = True
    # When set, LoadTester creates this directory at construction time.
    output_path: Optional[str] = None

# Cell 3: LoadTester Class
class LoadTester:
    """Run latency and throughput tests against an OpenAI-compatible chat endpoint.

    The transport is selected by ``config.client_type``:

    * ``"openai"``    - the legacy ``openai.ChatCompletion`` streaming call.
    * ``"langchain"`` - a LangChain ``ChatOpenAI`` client with streaming enabled.
    * ``"requests"``  - a raw streaming HTTP POST (via ``httpx``) to
      ``{base_url}/chat/completions``.

    Every path streams the response so that time-to-first-chunk and
    per-chunk arrival times can be recorded. Output token counts are an
    approximation (``len(text) // 4``), not a real tokenizer count.

    ``top_p`` is forwarded on the ``"openai"`` and ``"requests"`` paths only;
    ``verify_ssl`` is honoured on the ``"requests"`` path only.
    """

    # Overall timeout (seconds) for one HTTP request on the "requests" path.
    # Generous on purpose: generation under load can take far longer than
    # httpx's default timeout.
    REQUEST_TIMEOUT_SECONDS = 600.0

    # The annotation is quoted so the class does not need TestConfig to be
    # defined at class-creation time.
    def __init__(self, config: "TestConfig"):
        """Store the configuration, set up the client and the output directory.

        Args:
            config: Test settings (model, endpoint, credentials, client type).

        Raises:
            ValueError: If ``config.client_type`` is not a supported value.
        """
        self.config = config
        self.setup_client()
        self.results = []
        if self.config.output_path:
            os.makedirs(self.config.output_path, exist_ok=True)

    def setup_client(self):
        """Prepare whatever state the selected client type needs.

        Raises:
            ValueError: If ``config.client_type`` is not a supported value.
        """
        self.client = None
        client_type = self.config.client_type
        if client_type == "openai":
            # The legacy SDK is configured through module-level globals.
            openai.api_key = self.config.api_key
            openai.api_base = self.config.base_url
        elif client_type == "langchain":
            self.client = ChatOpenAI(
                model_name=self.config.model_name,
                openai_api_key=self.config.api_key,
                openai_api_base=self.config.base_url,
                temperature=self.config.temperature,
                max_tokens=self.config.max_tokens,
                streaming=True
            )
        elif client_type == "requests":
            # Nothing to prepare: each request opens (and closes) its own
            # httpx.Client, so no connection is left dangling.
            pass
        else:
            raise ValueError(f"Unsupported client_type: {client_type!r}")

    def count_tokens(self, text: str) -> int:
        """Return a rough token estimate of ``text`` (four characters per token)."""
        return len(text) // 4  # Simple approximation

    def _chat_messages(self, prompt: str) -> List[dict]:
        """Build the role/content message list used by the dict-based clients."""
        return [
            {"role": "system", "content": self.config.system_prompt or ""},
            {"role": "user", "content": prompt}
        ]

    @staticmethod
    def _delta_text(chunk) -> str:
        """Extract the text of a streamed chat-completion chunk.

        Returns ``""`` when the chunk has no choices, no delta, or a missing
        or ``null`` content field, so callers never concatenate ``None``.
        """
        choices = chunk.get("choices") or []
        if not choices:
            return ""
        delta = choices[0].get("delta") or {}
        return delta.get("content") or ""

    @staticmethod
    def _sse_payload(line: str) -> Optional[str]:
        """Return the payload of a server-sent-event ``data:`` line, else ``None``."""
        if not line.startswith("data:"):
            return None
        return line[len("data:"):].strip()

    def _stream_openai(self, prompt: str):
        """Yield the text of each chunk streamed by the legacy OpenAI SDK."""
        response = openai.ChatCompletion.create(
            model=self.config.model_name,
            messages=self._chat_messages(prompt),
            stream=True,
            temperature=self.config.temperature,
            top_p=self.config.top_p,
            max_tokens=self.config.max_tokens
        )
        for chunk in response:
            yield self._delta_text(chunk)

    def _stream_langchain(self, prompt: str):
        """Yield the text of each chunk streamed by the LangChain client."""
        messages = []
        if self.config.system_prompt:
            messages.append(SystemMessage(content=self.config.system_prompt))
        messages.append(HumanMessage(content=prompt))
        for chunk in self.client.stream(messages):
            yield chunk.content or ""

    def _stream_requests(self, prompt: str):
        """Yield the text of each chunk from a raw streaming HTTP request."""
        url = f"{self.config.base_url.rstrip('/')}/chat/completions"
        headers = {"Authorization": f"Bearer {self.config.api_key}"}
        payload = {
            "model": self.config.model_name,
            "messages": self._chat_messages(prompt),
            "stream": True,
            "temperature": self.config.temperature,
            "top_p": self.config.top_p,
            "max_tokens": self.config.max_tokens
        }
        with httpx.Client(
            verify=self.config.verify_ssl,
            timeout=self.REQUEST_TIMEOUT_SECONDS
        ) as http:
            with http.stream("POST", url, headers=headers, json=payload) as response:
                response.raise_for_status()
                for line in response.iter_lines():
                    data = self._sse_payload(line)
                    if not data:
                        continue  # blank keep-alive or non-data line
                    if data == "[DONE]":
                        break
                    yield self._delta_text(json.loads(data))

    def _open_stream(self, prompt: str):
        """Return an iterator of text pieces for the configured client type.

        Raises:
            ValueError: If ``config.client_type`` is not a supported value.
        """
        client_type = self.config.client_type
        if client_type == "openai":
            return self._stream_openai(prompt)
        if client_type == "langchain":
            return self._stream_langchain(prompt)
        if client_type == "requests":
            return self._stream_requests(prompt)
        raise ValueError(f"Unsupported client_type: {client_type!r}")

    def _summarize(self, prompt: str, completion: str, token_times: List[float],
                   start_time: float, first_token_time: Optional[float],
                   end_time: float) -> dict:
        """Turn the raw timings of one successful request into a result record.

        ``token_times`` holds the arrival time of every non-empty chunk.
        Peak/min throughput are derived from the gaps between consecutive
        chunks (i.e. chunks per second); gaps of zero are ignored so two
        chunks with the same timestamp cannot produce an infinite rate.
        """
        total_latency = end_time - start_time
        # Without any streamed chunk, fall back to the full latency.
        ttft = first_token_time - start_time if first_token_time else total_latency

        output_tokens = self.count_tokens(completion)
        avg_throughput = output_tokens / total_latency if total_latency > 0 else 0

        instant_throughputs = [avg_throughput]
        if len(token_times) > 1:
            intervals = np.diff(token_times)
            positive = intervals[intervals > 0]
            if positive.size:
                instant_throughputs = [float(rate) for rate in 1.0 / positive]

        return {
            "success": True,
            "total_latency": total_latency,
            "time_to_first_token": ttft,
            "avg_throughput": avg_throughput,
            "peak_throughput": max(instant_throughputs),
            "min_throughput": min(instant_throughputs),
            "prompt_length": len(prompt),
            "output_tokens": output_tokens,
            "completion": completion
        }

    def _make_request(self, prompt: str) -> dict:
        """Send one streamed request and return its timing record.

        Never raises: any failure is logged and returned as a record with
        ``"success": False`` so a single bad request cannot abort a test run.
        """
        start_time = time.time()
        first_token_time = None
        pieces = []
        token_times = []
        try:
            for text in self._open_stream(prompt):
                now = time.time()
                # The first chunk counts even when it carries no text.
                if first_token_time is None:
                    first_token_time = now
                if text:
                    pieces.append(text)
                    token_times.append(now)
            end_time = time.time()
            return self._summarize(
                prompt, "".join(pieces), token_times,
                start_time, first_token_time, end_time
            )
        except Exception as e:
            # Deliberately broad: this is the boundary where request failures
            # are converted into data for the report.
            logging.error(f"Request failed: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "prompt_length": len(prompt),
                "output_tokens": 0,
                "throughput": 0
            }

    @staticmethod
    def _percentile(values: List[float], q: float) -> float:
        """Return the ``q``-th percentile of ``values`` as a plain float."""
        return float(np.percentile(values, q))

    def analyze_results(self, results: List[dict]) -> dict:
        """Aggregate per-request records into summary statistics.

        Args:
            results: Records as produced by ``_make_request``.

        Returns:
            A dict with request counts plus latency, time-to-first-token,
            throughput and output-token statistics over the successful
            requests. When there is nothing to aggregate, a dict with an
            ``"error"`` message and the request counts is returned instead.
        """
        if not results:
            return {
                "error": "No requests were made",
                "total_requests": 0,
                "successful_requests": 0,
                "failed_requests": 0
            }

        successful_requests = [r for r in results if r["success"]]
        failed_count = len(results) - len(successful_requests)

        if not successful_requests:
            return {
                "error": "All requests failed",
                "total_requests": len(results),
                "successful_requests": 0,
                "failed_requests": failed_count
            }

        latencies = [r["total_latency"] for r in successful_requests]
        ttft = [r["time_to_first_token"] for r in successful_requests]
        avg_throughputs = [r["avg_throughput"] for r in successful_requests]
        peak_throughputs = [r["peak_throughput"] for r in successful_requests]
        min_throughputs = [r.get("min_throughput", r["avg_throughput"]) for r in successful_requests]
        output_tokens = [r["output_tokens"] for r in successful_requests]

        return {
            "total_requests": len(results),
            "successful_requests": len(successful_requests),
            "failed_requests": failed_count,
            "latency": {
                "avg": statistics.mean(latencies),
                "p50": self._percentile(latencies, 50),
                "p95": self._percentile(latencies, 95),
                "p99": self._percentile(latencies, 99),
                "min": min(latencies),
                "max": max(latencies)
            },
            "time_to_first_token": {
                "avg": statistics.mean(ttft),
                "p50": self._percentile(ttft, 50),
                "p95": self._percentile(ttft, 95),
                "p99": self._percentile(ttft, 99),
            },
            "throughput": {
                "avg": statistics.mean(avg_throughputs),
                "peak": statistics.mean(peak_throughputs),
                "min": statistics.mean(min_throughputs),
            },
            "output_tokens": {
                "total": sum(output_tokens),
                "avg": statistics.mean(output_tokens),
                "p50": self._percentile(output_tokens, 50),
                "p95": self._percentile(output_tokens, 95),
            }
        }

    def run_latency_test(self, prompts: List[str], concurrency: int = 1):
        """Send every prompt once, ``concurrency`` at a time, and summarize.

        Args:
            prompts: Prompts to send; one request per prompt.
            concurrency: Number of worker threads issuing requests.

        Returns:
            The summary produced by ``analyze_results``.
        """
        with ThreadPoolExecutor(max_workers=concurrency) as executor:
            results = list(executor.map(self._make_request, prompts))
        return self.analyze_results(results)

    def run_sustained_load_test(self, prompt: str, requests_per_second: float, duration_seconds: int):
        """Send ``prompt`` repeatedly at a target rate for a fixed duration.

        Requests are issued one after another, so the achieved rate cannot
        exceed one request per request-latency regardless of the target.

        Args:
            prompt: Prompt sent with every request.
            requests_per_second: Target request rate; must be positive.
            duration_seconds: How long to keep issuing requests.

        Returns:
            The summary produced by ``analyze_results``.

        Raises:
            ValueError: If ``requests_per_second`` is not positive.
        """
        if requests_per_second <= 0:
            raise ValueError("requests_per_second must be positive")

        interval = 1 / requests_per_second
        start_time = time.time()
        results = []

        while time.time() - start_time < duration_seconds:
            before_request = time.time()
            results.append(self._make_request(prompt))

            # Sleep off whatever remains of this request's time slot.
            elapsed = time.time() - before_request
            time.sleep(max(0, interval - elapsed))

        return self.analyze_results(results)

## OpenAI

In [None]:
# Single-prompt smoke test routed through the "openai" client path.
config = TestConfig(
    client_type="openai",
    model_name="/models/NousResearch/Meta-Llama-3.1-8B-Instruct",
    base_url=custom_base_url,
    api_key=api_key,
    system_prompt="You are a helpful AI assistant.",
    output_path="./test_results",
    verify_ssl=False,
)
tester = LoadTester(config)

# One request at a time; the aggregated statistics are printed as JSON.
results = tester.run_latency_test(["What is artificial intelligence?"], concurrency=1)
print(json.dumps(results, indent=2))

## Requests

In [None]:
# Single-prompt smoke test routed through the "requests" client path.
config = TestConfig(
    client_type="requests",
    model_name="/models/NousResearch/Meta-Llama-3.1-8B-Instruct",
    base_url=custom_base_url,
    api_key=api_key,
    system_prompt="You are a helpful AI assistant.",
    output_path="./test_results",
    verify_ssl=False,
)
tester = LoadTester(config)

# One request at a time; the aggregated statistics are printed as JSON.
results = tester.run_latency_test(["What is artificial intelligence?"], concurrency=1)
print(json.dumps(results, indent=2))

## LangChain

In [None]:
# Single-prompt smoke test routed through the "langchain" client path.
config = TestConfig(
    client_type="langchain",
    model_name="/models/NousResearch/Meta-Llama-3.1-8B-Instruct",
    base_url=custom_base_url,
    api_key=api_key,
    system_prompt="You are a helpful AI assistant.",
    output_path="./test_results",
    verify_ssl=False,
)
tester = LoadTester(config)

# One request at a time; the aggregated statistics are printed as JSON.
results = tester.run_latency_test(["What is artificial intelligence?"], concurrency=1)
print(json.dumps(results, indent=2))