In [None]:
import datetime
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Optional

import pandas as pd
import requests
from tqdm import tqdm

In [None]:
# CSV holding the sampled prompts; it is read with pandas when the requests are built.
SAMPLE_OUTPUT_FILEPATH = "../data/50_data_points_from_alpaca.csv"
# Model identifier placed in the "model" field of every request payload.
MODEL = "qwen3:4b"
# Chat endpoint every request is POSTed to.
# NOTE(review): host/port look like a local Ollama server — confirm.
URL = "http://localhost:11434/api/chat"

In [None]:
# Load the sampled prompts and wrap each row's `instruction` in a
# single-turn, non-streaming chat payload.
df_inference = pd.read_csv(SAMPLE_OUTPUT_FILEPATH)
inference_requests = [
    dict(
        model=MODEL,
        stream=False,
        messages=[dict(role="user", content=row.instruction)],
        options=dict(
            temperature=0.0,
            top_k=20,  # Added to align with vLLM default settings
            top_p=0.95,  # Added to align with vLLM default settings
        ),
    )
    for row in df_inference.itertuples()
]

In [None]:
def send_requests(
    inference_requests: list[dict],
    url: str,
    max_workers: Optional[int] = None,
    timeout: Optional[float] = None,
) -> list[dict]:
    """POST every payload to ``url`` concurrently and collect the timed responses.

    Args:
        inference_requests: JSON-serialisable payloads, one per POST request.
        url: Endpoint each payload is POSTed to.
        max_workers: Thread-pool size, i.e. the maximum number of requests in
            flight at once. ``None`` keeps ``ThreadPoolExecutor``'s default.
        timeout: Per-request timeout in seconds, forwarded to
            ``requests.post``. ``None`` waits indefinitely.

    Returns:
        One decoded JSON response per request, in completion order (not
        submission order). Each dict additionally carries ``start_time`` and
        ``end_time`` (``datetime.datetime``) recorded immediately around the
        POST call.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """

    def _make_request(request_data: dict) -> dict[str, Any]:
        """Send one payload and return its JSON body annotated with timestamps."""
        start_time = datetime.datetime.now()
        response = requests.post(url, json=request_data, timeout=timeout)
        end_time = datetime.datetime.now()

        # Fail loudly on HTTP errors instead of letting an error body be
        # counted as a successful generation further down the pipeline.
        response.raise_for_status()

        data = response.json()
        data["start_time"] = start_time
        data["end_time"] = end_time
        return data

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submission returns immediately, so only completion is worth tracking.
        futures = [executor.submit(_make_request, req) for req in inference_requests]
        # `future.result()` re-raises any exception raised inside the worker.
        return [future.result() for future in tqdm(as_completed(futures), total=len(futures))]

In [None]:
# Randomise the submission order so it does not simply follow the CSV row order.
# A dedicated seeded RNG gives the same order on every run, and `sample`
# returns a new list, so re-running this cell never reshuffles an
# already-shuffled `inference_requests`.
SHUFFLE_SEED = 42
shuffled_requests = random.Random(SHUFFLE_SEED).sample(inference_requests, k=len(inference_requests))
results = send_requests(shuffled_requests, url=URL)

In [None]:
def filter_out_outliers(results: list[dict]) -> list[dict]:
    """Drop the requests whose runtime lies outside the 5th-95th percentile band.

    Args:
        results: Response dicts carrying ``start_time`` and ``end_time``
            ``datetime`` values.

    Returns:
        The entries of ``results`` (original order preserved) whose runtime in
        seconds is between the 0.05 and 0.95 quantiles, bounds included.
    """
    durations = []
    for entry in results:
        elapsed = entry["end_time"] - entry["start_time"]
        durations.append(elapsed.total_seconds())

    duration_series = pd.Series(durations)
    fastest_allowed = duration_series.quantile(0.05)
    slowest_allowed = duration_series.quantile(0.95)

    kept = []
    for entry, seconds in zip(results, durations):
        if fastest_allowed <= seconds <= slowest_allowed:
            kept.append(entry)
    return kept


filtered_results = filter_out_outliers(results)
if not filtered_results:
    raise ValueError("No results left after outlier filtering; cannot compute throughput.")

# Wall-clock window spanned by the retained requests: earliest start to latest end.
first_start = min(result["start_time"] for result in filtered_results)
last_end = max(result["end_time"] for result in filtered_results)
run_time = (last_end - first_start).total_seconds()

# Ollama's native /api/chat response reports generated tokens as `eval_count`
# and has no `usage` object; OpenAI-compatible endpoints report
# `usage.completion_tokens`. Accept either so the cell works with both.
n_tokens = sum(
    result["usage"]["completion_tokens"] if "usage" in result else result["eval_count"]
    for result in filtered_results
)

In [None]:
# Print throughput metrics
# Both throughput figures use the outlier-filtered results, so the numerators
# cover the same requests as the `run_time` window they are divided by.
print(f"Total run time: {run_time:.2f} seconds")
print(f"Total tokens generated: {n_tokens}")
print(f"Throughput (tokens/sec): {n_tokens / run_time:.2f}")
print(f"Throughput (requests/sec): {len(filtered_results) / run_time:.3f}")