In [1]:
import ollama
import time
import json

# Define the models and settings to benchmark
models = ["qwen2.5-coder:3b", "qwen2.5-coder:7b-instruct-q4_0"]  #
settings = [
    {"num_threads": 1, "temperature": 0.0, "top_p": 0.95, "num_ctx": 24000},
    {"num_threads": 2, "temperature": 0.0, "top_p": 0.95, "num_ctx": 24000},
    {"num_threads": 4, "temperature": 0.0, "top_p": 0.95, "num_ctx": 24000},
    {"num_threads": 8, "temperature": 0.0, "top_p": 0.95, "num_ctx": 24000},
    {"num_threads": 1, "temperature": 0.0, "top_p": 0.95, "num_ctx": 16000},
    {"num_threads": 2, "temperature": 0.0, "top_p": 0.95, "num_ctx": 16000},
    {"num_threads": 4, "temperature": 0.0, "top_p": 0.95, "num_ctx": 16000},
    {"num_threads": 8, "temperature": 0.0, "top_p": 0.95, "num_ctx": 16000}
]

# Test prompt
prompt = "What is the meaning of life? Write 3 paragraphs on it."

results = []

for model in models:
    for setting in settings:
        start_time = time.time()
        
        try:
            # Prepare the message for the model
            messages = [{"role": "user", "content": prompt}]
            
            # Make the API call with the current settings
            response = ollama.chat(
                model=model, 
                messages=messages, 
                options=setting
            )
            
            # Calculate the time taken for the response
            end_time = time.time()
            elapsed_time = end_time - start_time
            
            # Store the results
            results.append({
                "model": model,
                "settings": setting,
                "time_taken": elapsed_time,
                "output_length": len(response.message.content)  # Optional: measure response length
            })
            print(f"Model: {model}, Settings: {json.dumps(setting)}, Time: {elapsed_time:.2f}s")
        except Exception as e:
            print(f"Error with model {model} and settings {json.dumps(setting)}: {str(e)}")

# Output results
print("\nBenchmark Results:")
for result in results:
    print(f"Model: {result['model']}")
    print(f"  Settings: {json.dumps(result['settings'])}")
    print(f"  Time Taken: {result['time_taken']:.2f} seconds")
    print(f"  Output Length: {result['output_length']}")
    print()

# Optionally, you could write these results to a CSV or JSON file for further analysis

Model: qwen2.5-coder:3b, Settings: {"num_threads": 1, "temperature": 0.0, "top_p": 0.95, "num_ctx": 24000}, Time: 25.96s
Model: qwen2.5-coder:3b, Settings: {"num_threads": 2, "temperature": 0.0, "top_p": 0.95, "num_ctx": 24000}, Time: 21.04s
Model: qwen2.5-coder:3b, Settings: {"num_threads": 4, "temperature": 0.0, "top_p": 0.95, "num_ctx": 24000}, Time: 25.52s
Model: qwen2.5-coder:3b, Settings: {"num_threads": 8, "temperature": 0.0, "top_p": 0.95, "num_ctx": 24000}, Time: 19.53s
Model: qwen2.5-coder:3b, Settings: {"num_threads": 1, "temperature": 0.0, "top_p": 0.95, "num_ctx": 16000}, Time: 16.23s
Model: qwen2.5-coder:3b, Settings: {"num_threads": 2, "temperature": 0.0, "top_p": 0.95, "num_ctx": 16000}, Time: 12.75s
Model: qwen2.5-coder:3b, Settings: {"num_threads": 4, "temperature": 0.0, "top_p": 0.95, "num_ctx": 16000}, Time: 12.70s
Model: qwen2.5-coder:3b, Settings: {"num_threads": 8, "temperature": 0.0, "top_p": 0.95, "num_ctx": 16000}, Time: 14.00s
Model: qwen2.5-coder:7b-instruct