# LLM Cost & Token Efficiency Analyzer
> A data-driven benchmarking notebook comparing **all major LLM providers** across cost, latency, and accuracy.

---

**Providers Covered:**
| Tier | Provider | Models |
|------|----------|--------|
| Free | **Groq** | llama-3.1-8b-instant, llama-3.3-70b-versatile, llama-4-scout-17b, qwen3-32b |
| Free | **Google Gemini** | gemini-1.5-flash, gemini-2.0-flash-exp |
| Free | **Cerebras** | llama3.1-8b, llama3.3-70b |
| Paid | **OpenAI** | gpt-4o, gpt-4o-mini, gpt-3.5-turbo |
| Paid | **Anthropic** | claude-3-5-sonnet, claude-3-haiku |
| Paid | **Gemini (paid)** | gemini-1.5-pro |

**Sections:**
1. Setup & Configuration
2. Benchmark Dataset
3. Model Runner Function
4. Run Experiments
5. Visualization & Analysis
6. Summary & Recommendations
7. RAG Chunk Size vs Cost Experiment

> **Currently active tier:** Set `ACTIVE_TIER = "free"` to run with free API keys.
> Switch to `"paid"` or `"all"` when you have paid-tier keys.

## Section 1 — Setup & Configuration

In [None]:
# Install all dependencies (run once if not already installed)
# !pip install groq google-generativeai cerebras-cloud-sdk openai anthropic \
#             pandas numpy matplotlib seaborn tiktoken python-dotenv tenacity

# Or install from requirements.txt:
# !pip install -r requirements.txt

In [None]:
import os
import time
import json
import random
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from datetime import datetime
from collections import defaultdict

warnings.filterwarnings('ignore')

# ── Plot styling ─────────────────────────────────────────────────────────────
# Dark theme applied globally through matplotlib's rcParams, grouped by concern.

# Figure and axes colours
plt.rcParams.update({
    "figure.facecolor": "#0f0f1a",
    "axes.facecolor":   "#1a1a2e",
    "axes.edgecolor":   "#444466",
    "axes.labelcolor":  "#c8c8e8",
    "axes.titlecolor":  "#ffffff",
})

# Tick and text colours
plt.rcParams.update({
    "xtick.color": "#c8c8e8",
    "ytick.color": "#c8c8e8",
    "text.color":  "#c8c8e8",
})

# Grid appearance and fonts
plt.rcParams.update({
    "grid.color":     "#2a2a4a",
    "grid.linestyle": "--",
    "grid.alpha":     0.6,
    "font.family":    "DejaVu Sans",
    "font.size":      11,
})

# Colour cycle used by the charts (14 entries; charts index it modulo its length)
PALETTE = [
    "#7c5cbf",
    "#3aa8c1",
    "#e84393",
    "#f5a623",
    "#50fa7b",
    "#ff6b6b",
    "#bd93f9",
    "#8be9fd",
    "#ff79c6",
    "#ffb86c",
    "#6272a4",
    "#44475a",
    "#f1fa8c",
    "#00d4a0",
]

# One brand colour per provider key, for annotations
PROVIDER_COLORS = dict(
    groq="#f84f1d",
    gemini="#4285f4",
    cerebras="#8c52ff",
    openai="#10a37f",
    anthropic="#cc785c",
)

print("Libraries loaded successfully")
print(f"Run date: {datetime.now():%Y-%m-%d %H:%M:%S}")

In [None]:
# ── Load API keys from .env ──────────────────────────────────────────────────
from dotenv import load_dotenv
load_dotenv(override=True)

# ── FREE TIER keys (get these for free) ──────────────────────────────────────
GROQ_API_KEY      = os.getenv("GROQ_API_KEY",      "")   # https://console.groq.com/keys
GEMINI_API_KEY    = os.getenv("GEMINI_API_KEY",    "")   # https://aistudio.google.com/app/apikey
CEREBRAS_API_KEY  = os.getenv("CEREBRAS_API_KEY",  "")   # https://cloud.cerebras.ai

# ── PAID TIER keys (add when you have them) ──────────────────────────────────
OPENAI_API_KEY    = os.getenv("OPENAI_API_KEY",    "")   # https://platform.openai.com/api-keys
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")   # https://console.anthropic.com/settings/keys

# ── Tier selection ───────────────────────────────────────────────────────────
# "free"  -> only free-tier providers: Groq, Gemini (free), Cerebras
# "paid"  -> only paid providers: OpenAI, Anthropic, Gemini Pro
# "all"   -> run every configured model
# The value is normalised (whitespace/case) and validated here: an unrecognised
# value would otherwise fall through to the "all" branch of the tier filter and
# run paid models unintentionally.
_VALID_TIERS = ("free", "paid", "all")
ACTIVE_TIER = os.getenv("ACTIVE_TIER", "free").strip().lower()
if ACTIVE_TIER not in _VALID_TIERS:
    raise ValueError(
        f"Unknown ACTIVE_TIER {ACTIVE_TIER!r}; expected one of {', '.join(_VALID_TIERS)}"
    )

# ── Demo mode ────────────────────────────────────────────────────────────────
# True  -> simulate API responses (no keys needed, no usage)
# False -> call real APIs using the keys above
# Common truthy spellings are accepted so that e.g. DEMO_MODE=1 does not
# silently switch the notebook to live (billable) calls.
DEMO_MODE = os.getenv("DEMO_MODE", "true").strip().lower() in {"true", "1", "yes", "on"}

# ── Status report ────────────────────────────────────────────────────────────
print(f"Mode        : {'DEMO (simulated)' if DEMO_MODE else 'LIVE (real API)'}")
print(f"Active tier : {ACTIVE_TIER.upper()}")
print()
print("Key status:")
print(f"  GROQ_API_KEY      : {'set' if GROQ_API_KEY     else 'missing'}")
print(f"  GEMINI_API_KEY    : {'set' if GEMINI_API_KEY   else 'missing'}")
print(f"  CEREBRAS_API_KEY  : {'set' if CEREBRAS_API_KEY else 'missing'}")
print(f"  OPENAI_API_KEY    : {'set' if OPENAI_API_KEY   else 'missing'}")
print(f"  ANTHROPIC_API_KEY : {'set' if ANTHROPIC_API_KEY else 'missing'}")
if not DEMO_MODE:
    # Keys only matter for live runs; warn per tier so the user knows which
    # models the experiment cell will drop.
    free_keys = {"GROQ": GROQ_API_KEY, "GEMINI": GEMINI_API_KEY, "CEREBRAS": CEREBRAS_API_KEY}
    # The paid Gemini Pro entry uses the same GEMINI key as the free Gemini models.
    paid_keys = {"OPENAI": OPENAI_API_KEY, "ANTHROPIC": ANTHROPIC_API_KEY, "GEMINI": GEMINI_API_KEY}
    missing_free = [name for name, value in free_keys.items() if not value]
    missing_paid = [name for name, value in paid_keys.items() if not value]
    if ACTIVE_TIER in ("free", "all") and missing_free:
        print(f"\nWARNING: Missing free-tier keys: {missing_free}. Those models will be skipped.")
    if ACTIVE_TIER in ("paid", "all") and missing_paid:
        print(f"\nWARNING: Missing paid-tier keys: {missing_paid}. Those models will be skipped.")

In [None]:
# ── Model Registry ───────────────────────────────────────────────────────────
# One entry per benchmarked model, keyed by the display name used in tables
# and charts. Each entry holds:
# provider  : which SDK/API to call
# api_id    : exact model ID string sent to the API
# tier      : "free" or "paid"
# input/output : cost per token in USD (free-tier = $0.0)
# Paid prices are written as <value>/1e6, which reads as a USD-per-million-token
# price converted to per-token — TODO confirm against current provider pricing.

MODEL_REGISTRY = {
    # ── FREE TIER: Groq (hardware-accelerated inference) ─────────────────────
    "llama-3.1-8b [Groq]":     {"provider": "groq",     "api_id": "llama-3.1-8b-instant",                    "tier": "free", "input": 0.0,        "output": 0.0},
    "llama-3.3-70b [Groq]":    {"provider": "groq",     "api_id": "llama-3.3-70b-versatile",                 "tier": "free", "input": 0.0,        "output": 0.0},
    "llama-4-scout [Groq]":    {"provider": "groq",     "api_id": "meta-llama/llama-4-scout-17b-16e-instruct","tier": "free", "input": 0.0,        "output": 0.0},
    "qwen3-32b [Groq]":        {"provider": "groq",     "api_id": "qwen/qwen3-32b",                          "tier": "free", "input": 0.0,        "output": 0.0},
    # ── FREE TIER: Google Gemini ─────────────────────────────────────────────
    "gemini-1.5-flash":        {"provider": "gemini",   "api_id": "gemini-1.5-flash",        "tier": "free", "input": 0.0,        "output": 0.0},
    "gemini-2.0-flash":        {"provider": "gemini",   "api_id": "gemini-2.0-flash-exp",    "tier": "free", "input": 0.0,        "output": 0.0},
    # ── FREE TIER: Cerebras (wafer-scale chip, extremely fast) ───────────────
    "llama3.1-8b [Cerebras]":  {"provider": "cerebras", "api_id": "llama3.1-8b",             "tier": "free", "input": 0.0,        "output": 0.0},
    "llama3.3-70b [Cerebras]": {"provider": "cerebras", "api_id": "llama3.3-70b",            "tier": "free", "input": 0.0,        "output": 0.0},
    # ── PAID TIER: OpenAI ────────────────────────────────────────────────────
    "gpt-4o":                  {"provider": "openai",   "api_id": "gpt-4o",                  "tier": "paid", "input": 5.0/1e6,    "output": 15.0/1e6},
    "gpt-4o-mini":             {"provider": "openai",   "api_id": "gpt-4o-mini",             "tier": "paid", "input": 0.15/1e6,   "output": 0.6/1e6},
    "gpt-3.5-turbo":           {"provider": "openai",   "api_id": "gpt-3.5-turbo",           "tier": "paid", "input": 0.5/1e6,    "output": 1.5/1e6},
    # ── PAID TIER: Anthropic ─────────────────────────────────────────────────
    "claude-3-5-sonnet":       {"provider": "anthropic","api_id": "claude-3-5-sonnet-20241022","tier": "paid","input": 3.0/1e6,   "output": 15.0/1e6},
    "claude-3-haiku":          {"provider": "anthropic","api_id": "claude-3-haiku-20240307",  "tier": "paid", "input": 0.25/1e6,  "output": 1.25/1e6},
    # ── PAID TIER: Gemini Pro ────────────────────────────────────────────────
    "gemini-1.5-pro":          {"provider": "gemini",   "api_id": "gemini-1.5-pro",          "tier": "paid", "input": 1.25/1e6,   "output": 5.0/1e6},
}

# Backward-compat dict used by calculate_cost(): display name -> per-token
# input/output prices copied from the registry.
MODEL_PRICING = {name: {"input": v["input"], "output": v["output"]} for name, v in MODEL_REGISTRY.items()}

# ── Task types ───────────────────────────────────────────────────────────────
TASK_TYPES = ["summarization", "qa", "rag", "classification", "code_generation"]

# ── Tier filtering ───────────────────────────────────────────────────────────
# Model names per tier, in registry order.
FREE_MODELS = [name for name, spec in MODEL_REGISTRY.items() if spec["tier"] == "free"]
PAID_MODELS = [name for name, spec in MODEL_REGISTRY.items() if spec["tier"] == "paid"]
ALL_MODELS  = list(MODEL_REGISTRY)

# Resolve the active tier through an explicit table instead of an if/else chain
# with a catch-all: a misspelt or differently-cased tier must not silently
# select every model (which would include the paid ones).
_TIER_TO_MODELS = {"free": FREE_MODELS, "paid": PAID_MODELS, "all": ALL_MODELS}
_tier_key = ACTIVE_TIER.strip().lower()
if _tier_key not in _TIER_TO_MODELS:
    raise ValueError(
        f"Unknown ACTIVE_TIER {ACTIVE_TIER!r}; expected one of: free, paid, all"
    )
MODELS_TO_TEST = _TIER_TO_MODELS[_tier_key]

print("Configuration complete")
print(f"Total models in registry : {len(MODEL_REGISTRY)}  ({len(FREE_MODELS)} free, {len(PAID_MODELS)} paid)")
print(f"Active tier              : {ACTIVE_TIER.upper()} → {len(MODELS_TO_TEST)} models will run")
print(f"Task types               : {', '.join(TASK_TYPES)}")
print()
print()

# Display registry table: one row per model, prices shown per 1K tokens
# (per-token price * 1000, rounded to 6 decimals).
reg_rows = [
    {
        "model": model_name,
        "provider": spec["provider"],
        "tier": spec["tier"],
        "api_id": spec["api_id"],
        "input ($/1K tok)":  round(spec["input"]  * 1000, 6),
        "output ($/1K tok)": round(spec["output"] * 1000, 6),
    }
    for model_name, spec in MODEL_REGISTRY.items()
]
reg_df = pd.DataFrame(reg_rows)


def _tier_backgrounds(_column):
    """Return one CSS background per row, chosen from that row's tier."""
    styles = []
    for tier in reg_df["tier"]:
        if tier == "free":
            styles.append("background-color: #1a3a1a")
        else:
            styles.append("background-color: #2a1a2a")
    return styles


# Last expression of the cell: the styled table is the cell's displayed output.
reg_df.style.apply(_tier_backgrounds, subset=["model"])

## Section 2 — Benchmark Dataset

In [None]:
# ── Benchmark prompts ────────────────────────────────────────────────────────
# Maps each task type to a list of benchmark items. Every item has a "prompt"
# plus exactly one grading field read by score_output():
#   "expected"          -> a single reference answer (qa, classification)
#   "expected_keywords" -> keywords whose coverage is measured
#                          (summarization, rag, code_generation)
BENCHMARK_DATASET = {
    "summarization": [
        {
            "prompt": """Summarize the following article in 2-3 sentences:\n
Artificial intelligence has rapidly evolved over the past decade, transforming industries from 
healthcare to finance. Machine learning models can now diagnose diseases with accuracy rivaling 
specialists, detect fraud in milliseconds, and generate human-quality text. However, concerns 
around bias, privacy, and job displacement continue to challenge regulators and companies alike.""",
            "expected_keywords": ["AI", "machine learning", "healthcare", "finance", "bias"]
        },
        {
            "prompt": """Summarize this technical concept briefly:\n
Transformer architecture, introduced in 'Attention Is All You Need' (2017), replaced recurrent 
networks with self-attention mechanisms, enabling parallel processing of sequences. This led to 
dramatic improvements in NLP tasks and became the foundation for GPT, BERT, and modern LLMs.""",
            "expected_keywords": ["transformer", "attention", "NLP", "parallel"]
        },
    ],
    "qa": [
        {
            "prompt": "What is the capital of France? Answer in one word.",
            "expected": "Paris"
        },
        {
            "prompt": "Who wrote the Python programming language? Answer with the name only.",
            "expected": "Guido van Rossum"
        },
        {
            "prompt": "What does REST stand for in API design? Give only the full form.",
            "expected": "Representational State Transfer"
        },
    ],
    "rag": [
        {
            "prompt": """Context: Our Q3 2024 earnings report shows revenue of $4.2B, up 18% YoY. 
Operating margin improved to 23% from 19%. Key growth drivers include cloud services (+34%) 
and AI products (+52%). Headcount decreased 3% due to efficiency initiatives.\n\n
Question: What was the YoY revenue growth and what drove it?""",
            "expected_keywords": ["18%", "cloud", "AI"]
        },
        {
            "prompt": """Context: The company's refund policy states that customers can return 
products within 30 days for a full refund. Electronics must be unopened. Software licenses 
are non-refundable after activation. Gift cards cannot be returned.\n\n
Question: Can I return opened electronics?""",
            "expected_keywords": ["no", "unopened", "cannot"]
        },
    ],
    "classification": [
        {
            "prompt": "Classify this customer review as Positive, Negative, or Neutral. Reply with one word only.\nReview: 'The product arrived on time and works exactly as described. Very happy with my purchase!'",
            "expected": "Positive"
        },
        {
            # NOTE(review): score_output() uses substring matching, so a wrong
            # answer of "Not Spam" would still contain "spam" — confirm intended.
            "prompt": "Classify this email as Spam or Not Spam. Reply with one phrase only.\nEmail: 'Congratulations! You've won $1,000,000. Click here to claim your prize NOW!!!'",
            "expected": "Spam"
        },
        {
            "prompt": "Classify the programming language: 'def hello(): print(\"Hello World\")'. Reply with one word.",
            "expected": "Python"
        },
    ],
    "code_generation": [
        {
            "prompt": "Write a Python function to calculate the factorial of n using recursion. Include only the function, no explanation.",
            "expected_keywords": ["def", "factorial", "return", "if"]
        },
        {
            "prompt": "Write a SQL query to find the top 5 customers by total order value. Include only the SQL.",
            "expected_keywords": ["SELECT", "ORDER BY", "LIMIT", "SUM"]
        },
    ],
}

# Report dataset size: overall prompt count, then one line per task type.
total_prompts = sum(map(len, BENCHMARK_DATASET.values()))
task_type_count = len(BENCHMARK_DATASET)
print(f"Benchmark dataset loaded: {total_prompts} prompts across {task_type_count} task types")
for task_name in BENCHMARK_DATASET:
    prompt_list = BENCHMARK_DATASET[task_name]
    print(f" {task_name:20s}: {len(prompt_list)} prompts")

## Section 3 — Model Runner Function

In [None]:
# ── Simulated token counter ──────────────────────────────────────────────────
def estimate_tokens(text: str) -> int:
    """Approximate the token count of ``text`` as one token per 4 characters.

    Uses integer division of the character count and never returns less
    than 1, so empty or very short strings still count as a single token.
    """
    approx = len(text) // 4
    return approx if approx > 1 else 1


# ── Cost calculator ──────────────────────────────────────────────────────────
def calculate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Return the USD cost of one request for ``model``.

    Looks up the per-token input/output prices in MODEL_PRICING (raises
    KeyError for an unknown model name), multiplies each by its token count
    and rounds the sum to 8 decimal places.
    """
    rates = MODEL_PRICING[model]
    input_cost = input_tokens * rates["input"]
    output_cost = output_tokens * rates["output"]
    return round(input_cost + output_cost, 8)


# ── Accuracy scorer ──────────────────────────────────────────────────────────
def score_output(task: str, output: str, benchmark: dict) -> float:
    """Score a model output against a benchmark item, from 0.0 to 1.0.

    Args:
        task: Task type of the item. Currently unused; kept for callers.
        output: Text produced by the model. ``None`` is treated as empty text.
        benchmark: Benchmark item. If it has an ``"expected"`` answer, that is
            used; otherwise a non-empty ``"expected_keywords"`` list is used.

    Returns:
        * ``"expected"``: 1.0 when the expected answer appears (case-insensitive
          substring) in the output; otherwise the fraction of the expected
          answer's whitespace-separated words found among the output's words.
        * ``"expected_keywords"``: the fraction of keywords that appear
          (case-insensitive substring) in the output.
        * 0.5 when the item carries no usable reference (neither field, or an
          empty keyword list).
        All fractions are rounded to 2 decimals.
    """
    # A runner may hand back None instead of text; score it as an empty answer
    # rather than raising outside the experiment loop's try/except.
    output_lower = (output or "").lower().strip()

    if "expected" in benchmark:
        # Exact / partial match for QA and classification.
        # NOTE(review): substring matching means "Not Spam" satisfies an
        # expected answer of "Spam" — confirm whether stricter matching is wanted.
        expected = benchmark["expected"].lower()
        if expected in output_lower:
            return 1.0
        # Partial credit: share of expected words present in the output.
        exp_words = set(expected.split())
        out_words = set(output_lower.split())
        overlap = len(exp_words & out_words) / len(exp_words) if exp_words else 0.0
        return round(overlap, 2)

    # Keyword coverage for summarization, RAG, code gen. An empty keyword list
    # is treated like a missing reference instead of dividing by zero.
    keywords = [kw.lower() for kw in benchmark.get("expected_keywords", [])]
    if keywords:
        hits = sum(1 for kw in keywords if kw in output_lower)
        return round(hits / len(keywords), 2)

    return 0.5  # Default partial score if no benchmark available


print("Utility functions defined")

In [None]:
# ── LIVE API runners (one per provider) ──────────────────────────────────────

def _run_groq(api_id: str, prompt: str) -> dict:
    """Send ``prompt`` as a single user message to a Groq-hosted chat model.

    Output is capped at 512 tokens. Returns a dict with the reply text
    (``output``), the prompt/completion token counts reported in the
    response's usage (``input_tokens`` / ``output_tokens``) and the
    wall-clock duration of the API call in seconds, rounded to 3 decimals
    (``latency``).
    """
    from groq import Groq

    groq_client = Groq(api_key=GROQ_API_KEY)
    chat_messages = [{"role": "user", "content": prompt}]

    # Only the request itself is timed, not client construction.
    t0 = time.time()
    completion = groq_client.chat.completions.create(
        model=api_id,
        messages=chat_messages,
        max_tokens=512,
    )
    elapsed = round(time.time() - t0, 3)

    usage = completion.usage
    return dict(
        output=completion.choices[0].message.content,
        input_tokens=usage.prompt_tokens,
        output_tokens=usage.completion_tokens,
        latency=elapsed,
    )

def _run_gemini(api_id: str, prompt: str) -> dict:
    """Send ``prompt`` to a Gemini model and collect usage metrics.

    Output is capped at 512 tokens, matching the ``max_tokens=512`` limit the
    other provider runners apply, so token and cost figures are compared under
    the same output budget.

    Returns a dict with the reply text (``output``), the prompt/candidate token
    counts from the response's usage metadata (``input_tokens`` /
    ``output_tokens``) and the wall-clock duration of the call in seconds,
    rounded to 3 decimals (``latency``).
    """
    import google.generativeai as genai
    genai.configure(api_key=GEMINI_API_KEY)
    model_obj = genai.GenerativeModel(api_id)
    start = time.time()
    resp  = model_obj.generate_content(
        prompt,
        generation_config={"max_output_tokens": 512},
    )
    latency = round(time.time() - start, 3)
    meta = resp.usage_metadata
    # NOTE(review): resp.text is read directly; verify how the SDK surfaces
    # blocked or empty candidates (the experiment loop catches any exception).
    return {
        "output":        resp.text,
        "input_tokens":  meta.prompt_token_count,
        "output_tokens": meta.candidates_token_count,
        "latency":       latency,
    }

def _run_cerebras(api_id: str, prompt: str) -> dict:
    """Send ``prompt`` as a single user message to a Cerebras chat model.

    Output is capped at 512 tokens. The returned dict carries the reply text,
    the prompt/completion token counts from the response usage, and the
    call's wall-clock latency in seconds (3 decimals).
    """
    from cerebras.cloud.sdk import Cerebras

    request_args = {
        "model": api_id,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 512,
    }
    sdk = Cerebras(api_key=CEREBRAS_API_KEY)

    began = time.time()
    reply = sdk.chat.completions.create(**request_args)
    took = round(time.time() - began, 3)

    first_choice = reply.choices[0]
    return {
        "output":        first_choice.message.content,
        "input_tokens":  reply.usage.prompt_tokens,
        "output_tokens": reply.usage.completion_tokens,
        "latency":       took,
    }

def _run_openai(api_id: str, prompt: str) -> dict:
    """Send ``prompt`` as a single user message to an OpenAI chat model.

    Output is capped at 512 tokens. Returns the reply text together with the
    prompt/completion token counts from the response usage and the measured
    wall-clock latency of the request in seconds (3 decimals).
    """
    from openai import OpenAI

    api = OpenAI(api_key=OPENAI_API_KEY)
    user_turn = {"role": "user", "content": prompt}

    started_at = time.time()
    response = api.chat.completions.create(
        model=api_id,
        messages=[user_turn],
        max_tokens=512,
    )
    finished_at = time.time()

    token_usage = response.usage
    metrics = {}
    metrics["output"] = response.choices[0].message.content
    metrics["input_tokens"] = token_usage.prompt_tokens
    metrics["output_tokens"] = token_usage.completion_tokens
    metrics["latency"] = round(finished_at - started_at, 3)
    return metrics

def _run_anthropic(api_id: str, prompt: str) -> dict:
    """Send ``prompt`` as a single user message to an Anthropic model.

    Output is capped at 512 tokens. Returns the text of the first content
    block, the input/output token counts from the response usage, and the
    wall-clock latency of the request in seconds (3 decimals).
    """
    import anthropic

    claude = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
    conversation = [{"role": "user", "content": prompt}]

    tick = time.time()
    message = claude.messages.create(
        model=api_id,
        max_tokens=512,
        messages=conversation,
    )
    duration = round(time.time() - tick, 3)

    return dict(
        output=message.content[0].text,
        input_tokens=message.usage.input_tokens,
        output_tokens=message.usage.output_tokens,
        latency=duration,
    )

# Dispatch table: registry "provider" value -> function that calls that API.
PROVIDER_RUNNERS = dict(
    groq=_run_groq,
    gemini=_run_gemini,
    cerebras=_run_cerebras,
    openai=_run_openai,
    anthropic=_run_anthropic,
)

def run_model_live(model_name: str, prompt: str) -> dict:
    """Call the real API for ``model_name`` and add token total and cost.

    Resolves the registry entry, hands the entry's ``api_id`` and the prompt
    to the matching provider runner, then extends the runner's result dict
    with ``total_tokens`` (input + output) and ``cost`` (via calculate_cost).
    Raises KeyError for a model or provider that is not registered.
    """
    entry = MODEL_REGISTRY[model_name]
    call_provider = PROVIDER_RUNNERS[entry["provider"]]
    result = call_provider(entry["api_id"], prompt)

    tokens_in = result["input_tokens"]
    tokens_out = result["output_tokens"]
    result["total_tokens"] = tokens_in + tokens_out
    result["cost"] = calculate_cost(model_name, tokens_in, tokens_out)
    return result


# ── DEMO runner (simulated, no API calls) ────────────────────────────────────

# Canned response text per task type; run_model_demo() returns it verbatim and
# derives the simulated output-token count from its length.
DEMO_OUTPUTS = {
    "summarization":   "AI has transformed industries like healthcare and finance through machine learning, enabling disease diagnosis, fraud detection, and text generation, though bias, privacy, and job concerns remain.",
    "qa":              "Paris",
    "rag":             "Revenue grew 18% YoY, driven primarily by cloud services (+34%) and AI products (+52%).",
    "classification":  "Positive",
    "code_generation": "def factorial(n):\n    if n <= 1:\n        return 1\n    return n * factorial(n - 1)",
}

# Per-model simulation parameters, keyed by the same names as MODEL_REGISTRY:
#   latency_base / latency_std : mean and std-dev of the sampled latency (seconds)
#   output_mult                : multiplier on the estimated output-token count
#   accuracy_boost             : offset added to the task's BASE_ACCURACY
MODEL_CHARACTERISTICS = {
    # Free — Groq (fastest due to LPU hardware)
    "llama-3.1-8b [Groq]":     {"latency_base": 0.30, "latency_std": 0.08, "output_mult": 0.90, "accuracy_boost": 0.00},
    "llama-3.3-70b [Groq]":    {"latency_base": 0.80, "latency_std": 0.15, "output_mult": 1.05, "accuracy_boost": 0.07},
    "llama-4-scout [Groq]":    {"latency_base": 0.55, "latency_std": 0.12, "output_mult": 1.00, "accuracy_boost": 0.08},
    "qwen3-32b [Groq]":        {"latency_base": 0.65, "latency_std": 0.12, "output_mult": 0.98, "accuracy_boost": 0.06},
    # Free — Gemini
    "gemini-1.5-flash":        {"latency_base": 0.90, "latency_std": 0.20, "output_mult": 1.00, "accuracy_boost": 0.06},
    "gemini-2.0-flash":        {"latency_base": 0.70, "latency_std": 0.15, "output_mult": 1.02, "accuracy_boost": 0.08},
    # Free — Cerebras (wafer-scale, extremely fast)
    "llama3.1-8b [Cerebras]":  {"latency_base": 0.20, "latency_std": 0.05, "output_mult": 0.88, "accuracy_boost": -0.01},
    "llama3.3-70b [Cerebras]": {"latency_base": 0.50, "latency_std": 0.10, "output_mult": 1.03, "accuracy_boost": 0.06},
    # Paid — OpenAI
    "gpt-4o":                  {"latency_base": 1.80, "latency_std": 0.40, "output_mult": 1.10, "accuracy_boost": 0.13},
    "gpt-4o-mini":             {"latency_base": 0.90, "latency_std": 0.20, "output_mult": 0.90, "accuracy_boost": 0.07},
    "gpt-3.5-turbo":           {"latency_base": 0.70, "latency_std": 0.20, "output_mult": 0.85, "accuracy_boost": 0.02},
    # Paid — Anthropic
    "claude-3-5-sonnet":       {"latency_base": 1.50, "latency_std": 0.30, "output_mult": 1.15, "accuracy_boost": 0.14},
    "claude-3-haiku":          {"latency_base": 0.60, "latency_std": 0.15, "output_mult": 0.80, "accuracy_boost": 0.04},
    # Paid — Gemini Pro
    "gemini-1.5-pro":          {"latency_base": 2.00, "latency_std": 0.45, "output_mult": 1.20, "accuracy_boost": 0.10},
}

# Baseline simulated accuracy per task type, before the per-model boost and
# random noise are applied in the experiment loop.
BASE_ACCURACY = {
    "summarization":   0.74,
    "qa":              0.83,
    "rag":             0.79,
    "classification":  0.88,
    "code_generation": 0.70,
}

def run_model_demo(model_name: str, prompt: str, task: str) -> dict:
    """Produce a simulated API result for ``model_name`` without any network call.

    Latency is drawn from a normal distribution with the model's base/std and
    floored at 0.15 s. Input tokens are estimated from the prompt; output
    tokens are estimated from the task's canned response, scaled by the
    model's output multiplier and a uniform 0.85-1.15 jitter (minimum 1).
    The result also carries ``_accuracy_base`` / ``_accuracy_boost`` so the
    caller can synthesise an accuracy score.
    """
    profile = MODEL_CHARACTERISTICS[model_name]

    # Random draws happen in a fixed order (latency, then output jitter) so a
    # seeded run reproduces the same numbers.
    sampled_latency = np.random.normal(profile["latency_base"], profile["latency_std"])
    latency = max(0.15, sampled_latency)

    input_tokens = estimate_tokens(prompt)

    canned_reply = DEMO_OUTPUTS.get(task, "Sample output.")
    jitter = np.random.uniform(0.85, 1.15)
    scaled_tokens = estimate_tokens(canned_reply) * profile["output_mult"] * jitter
    output_tokens = max(1, int(scaled_tokens))

    simulated = {
        "output":          canned_reply,
        "input_tokens":    input_tokens,
        "output_tokens":   output_tokens,
        "total_tokens":    input_tokens + output_tokens,
        "latency":         round(latency, 3),
        "cost":            calculate_cost(model_name, input_tokens, output_tokens),
        "_accuracy_base":  BASE_ACCURACY.get(task, 0.75),
        "_accuracy_boost": profile["accuracy_boost"],
    }
    return simulated


# ── Unified runner ───────────────────────────────────────────────────────────
def run_model(model_name: str, prompt: str, task: str = "") -> dict:
    """Run one prompt through ``model_name``, simulated or live per DEMO_MODE.

    ``task`` is only consulted by the demo path; the live path ignores it.
    """
    if not DEMO_MODE:
        return run_model_live(model_name, prompt)
    return run_model_demo(model_name, prompt, task)


# Cell status report. The dash in the first message is written as a real
# em dash; the previous text contained a mis-encoded byte sequence.
print("Model runner ready — supports: Groq, Gemini, Cerebras, OpenAI, Anthropic")
print(f"Characteristics loaded for {len(MODEL_CHARACTERISTICS)} models")
print(f"Demo mode : {DEMO_MODE}")

## Section 4 — Run Experiments

In [None]:
# ── Key availability check (for live mode) ───────────────────────────────────
# Provider name -> API key string loaded from the environment ("" when unset).
KEY_MAP = dict(
    groq=GROQ_API_KEY,
    gemini=GEMINI_API_KEY,
    cerebras=CEREBRAS_API_KEY,
    openai=OPENAI_API_KEY,
    anthropic=ANTHROPIC_API_KEY,
)

def has_key(model_name: str) -> bool:
    """Return True when the provider of ``model_name`` has a non-empty API key.

    A provider absent from KEY_MAP counts as having no key; an unregistered
    model name raises KeyError.
    """
    provider_name = MODEL_REGISTRY[model_name]["provider"]
    api_key = KEY_MAP.get(provider_name, "")
    return True if api_key else False

# Demo mode runs every selected model; live mode keeps only the models whose
# provider key is present and reports the ones it drops.
runnable_models = MODELS_TO_TEST
if not DEMO_MODE:
    runnable_models, skipped = [], []
    for candidate in MODELS_TO_TEST:
        bucket = runnable_models if has_key(candidate) else skipped
        bucket.append(candidate)
    if len(skipped) > 0:
        print(f"Skipping {len(skipped)} models (missing API keys): {skipped}\n")

# ── Experiment loop ──────────────────────────────────────────────────────────
# Runs every benchmark prompt through every runnable model and collects one
# result row per successful run into `df`.

# Column order of the results table. Passing it to the DataFrame constructor
# guarantees these columns exist even when no run succeeds (e.g. live mode
# with no API keys), so the report lines below cannot fail with a KeyError.
RESULT_COLUMNS = [
    "model", "provider", "tier", "task", "prompt_id",
    "input_tokens", "output_tokens", "total_tokens",
    "cost", "latency", "accuracy", "cost_per_accuracy",
]

results     = []
run_count   = 0
failed_runs = 0
prompt_count = sum(len(p) for p in BENCHMARK_DATASET.values())
total_runs   = prompt_count * len(runnable_models)

print("Starting experiment")
print(f"  Models   : {len(runnable_models)}  ({ACTIVE_TIER.upper()} tier)")
print(f"  Prompts  : {prompt_count}")
print(f"  Total runs: {total_runs}")
print()

for task, prompts in BENCHMARK_DATASET.items():
    for i, benchmark in enumerate(prompts):
        for model in runnable_models:
            run_count += 1
            try:
                result = run_model(model, benchmark["prompt"], task)
            except Exception as e:
                # Best-effort benchmark: report the failure, count it, and
                # move on to the next model instead of aborting the whole run.
                failed_runs += 1
                print(f"  ERROR [{model}] task={task}: {e}")
                continue

            # Score accuracy
            if DEMO_MODE:
                # Simulated accuracy: task baseline + model boost + Gaussian
                # noise, clamped to [0, 1]. The helper keys are removed from
                # the result dict as they are consumed.
                base     = result.pop("_accuracy_base", 0.75)
                boost    = result.pop("_accuracy_boost", 0.0)
                accuracy = min(1.0, max(0.0, base + boost + np.random.normal(0, 0.04)))
            else:
                accuracy = score_output(task, result["output"], benchmark)

            results.append({
                "model":             model,
                "provider":          MODEL_REGISTRY[model]["provider"],
                "tier":              MODEL_REGISTRY[model]["tier"],
                "task":              task,
                "prompt_id":         i,
                "input_tokens":      result["input_tokens"],
                "output_tokens":     result["output_tokens"],
                "total_tokens":      result["total_tokens"],
                "cost":              result["cost"],
                "latency":           result["latency"],
                "accuracy":          round(accuracy, 4),
                # Accuracy is floored at 0.01 to avoid dividing by zero.
                "cost_per_accuracy": round(result["cost"] / max(accuracy, 0.01), 8),
            })

df = pd.DataFrame(results, columns=RESULT_COLUMNS)

if failed_runs:
    print(f"{failed_runs} of {run_count} runs failed and were excluded from the results\n")
if df.empty:
    print("WARNING: no results were collected — check API keys, ACTIVE_TIER and DEMO_MODE\n")

print(f"Experiment complete — {len(df)} rows\n")
print(f"Models run : {df['model'].nunique()}")
print(f"Providers  : {sorted(df['provider'].unique())}")
print(f"Tiers      : {df.groupby('tier')['model'].nunique().to_dict()}")
df.head(10)

In [None]:
# ── Summary statistics ───────────────────────────────────────────────────────
# Output column -> (source column, aggregation), applied per (model, task) pair.
_summary_spec = {
    "avg_input_tokens":  ("input_tokens",  "mean"),
    "avg_output_tokens": ("output_tokens", "mean"),
    "avg_total_tokens":  ("total_tokens",  "mean"),
    "avg_cost":          ("cost",          "mean"),
    "total_cost":        ("cost",          "sum"),
    "avg_latency":       ("latency",       "mean"),
    "avg_accuracy":      ("accuracy",      "mean"),
}

per_model_task = df.groupby(["model", "task"])
summary = per_model_task.agg(**_summary_spec)
summary = summary.round(6).reset_index()

print("Summary Statistics (averaged per model/task):")
summary

In [None]:
# ── Model-level aggregate ────────────────────────────────────────────────────
# One row per model: token and cost totals, mean latency and mean accuracy.
_model_spec = {
    "total_tokens": ("total_tokens", "sum"),
    "total_cost":   ("cost",         "sum"),
    "avg_latency":  ("latency",      "mean"),
    "avg_accuracy": ("accuracy",     "mean"),
}
model_summary = df.groupby("model").agg(**_model_spec).round(6)

# Efficiency = mean accuracy per dollar spent.
# NOTE(review): models whose total cost is 0 (every free-tier entry in the
# registry) divide by zero here; pandas yields inf rather than raising —
# confirm downstream consumers of efficiency_score handle that.
accuracy_per_dollar = model_summary["avg_accuracy"].div(model_summary["total_cost"])
model_summary["efficiency_score"] = accuracy_per_dollar.round(2)

model_summary = model_summary.sort_values(by="avg_accuracy", ascending=False)

print("Model Rankings (by average accuracy):")
model_summary

## Section 5 — Visualization & Analysis

In [None]:
# ── Chart 1: Token usage per model ───────────────────────────────────────────
# Left panel: average input vs output tokens per model (grouped bars).
# Right panel: average total tokens per model, stacked by task type.
# The figure is saved to <cwd>/outputs/chart1_tokens.png and then shown.
import os
OUT_DIR = os.path.join(os.getcwd(), 'outputs')
os.makedirs(OUT_DIR, exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle("Token Usage Analysis", fontsize=16, fontweight='bold', color='white')

# Grouped bar: avg token breakdown per model
token_data = df.groupby("model")[["input_tokens", "output_tokens"]].mean().sort_values("input_tokens")
x = np.arange(len(token_data))
width = 0.35

ax = axes[0]
bars1 = ax.bar(x - width/2, token_data["input_tokens"],  width, label="Input",  color=PALETTE[0], alpha=0.9)
bars2 = ax.bar(x + width/2, token_data["output_tokens"], width, label="Output", color=PALETTE[1], alpha=0.9)
ax.set_xticks(x)
ax.set_xticklabels(token_data.index, rotation=30, ha='right')
ax.set_ylabel("Avg Tokens per Request")
ax.set_title("Input vs Output Tokens per Model")
ax.legend()
ax.grid(axis='y')

# Stacked bar: avg total tokens per task type
task_token = df.groupby(["model", "task"])["total_tokens"].mean().unstack(fill_value=0)
# Bars are drawn at explicit numeric positions and the ticks are fixed with
# set_xticks() before labelling, as in the left panel; set_xticklabels() on
# its own relies on whatever tick locations the axis currently has.
x_stack = np.arange(len(task_token))
bottom = np.zeros(len(task_token))
ax2 = axes[1]
for j, task in enumerate(task_token.columns):
    heights = task_token[task].to_numpy()
    ax2.bar(x_stack, heights, bottom=bottom, label=task, color=PALETTE[j % len(PALETTE)], alpha=0.85)
    bottom += heights  # next task's segment starts on top of this one
ax2.set_xticks(x_stack)
ax2.set_xticklabels(task_token.index, rotation=30, ha='right')
ax2.set_ylabel("Avg Total Tokens")
ax2.set_title("Token Distribution by Task")
ax2.legend(loc='upper left', fontsize=8)
ax2.grid(axis='y')

plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, 'chart1_tokens.png'), dpi=150, bbox_inches='tight', facecolor=fig.get_facecolor())
plt.show()
print("Chart 1 rendered")

In [None]:
# ── Chart 2: Latency Comparison ──────────────────────────────────────────────
fig, axes = plt.subplots(1, 2, figsize=(18, 6))
fig.suptitle("Latency Analysis", fontsize=16, fontweight='bold', color='white')

# Models in first-seen order; this order drives both the boxes and their labels.
models_in_df = list(df["model"].unique())

# Left panel — box plot of the per-request latency distribution of each model.
ax = axes[0]
latency_by_model = [df[df["model"] == m]["latency"].values for m in models_in_df]
# Tick labels are assigned by set_xticklabels below, so the `labels=` keyword
# (deprecated in newer Matplotlib releases) is not passed here.
bp = ax.boxplot(latency_by_model, patch_artist=True,
                medianprops=dict(color='white', linewidth=2))
# Cycle through the palette so every box is coloured even when there are
# more models than palette entries.
for i, patch in enumerate(bp['boxes']):
    patch.set_facecolor(PALETTE[i % len(PALETTE)])
    patch.set_alpha(0.8)
ax.set_xticklabels(models_in_df, rotation=45, ha='right', fontsize=7)
ax.set_ylabel("Latency (seconds)")
ax.set_title("Latency Distribution per Model")
ax.grid(axis='y')

# Right panel — heatmap of mean latency for every (model, task) pair.
ax2 = axes[1]
lat_pivot = df.groupby(["model", "task"])["latency"].mean().unstack().round(3)
sns.heatmap(lat_pivot, ax=ax2, cmap="YlOrRd", annot=True, fmt=".2f",
            linewidths=0.5, linecolor='#1a1a2e',
            cbar_kws={'label': 'Avg Latency (s)'})
ax2.set_title("Avg Latency Heatmap (Model x Task)")
ax2.set_xlabel("Task")
ax2.set_ylabel("Model")
ax2.tick_params(axis='x', rotation=30)
ax2.tick_params(axis='y', labelsize=7)

plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, 'chart2_latency.png'), dpi=150, bbox_inches='tight', facecolor=fig.get_facecolor())
plt.show()
print("Chart 2 rendered")

In [None]:
# ── Chart 3: Cost per Task & Model ───────────────────────────────────────────
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle("Cost Analysis", fontsize=16, fontweight='bold', color='white')

# Left panel — horizontal bars: mean cost per request for each model,
# cheapest first, expressed in micro-dollars (USD * 1e6).
ax = axes[0]
cost_per_model = df.groupby("model")["cost"].mean().sort_values(ascending=True) * 1e6
bars = ax.barh(cost_per_model.index, cost_per_model.values,
               color=[PALETTE[i % len(PALETTE)] for i in range(len(cost_per_model))], alpha=0.9)
ax.set_xlabel("Avg Cost per Request (µ$ = $0.000001)")
ax.set_title("Average Cost per Request by Model")
# Offset the value labels by 1% of the longest bar so the gap scales with
# the data instead of being a fixed number of micro-dollars.
label_pad = cost_per_model.max() * 0.01
for bar, val in zip(bars, cost_per_model.values):
    ax.text(bar.get_width() + label_pad, bar.get_y() + bar.get_height()/2,
            f"{val:.2f}µ$", va='center', fontsize=9, color='white')
ax.grid(axis='x')

# Right panel — grouped bars: mean cost per task, one bar per model.
ax2 = axes[1]
cost_pivot = df.groupby(["task", "model"])["cost"].mean().unstack() * 1e6
# One colour per plotted model column, cycling through the palette; sized
# from the data actually present rather than from the configured model list.
model_colors = [PALETTE[i % len(PALETTE)] for i in range(cost_pivot.shape[1])]
cost_pivot.plot(kind='bar', ax=ax2, color=model_colors, alpha=0.85, width=0.75)
ax2.set_xticklabels(cost_pivot.index, rotation=30, ha='right')
ax2.set_ylabel("Avg Cost (µ$)")
ax2.set_title("Cost per Task by Model")
ax2.legend(loc='upper right', fontsize=7, ncol=2)
ax2.grid(axis='y')

plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, 'chart3_cost.png'), dpi=150, bbox_inches='tight', facecolor=fig.get_facecolor())
plt.show()
print("Chart 3 rendered")

In [None]:
# ── Chart 4: Cost vs Accuracy Scatter Plot ───────────────────────────────────
fig, axes = plt.subplots(1, 2, figsize=(16, 7))
fig.suptitle("Cost vs Accuracy Trade-Off", fontsize=16, fontweight='bold', color='white')

# Left panel — one point per model: mean cost (micro-dollars) vs mean accuracy.
ax = axes[0]
agg = df.groupby("model").agg(avg_cost=("cost","mean"), avg_accuracy=("accuracy","mean")).reset_index()
agg["avg_cost_micro"] = agg["avg_cost"] * 1e6

# reset_index() above gives a 0..n-1 index, so `i` doubles as a palette index.
for i, row in agg.iterrows():
    color = PALETTE[i % len(PALETTE)]
    ax.scatter(row["avg_cost_micro"], row["avg_accuracy"],
               s=250, color=color, zorder=5, edgecolors='white', linewidth=1.5)
    ax.annotate(row["model"], (row["avg_cost_micro"], row["avg_accuracy"]),
                textcoords="offset points", xytext=(8, 4), fontsize=8, color=color)

# Guide line joining all models in order of increasing cost. It passes
# through every point, so it is a visual aid rather than a true Pareto frontier.
agg_s = agg.sort_values("avg_cost_micro")
ax.plot(agg_s["avg_cost_micro"], agg_s["avg_accuracy"], '--', color='#ffffff33', lw=1.5)

ax.set_xlabel("Avg Cost per Request (µ$)")
ax.set_ylabel("Avg Accuracy Score")
ax.set_title("Overall: Cost vs Accuracy")
ax.grid(True)

# Quadrant labels, positioned in axes-fraction coordinates so they sit in the
# four corners regardless of the cost/accuracy ranges of the data.
quadrant_labels = [
    (0.02, 0.97, 'left',  'top',    "Cheap + Accurate",    '#50fa7b'),
    (0.98, 0.97, 'right', 'top',    "Costly + Accurate",   '#f5a623'),
    (0.02, 0.03, 'left',  'bottom', "Cheap + Inaccurate",  '#ff6b6b'),
    (0.98, 0.03, 'right', 'bottom', "Costly + Inaccurate", '#e84393'),
]
for fx, fy, h_align, v_align, text, text_color in quadrant_labels:
    ax.text(fx, fy, text, transform=ax.transAxes, ha=h_align, va=v_align,
            fontsize=7, color=text_color, alpha=0.7)

# Right panel — one point per (model, task) pair, coloured by task.
ax2 = axes[1]
task_agg = df.groupby(["model", "task"]).agg(avg_cost=("cost","mean"), avg_accuracy=("accuracy","mean")).reset_index()
task_agg["avg_cost_micro"] = task_agg["avg_cost"] * 1e6

# Cycle the palette so a task list longer than the palette cannot overrun it.
task_colors = {t: PALETTE[i % len(PALETTE)] for i, t in enumerate(TASK_TYPES)}
for task in TASK_TYPES:
    subset = task_agg[task_agg["task"] == task]
    ax2.scatter(subset["avg_cost_micro"], subset["avg_accuracy"],
                label=task, color=task_colors[task], s=100, alpha=0.8, edgecolors='white', linewidth=0.8)

ax2.set_xlabel("Avg Cost per Request (µ$)")
ax2.set_ylabel("Avg Accuracy Score")
ax2.set_title("Cost vs Accuracy by Task Type")
ax2.legend(loc='lower right', fontsize=9)
ax2.grid(True)

plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, 'chart4_scatter.png'), dpi=150, bbox_inches='tight', facecolor=fig.get_facecolor())
plt.show()
print("Chart 4 rendered")

In [None]:
# ── Chart 5: Efficiency Score (Accuracy per Dollar) ──────────────────────────
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle("Model Efficiency: Accuracy per Dollar", fontsize=16, fontweight='bold', color='white')

# Per-model means, computed once and shared by both panels.
model_metrics = df.groupby("model").agg(
    accuracy=("accuracy","mean"),
    latency=("latency","mean"),
    cost=("cost","mean")
)

# Efficiency score = mean accuracy / mean cost. The cost is floored at 1e-9
# so a zero-cost model does not divide by zero.
efficiency = (
    model_metrics["accuracy"] / model_metrics["cost"].clip(lower=1e-9)
).sort_values(ascending=True)

# Left panel — horizontal bars, least efficient at the bottom.
ax = axes[0]
colors = [PALETTE[i % len(PALETTE)] for i in range(len(efficiency))]
bars = ax.barh(efficiency.index, efficiency.values, color=colors, alpha=0.9)
ax.set_xlabel("Efficiency Score (Accuracy / Avg Cost)")
ax.set_title("Value for Money: Higher = Better")
for bar, val in zip(bars, efficiency.values):
    ax.text(bar.get_width() + efficiency.max()*0.01, bar.get_y() + bar.get_height()/2,
            f"{val:.0f}", va='center', fontsize=9, color='white')
ax.grid(axis='x')

# Right panel — multi-metric profile, every metric normalised to 0-1.
ax2 = axes[1]

metrics = ["accuracy", "latency", "cost"]
metric_labels = ["Accuracy", "Speed", "Cheapness"]


def _min_max(series):
    """Scale a Series to [0, 1); the 1e-9 term avoids 0/0 when all values are equal."""
    return (series - series.min()) / (series.max() - series.min() + 1e-9)


# Normalize to 0-1 so that higher is always better: accuracy is used as-is,
# latency and cost are inverted (lowest latency / cost scores highest).
norm = model_metrics.copy()
norm["accuracy"] = _min_max(model_metrics["accuracy"])
norm["latency"]  = 1 - _min_max(model_metrics["latency"])
norm["cost"]     = 1 - _min_max(model_metrics["cost"])

x_pos = np.arange(len(metrics))
for i, model in enumerate(norm.index):
    ax2.plot(x_pos, norm.loc[model, metrics].values, 'o-',
             color=PALETTE[i % len(PALETTE)], label=model, alpha=0.85, lw=2, markersize=7)

ax2.set_xticks(x_pos)
ax2.set_xticklabels(metric_labels)
ax2.set_ylim(-0.05, 1.1)
ax2.set_ylabel("Normalized Score (higher = better)")
ax2.set_title("Multi-Metric Profile Comparison")
ax2.legend(loc='lower right', fontsize=8)
ax2.grid(True)

plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, 'chart5_efficiency.png'), dpi=150, bbox_inches='tight', facecolor=fig.get_facecolor())
plt.show()
print("Chart 5 rendered")

In [None]:
# ── Chart 6: Accuracy Heatmap by Model x Task ────────────────────────────────
fig, ax = plt.subplots(figsize=(12, 5))
fig.suptitle("Accuracy Heatmap: Model x Task", fontsize=15, fontweight='bold', color='white')

# Mean accuracy per (model, task): models as rows, tasks as columns.
mean_accuracy = df.groupby(["model", "task"])["accuracy"].mean()
acc_pivot = mean_accuracy.unstack().round(3)

# Cells with no data are masked out instead of being drawn.
mask = acc_pivot.isnull()

# The colour scale is pinned to 0.5-1.0; values outside that range are drawn
# with the end colours of the map.
heatmap_style = dict(
    cmap="RdYlGn", vmin=0.5, vmax=1.0,
    annot=True, fmt=".2f",
    linewidths=0.5, linecolor='#0f0f1a',
    cbar_kws={'label': 'Accuracy Score'},
)
sns.heatmap(acc_pivot, ax=ax, mask=mask, **heatmap_style)

ax.set_xlabel("Task Type")
ax.set_ylabel("Model")
ax.tick_params(axis='x', rotation=30)

plt.tight_layout()
chart6_path = os.path.join(OUT_DIR, 'chart6_accuracy_heatmap.png')
plt.savefig(chart6_path, dpi=150, bbox_inches='tight', facecolor='#0f0f1a')
plt.show()
print("Chart 6 rendered")

In [None]:
# ── Chart 7: Free vs Paid Tier Direct Comparison ─────────────────────────────
# Drawn only when the results contain at least two distinct tiers.
tiers_present = df["tier"].unique()
if len(tiers_present) < 2:
    # Guard the empty case too: indexing tiers_present[0] on an empty result
    # set would raise IndexError.
    if len(tiers_present) == 1:
        print(f"Only '{tiers_present[0]}' tier data present — skipping tier comparison chart.")
    else:
        print("No tier data present — skipping tier comparison chart.")
    print("Set ACTIVE_TIER = 'all' and re-run to see both tiers.")
else:
    fig, axes = plt.subplots(1, 3, figsize=(20, 6))
    fig.suptitle("Free vs Paid Tier Comparison", fontsize=16, fontweight='bold', color='white')

    # One row per (model, tier): mean accuracy, mean latency, and mean cost
    # in micro-dollars (USD * 1e6).
    tier_agg = df.groupby(["model", "tier"]).agg(
        avg_accuracy  = ("accuracy",  "mean"),
        avg_latency   = ("latency",   "mean"),
        avg_cost_micro= ("cost",      lambda x: x.mean() * 1e6),
    ).reset_index()

    tier_colors = {"free": "#50fa7b", "paid": "#ff79c6"}
    # Colour used for any tier label that is not listed in tier_colors, so an
    # unexpected tier value cannot raise KeyError in any of the three panels.
    fallback_color = "#ffffff"

    # Panel 1: Accuracy by model, coloured by tier
    ax = axes[0]
    tier_agg_s = tier_agg.sort_values("avg_accuracy", ascending=True)
    bar_colors = [tier_colors.get(t, fallback_color) for t in tier_agg_s["tier"]]
    ax.barh(tier_agg_s["model"], tier_agg_s["avg_accuracy"], color=bar_colors, alpha=0.85)
    ax.set_xlabel("Avg Accuracy")
    ax.set_title("Accuracy — Free vs Paid")
    ax.tick_params(axis='y', labelsize=7)
    ax.grid(axis='x')
    # Legend entries only for the known tiers that actually appear in the data.
    patches = [mpatches.Patch(color=c, label=t.capitalize()) for t, c in tier_colors.items() if t in tiers_present]
    ax.legend(handles=patches, loc='lower right', fontsize=8)

    # Panel 2: Latency by model
    ax2 = axes[1]
    tier_agg_s2 = tier_agg.sort_values("avg_latency", ascending=True)
    bar_colors2 = [tier_colors.get(t, fallback_color) for t in tier_agg_s2["tier"]]
    ax2.barh(tier_agg_s2["model"], tier_agg_s2["avg_latency"], color=bar_colors2, alpha=0.85)
    ax2.set_xlabel("Avg Latency (s)")
    ax2.set_title("Latency — Free vs Paid")
    ax2.tick_params(axis='y', labelsize=7)
    ax2.grid(axis='x')

    # Panel 3: Scatter accuracy vs latency, sized by cost
    ax3 = axes[2]
    for _, row in tier_agg.iterrows():
        color = tier_colors.get(row["tier"], fallback_color)
        # Bubble area grows with cost but is clamped to the range 50-500.
        size  = max(50, min(500, (row["avg_cost_micro"] + 0.5) * 80))
        ax3.scatter(row["avg_latency"], row["avg_accuracy"],
                    s=size, color=color, alpha=0.85,
                    edgecolors='white', linewidth=0.8, zorder=5)
        ax3.annotate(row["model"], (row["avg_latency"], row["avg_accuracy"]),
                     textcoords="offset points", xytext=(5, 3), fontsize=6, color=color)
    ax3.set_xlabel("Avg Latency (s)")
    ax3.set_ylabel("Avg Accuracy")
    ax3.set_title("Accuracy vs Latency\n(bubble size = cost)")
    ax3.grid(True)
    ax3.legend(handles=patches, loc='lower right', fontsize=8)

    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, 'chart7_tier_comparison.png'), dpi=150, bbox_inches='tight', facecolor=fig.get_facecolor())
    plt.show()
    print("Chart 7 rendered")

## Section 6 — Summary & Recommendations

In [None]:
# ── Final Rankings Table ─────────────────────────────────────────────────────
# One row per (model, provider, tier): mean accuracy, mean latency, mean cost
# in micro-dollars (USD * 1e6), total cost in USD and total tokens.
final_report = df.groupby(["model", "provider", "tier"]).agg(
    avg_accuracy   = ("accuracy",     "mean"),
    avg_latency_s  = ("latency",      "mean"),
    avg_cost_micro = ("cost",         lambda x: x.mean() * 1e6),
    total_cost_usd = ("cost",         "sum"),
    total_tokens   = ("total_tokens", "sum"),
).round(4).reset_index()

# Efficiency = accuracy per micro-dollar; the +0.001 keeps the ratio finite
# for zero-cost models.
final_report["efficiency"]    = (final_report["avg_accuracy"] / (final_report["avg_cost_micro"] + 0.001)).round(2)
# method="min" gives tied models the same integer rank (1, 1, 3, ...).
# The default "average" method yields fractional ranks (e.g. 1.5) that the
# integer cast would silently truncate into misleading positions.
final_report["rank_accuracy"] = final_report["avg_accuracy"].rank(ascending=False, method="min").astype(int)
final_report["rank_cost"]     = final_report["avg_cost_micro"].rank(ascending=True, method="min").astype(int)
final_report["rank_speed"]    = final_report["avg_latency_s"].rank(ascending=True, method="min").astype(int)
# Index by model name so idxmax()/idxmin() below return model names directly.
final_report = final_report.sort_values("avg_accuracy", ascending=False).set_index("model")

print("=" * 75)
print("FINAL MODEL BENCHMARK REPORT")
print("=" * 75)
cols = ["provider","tier","avg_accuracy","avg_latency_s","avg_cost_micro","efficiency","rank_accuracy","rank_speed","rank_cost"]
print(final_report[cols].to_string())
print()

# ── Overall winners ──────────────────────────────────────────────────────────
best_accuracy  = final_report["avg_accuracy"].idxmax()
best_speed     = final_report["avg_latency_s"].idxmin()
best_efficient = final_report["efficiency"].idxmax()

print("-" * 55)
print("OVERALL WINNERS")
print("-" * 55)
print(f"Best Accuracy   : {best_accuracy}")
print(f"Fastest Model   : {best_speed}")
print(f"Best Value      : {best_efficient}  (accuracy / cost)")

# ── Free tier winners ────────────────────────────────────────────────────────
# Printed only when at least one free-tier model is present.
free_df = final_report[final_report["tier"] == "free"]
if not free_df.empty:
    print()
    print("-" * 55)
    print("FREE TIER WINNERS")
    print("-" * 55)
    print(f"Best Accuracy   : {free_df['avg_accuracy'].idxmax()}")
    print(f"Fastest Model   : {free_df['avg_latency_s'].idxmin()}")
    print(f"Best Value      : {free_df['efficiency'].idxmax()}")

# ── Paid tier winners ────────────────────────────────────────────────────────
# Printed only when at least one paid-tier model is present.
paid_df = final_report[final_report["tier"] == "paid"]
if not paid_df.empty:
    best_cheap_paid = paid_df["avg_cost_micro"].idxmin()
    print()
    print("-" * 55)
    print("PAID TIER WINNERS")
    print("-" * 55)
    print(f"Best Accuracy   : {paid_df['avg_accuracy'].idxmax()}")
    print(f"Most Affordable : {best_cheap_paid}")
    print(f"Best Value      : {paid_df['efficiency'].idxmax()}")

# ── Use case guide ───────────────────────────────────────────────────────────
# Each line falls back to 'N/A' or the overall best-value model when the
# relevant tier has no data.
print()
print("-" * 55)
print("USE CASE GUIDE")
print("-" * 55)
print(f"No budget / prototyping       : {free_df['avg_accuracy'].idxmax() if not free_df.empty else 'N/A'}")
print(f"Fastest response needed       : {best_speed}")
print(f"High-stakes tasks             : {best_accuracy}")
print(f"High-volume production        : {paid_df['avg_cost_micro'].idxmin() if not paid_df.empty else best_efficient}")
print(f"Balanced (accuracy + cost)    : {best_efficient}")

In [None]:
# ── Export Results ───────────────────────────────────────────────────────────
# Write the run-level records and the model-level report to CSV files in
# OUT_DIR, then print where they went plus a few grand totals.
results_csv = os.path.join(OUT_DIR, 'llm_benchmark_results.csv')
summary_csv = os.path.join(OUT_DIR, 'llm_benchmark_summary.csv')

df.to_csv(results_csv, index=False)   # run-level rows, row index omitted
final_report.to_csv(summary_csv)      # index is written as the first column

print("Results exported:")
print(f" {results_csv}  â€” Full run-level data")
print(f" {summary_csv}  â€” Model-level summary")
print()
record_count = len(df)
print(f"Total records in dataset : {record_count}")
print(f"Total simulated spend    : ${df['cost'].sum():.6f} USD")
print(f"Total tokens processed   : {df['total_tokens'].sum():,}")
print(f"Total simulated latency  : {df['latency'].sum():.1f} seconds")
print()
print("Benchmark complete! Scroll up to review charts & recommendations.")

## Section 7 — RAG Chunk Size vs Cost Experiment

> **What this tests:** How chunk size in retrieval affects total context tokens, answer accuracy, and cost per query.
>
> Chunk sizes tested: **200 / 500 / 1000 tokens**
>
> For each chunk size, we simulate retrieving top-k chunks and measure input tokens, cost, and accuracy.
> This reveals the optimal chunk-size sweet spot — large chunks increase cost, while small chunks risk losing context quality.

In [None]:
# ── RAG Experiment Setup ─────────────────────────────────────────────────────

# Multi-paragraph sample text that the RAG experiment splits into chunks.
# Paragraphs are separated by blank lines; the third paragraph describes
# Retrieval-Augmented Generation itself. Leading and trailing whitespace is
# removed by .strip().
SOURCE_DOCUMENT = """
Artificial intelligence (AI) is transforming industries at an unprecedented pace. In healthcare,
AI systems can now detect diseases from medical imaging with accuracy that rivals experienced
specialists. In finance, machine learning models process millions of transactions per second to
identify fraud patterns that humans would miss. In manufacturing, predictive maintenance powered
by AI reduces equipment downtime by up to 40%.

The transformer architecture, introduced in the landmark paper "Attention Is All You Need" (2017),
replaced recurrent neural networks with self-attention mechanisms. This allowed parallel processing
of entire sequences, dramatically speeding up training times and enabling models to capture
long-range dependencies in text. The architecture became the foundation for GPT, BERT, T5, and
virtually all modern large language models.

Retrieval-Augmented Generation (RAG) combines a retrieval system with a generative model. When a
question is asked, relevant documents are first retrieved from a knowledge base, then passed as
context to the LLM to generate a grounded answer. This reduces hallucination, improves factual
accuracy, and allows the model to answer questions about private or recent data without retraining.

Prompt engineering is the practice of designing inputs to language models to elicit desired outputs.
Techniques include zero-shot prompting, few-shot prompting with examples, chain-of-thought reasoning,
and role assignment. Effective prompt engineering can dramatically improve model performance without
any fine-tuning. Studies show well-engineered prompts can improve accuracy by 20-40% on complex tasks.

Vector databases store high-dimensional embeddings and enable fast similarity search. Systems like
Pinecone, Weaviate, Chroma, and FAISS index millions of vectors and return nearest neighbors in
milliseconds. This is essential for RAG pipelines where relevant chunks must be retrieved quickly
from large knowledge bases at query time.

Fine-tuning adapts a pretrained model to a specific domain or task by continuing training on a
curated dataset. Techniques include full fine-tuning, LoRA (Low-Rank Adaptation), QLoRA, and
instruction tuning. Fine-tuning can significantly improve performance on specialized tasks
but requires labeled data, compute resources, and careful hyperparameter selection.

Model evaluation requires careful benchmark design. Common metrics include BLEU and ROUGE for
text generation, F1 score for information extraction, perplexity for language modeling, and
human preference evaluation for open-ended generation. A robust evaluation suite should cover
accuracy, fairness, robustness, and cost efficiency across diverse task types.
""".strip()

# ── Chunking Function ────────────────────────────────────────────────────────
def chunk_document(text: str, chunk_size_tokens: int, overlap_tokens: int = 50) -> list[str]:
    """Split *text* into overlapping, whitespace-delimited chunks.

    Token counts are converted to word counts with a fixed 0.75
    words-per-token ratio, so chunk sizes are approximate.

    Args:
        text: Document to split. Words are separated on any whitespace.
        chunk_size_tokens: Target chunk size in tokens. Sizes that convert to
            less than one word are treated as one word.
        overlap_tokens: Tokens shared between consecutive chunks. Negative
            values are treated as 0; an overlap as large as the chunk itself
            is reduced so the window still advances by one word per chunk.

    Returns:
        The chunks in document order, each a single-space-joined string.
        An empty or whitespace-only *text* yields an empty list.
    """
    words = text.split()
    # ~0.75 words per token (GPT tokenizer approximation)
    words_per_chunk = max(1, int(chunk_size_tokens * 0.75))
    overlap_words = max(0, int(overlap_tokens * 0.75))
    # The window must move forward by at least one word per chunk; otherwise
    # an overlap >= chunk size would never reach the end of the document.
    step = max(1, words_per_chunk - overlap_words)

    chunks = []
    for start in range(0, len(words), step):
        end = start + words_per_chunk
        chunks.append(" ".join(words[start:end]))
        # Once a chunk reaches the last word, any further window would be
        # fully contained in this one, so stop instead of emitting a
        # redundant tail chunk.
        if end >= len(words):
            break
    return chunks

# ── Chunk Configuration ──────────────────────────────────────────────────────
CHUNK_SIZES   = [200, 500, 1000]          # chunk sizes to compare, in tokens
TOP_K         = 3                          # chunks retrieved per query
RAG_QUESTION  = "What is RAG and how does it reduce hallucination?"
# Keywords looked for in a model's answer when scoring RAG accuracy.
RAG_EXPECTED_KEYWORDS = ["retrieval", "retrieval-augmented", "hallucination", "context", "grounded", "knowledge"]

# Preview: for each configured chunk size, print how many chunks the source
# document splits into and their average estimated token count.
print(f"Source document: ~{estimate_tokens(SOURCE_DOCUMENT)} tokens\n")
for cs in CHUNK_SIZES:
    chunks = chunk_document(SOURCE_DOCUMENT, cs)
    avg_tok = int(np.mean([estimate_tokens(c) for c in chunks]))  # truncated to an int
    print(f"  Chunk size {cs:>4} tokens â†’ {len(chunks)} chunks, avg {avg_tok} tokens each")
print(f"\nTop-k retrieval : {TOP_K} chunks per query")
print(f"RAG question    : {RAG_QUESTION}")

In [None]:
# ── RAG Chunk Size Experiment Runner ─────────────────────────────────────────

def simple_retrieve(chunks: list[str], question: str, top_k: int) -> list[str]:
    """Return the *top_k* chunks that share the most words with *question*.

    A simple keyword-overlap stand-in for vector similarity search. Both the
    question and the chunks are lower-cased and split into word tokens with
    punctuation removed, so "hallucination?" in the question matches
    "hallucination," in a chunk. A chunk's score is the number of its word
    occurrences that also appear in the question.

    Args:
        chunks: Candidate text chunks.
        question: Query whose words are matched against each chunk.
        top_k: Maximum number of chunks to return; values below 1 return
            an empty list.

    Returns:
        Up to *top_k* chunks, highest score first. Chunks with equal scores
        keep their original relative order.
    """
    import re  # local import: the surrounding cell does not import `re`

    word_pattern = re.compile(r"\w+")
    q_words = set(word_pattern.findall(question.lower()))
    scored = [
        (sum(1 for w in word_pattern.findall(c.lower()) if w in q_words), c)
        for c in chunks
    ]
    # list.sort is stable, so ties preserve document order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    # Clamp so a negative top_k cannot slice from the end of the list.
    return [c for _, c in scored[:max(top_k, 0)]]

def build_rag_prompt(retrieved_chunks: list[str], question: str) -> str:
    """Assemble the RAG prompt sent to the model.

    The retrieved chunks are numbered from 1 and listed under a "Context:"
    header, separated by blank lines, followed by the question and an
    instruction to answer only from that context.
    """
    sections = []
    for number, chunk_text in enumerate(retrieved_chunks, start=1):
        sections.append(f"[Chunk {number}]\n{chunk_text}")
    context = "\n\n".join(sections)
    return (
        f"Context:\n{context}\n\n"
        f"Question: {question}\n"
        "Answer concisely based only on the context."
    )

# Accumulates one record per (chunk size, model) run.
rag_results = []

print("Running RAG Chunk Size Experiment...")
print(f"{'Chunk Size':>12} {'Model':>28} {'Input Tok':>10} {'Cost (Âµ$)':>10} {'Accuracy':>10} {'Latency(s)':>12}")
print("-" * 80)

for chunk_size in CHUNK_SIZES:
    # The prompt is built once per chunk size and reused for every model.
    chunks    = chunk_document(SOURCE_DOCUMENT, chunk_size)
    retrieved = simple_retrieve(chunks, RAG_QUESTION, TOP_K)
    prompt    = build_rag_prompt(retrieved, RAG_QUESTION)
    context_tokens = estimate_tokens(prompt)  # NOTE(review): not read again in this cell

    for model in runnable_models:
        result = run_model(model, prompt, "rag")

        # Score accuracy against RAG keywords: the fraction of expected
        # keywords that occur as substrings of the lower-cased output.
        output_lower = (result["output"]).lower()
        hits     = sum(1 for kw in RAG_EXPECTED_KEYWORDS if kw in output_lower)
        accuracy = round(hits / len(RAG_EXPECTED_KEYWORDS), 3)
        if DEMO_MODE:
            # In demo mode the keyword score above is discarded and replaced
            # by a simulated value: base RAG accuracy + the model's boost +
            # Gaussian noise, clamped to [0, 1].
            # NOTE(review): this formula has no chunk_size term, so demo-mode
            # accuracy differs between chunk sizes only through the noise.
            char     = MODEL_CHARACTERISTICS[model]
            accuracy = min(1.0, max(0.0,
                BASE_ACCURACY["rag"] + char["accuracy_boost"] + np.random.normal(0, 0.04)
            ))

        cost_micro = result["cost"] * 1e6  # cost value scaled by 1e6 (micro-units)
        rag_results.append({
            "chunk_size":    chunk_size,
            "model":         model,
            "provider":      MODEL_REGISTRY[model]["provider"],
            "tier":          MODEL_REGISTRY[model]["tier"],
            "num_chunks":    len(chunks),
            "retrieved_k":   TOP_K,
            "input_tokens":  result["input_tokens"],
            "output_tokens": result["output_tokens"],
            "total_tokens":  result["total_tokens"],
            "cost_usd":      result["cost"],
            "cost_micro":    cost_micro,
            "latency":       result["latency"],
            "accuracy":      round(accuracy, 4),
        })
        print(f"{chunk_size:>12} {model:>28} {result['input_tokens']:>10} {cost_micro:>10.4f} {accuracy:>10.4f} {result['latency']:>12.3f}")

# One row per (chunk size, model) run.
rag_df = pd.DataFrame(rag_results)
print(f"\nTotal runs: {len(rag_df)}  ({len(CHUNK_SIZES)} chunk sizes Ã— {len(runnable_models)} models)")
rag_df.head(9)  # notebook display: preview of the first 9 rows

In [None]:
# ── RAG Chunk Size Visualization ─────────────────────────────────────────────
# Three panels: input tokens vs chunk size, accuracy vs chunk size, and a
# cost/accuracy bubble chart whose bubble size encodes the chunk size.
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle("RAG Chunk Size vs Cost, Accuracy & Token Usage", fontsize=16, fontweight='bold', color='white')

# One row per (chunk size, model) with the mean of each measured metric.
chunk_agg = rag_df.groupby(["chunk_size", "model"]).agg(
    avg_input_tokens = ("input_tokens", "mean"),
    avg_cost_micro   = ("cost_micro",   "mean"),
    avg_accuracy     = ("accuracy",     "mean"),
    avg_latency      = ("latency",      "mean"),
).reset_index()

# A fixed colour per model (cycling through the palette), shared by all panels.
models_rag = list(rag_df["model"].unique())
color_map  = {m: PALETTE[i % len(PALETTE)] for i, m in enumerate(models_rag)}

# ── Panel 1: Input Tokens vs Chunk Size ──────────────────────────────────────
ax = axes[0]
for model in models_rag:
    d = chunk_agg[chunk_agg["model"] == model]  # this model's rows, one per chunk size
    ax.plot(d["chunk_size"], d["avg_input_tokens"], 'o-',
            color=color_map[model], label=model, lw=2, markersize=7)
ax.set_xlabel("Chunk Size (tokens)")
ax.set_ylabel("Avg Input Tokens per Request")
ax.set_title("ðŸ“ˆ Input Tokens vs Chunk Size")
ax.set_xticks(CHUNK_SIZES)
ax.legend(fontsize=7, loc='upper left')
ax.grid(True)

# ── Panel 2: Accuracy vs Chunk Size ──────────────────────────────────────────
ax2 = axes[1]
for model in models_rag:
    d = chunk_agg[chunk_agg["model"] == model]
    ax2.plot(d["chunk_size"], d["avg_accuracy"], 'o-',
             color=color_map[model], label=model, lw=2, markersize=7)
ax2.set_xlabel("Chunk Size (tokens)")
ax2.set_ylabel("Avg Accuracy Score")
ax2.set_title("ðŸŽ¯ Accuracy vs Chunk Size")
ax2.set_xticks(CHUNK_SIZES)
ax2.set_ylim(0, 1.05)
ax2.legend(fontsize=7, loc='lower right')
ax2.grid(True)

# ── Panel 3: Cost vs Accuracy bubble (sized by chunk size) ───────────────────
ax3 = axes[2]
for _, row in chunk_agg.iterrows():
    color  = color_map[row["model"]]
    size   = (row["chunk_size"] / 10)          # 200 -> 20, 500 -> 50, 1000 -> 100
    ax3.scatter(row["avg_cost_micro"], row["avg_accuracy"],
                s=size * 4, color=color, alpha=0.75,
                edgecolors='white', linewidth=0.8, zorder=5)

# Legend: chunk sizes as bubble sizes. Empty scatters add legend entries
# without drawing points; their size uses the same formula as the bubbles above.
for cs in CHUNK_SIZES:
    ax3.scatter([], [], s=(cs/10)*4, color='#aaaaaa',
                label=f"{cs} token chunks", alpha=0.7, edgecolors='white')

ax3.set_xlabel("Avg Cost per Request (Âµ$)")
ax3.set_ylabel("Avg Accuracy Score")
ax3.set_title("ðŸ’° Cost vs Accuracy\n(bubble size = chunk size)")
ax3.legend(fontsize=8, title="Chunk Size", loc='lower right')
ax3.grid(True)

plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, 'chart8_rag_chunk_experiment.png'),
            dpi=150, bbox_inches='tight', facecolor=fig.get_facecolor())
plt.show()

# ── Finding Summary ──────────────────────────────────────────────────────────
# Per chunk size, averaged over every model run: mean input tokens, mean
# accuracy and mean cost (micro-dollars), plus input tokens spent per unit
# of accuracy.
print("\n── RAG Chunk Experiment Findings ─────────────────────────────────────")
chunk_summary = rag_df.groupby("chunk_size").agg(
    avg_input_tokens = ("input_tokens", "mean"),
    avg_accuracy     = ("accuracy",     "mean"),
    avg_cost_micro   = ("cost_micro",   "mean"),
).round(3)
# A mean accuracy of exactly 0 would make the ratio infinite; treat it as
# missing so such a chunk size is reported as NaN and never chosen as "best".
nonzero_accuracy = chunk_summary["avg_accuracy"].replace(0, np.nan)
chunk_summary["tokens_per_accuracy"] = (chunk_summary["avg_input_tokens"] / nonzero_accuracy).round(1)
print(chunk_summary.to_string())

if chunk_summary["tokens_per_accuracy"].notna().any():
    # idxmin skips NaN and returns the chunk size (the index label) with the
    # lowest tokens-per-accuracy ratio.
    best_chunk = chunk_summary["tokens_per_accuracy"].idxmin()
    print(f"\n✅ Most token-efficient chunk size : {best_chunk} tokens")
    print("   (lowest tokens needed per unit of accuracy)")
else:
    best_chunk = None
    print("\nNo chunk size achieved a non-zero average accuracy; no recommendation.")