# Import Modules

In [None]:
# Start with imports - ask ChatGPT to explain any package that you don't know

import os
from pathlib import Path
import json
from dotenv import load_dotenv
from openai import OpenAI
from anthropic import Anthropic
from pydantic_settings import BaseSettings
from pydantic import Field

# Load Environment Variables

In [None]:
# python-dotenv: copy variables from a .env file (if one is found) into the
# process environment. Nothing else in this cell depends on the return value.
load_dotenv()

# Set Run Variables

In [None]:
# Load the note text that is later embedded in the question sent to each model.
# The path is relative, so it is resolved against the current working
# directory; the file is decoded as UTF-8.
note_text = Path("note_text.txt").read_text(encoding="utf-8")

class ApiSettings(BaseSettings):
    """Optional API keys for the supported providers.

    Each field is populated from the environment variable of the same name,
    e.g. ``openai_api_key`` is read from ``OPENAI_API_KEY``. A field whose
    variable is not set stays ``None``.
    """

    openai_api_key: str | None = Field(default=None)
    anthropic_api_key: str | None = Field(default=None)
    google_api_key: str | None = Field(default=None)
    deepseek_api_key: str | None = Field(default=None)
    groq_api_key: str | None = Field(default=None)

# Build the settings object once; every field defaults to None, so a missing
# key does not raise here.
api_settings = ApiSettings()

# Add Necessary Functions

In [None]:
def get_model_response(model_name: str, question: str, **kwargs):
    """Send a single-turn question to the provider implied by ``model_name``.

    Names starting with ``"gpt"`` are sent through the OpenAI client's
    ``chat.completions.create``; names starting with ``"claude"`` are sent
    through the Anthropic client's ``messages.create``. A new client object is
    constructed on every call.

    Args:
        model_name: Model identifier; its prefix selects the provider.
        question: Text sent as the only user message.
        **kwargs: Optional settings. Only ``max_tokens`` is read, and only on
            the Anthropic path (default 1000); it is ignored for OpenAI models.

    Returns:
        The content of the first choice (OpenAI) or the text of the first
        content block (Anthropic). For any other prefix, the literal string
        ``"Model not supported."`` is returned instead of raising.
    """
    conversation = [{"role": "user", "content": question}]

    if model_name.startswith("gpt"):
        print(f"Running OpenAI model {model_name}...")
        client = OpenAI()
        completion = client.chat.completions.create(
            model=model_name,
            messages=conversation,
        )
        return completion.choices[0].message.content

    if model_name.startswith("claude"):
        print(f"Running Anthropic model {model_name}...")
        client = Anthropic()
        token_limit = kwargs.get("max_tokens", 1000)
        reply = client.messages.create(
            model=model_name,
            messages=conversation,
            max_tokens=token_limit,
        )
        return reply.content[0].text

    # Unknown prefix: callers receive a sentinel string, not an exception.
    return "Model not supported."

# Check API Keys

In [None]:
# Report, per provider key, whether a value was loaded. Only the first eight
# characters of a present key are echoed.
for key_name, key_value in api_settings.model_dump().items():
    print(
        f"{key_name} exists and begins {key_value[:8]}"
        if key_value
        else f"{key_name} not set"
    )

# Setup Initial Question

In [None]:
# Question sent to every model: the instruction, the required output format,
# then the note text itself.
diag_question = (
    "What diagnoses are mentioned in the following medical note?\n\n"
    "Respond with a JSON array of diagnosis strings with the corresponding ICD10CM code. Do not include any other text. Please don't respond in markdown\n\n"
    f"Medical Note:\n{note_text}"
)

# Evaluate Several Models

In [None]:
# Models to compare; a model's position in this list is its competitor number
# (1-based) when the answers are combined.
models = [
    "gpt-5-mini",
    "gpt-5-nano",
    "claude-sonnet-4-5",
    "claude-haiku-4-5",
]

# Ask every model the same question, keeping the results in list order.
answers = [
    {
        "model_name": model,
        "response": get_model_response(model, question=diag_question, max_tokens=1000),
    }
    for model in models
]


# Combine Responses

In [None]:
# Concatenate all answers under numbered, anonymised headings (competitor
# numbers are 1-based positions in `answers`).
# A model may return no text at all (the OpenAI branch of get_model_response
# returns message.content as-is, which can be None); fall back to an empty
# string so the heading is still emitted and the numbering stays aligned,
# instead of failing with TypeError on `None + str`.
together = "".join(
    f"# Response from competitor {index}\n\n{answer.get('response') or ''}\n\n"
    for index, answer in enumerate(answers, start=1)
)

# Evaluate Responses

In [None]:
# Prompt for the judge model. It restates the original question, embeds every
# competitor's answer (`together`), and asks for a JSON object whose "results"
# list orders competitor numbers from best to worst. The doubled braces are
# literal braces inside the f-string.
# NOTE(review): the criteria "clarity and strength of argument" may not suit a
# JSON diagnosis-extraction task, where accuracy and completeness of the
# diagnoses/codes seem more relevant — confirm intent before changing the prompt.
judge = f"""You are judging a competition between {len(models)} competitors.
Each model has been given this question:

{diag_question}

Your job is to evaluate each response for clarity and strength of argument, and rank them in order of best to worst.
Respond with JSON, and only JSON, with the following format:
{{"results": ["best competitor number", "second best competitor number", "third best competitor number", ...]}}

Here are the responses from each competitor:

{together}

Now respond with the JSON with the ranked order of the competitors, nothing else. Do not include markdown formatting or code blocks."""

# Run Evaluation

In [None]:
# Ask the judge model for the ranking and print the raw reply so that
# malformed (non-JSON) output is visible.
# max_tokens is only read on the Anthropic path of get_model_response, so it
# has no effect for this gpt model.
# NOTE(review): gpt-5-nano is also one of the competitors being judged.
eval_results = get_model_response(model_name="gpt-5-nano", question=judge, max_tokens=1000)
print(eval_results)

# Show Leaderboard

In [None]:
# Parse the judge's JSON reply; "results" lists competitor numbers ordered
# from best to worst.
results_dict = json.loads(eval_results)
ranks = results_dict["results"]
for rank_num, rank_result in enumerate(ranks, start=1):
    # rank_result is the 1-based competitor number placed at this rank, so it
    # maps directly onto `answers`. (Searching `ranks` for str(rank_num), as
    # the code did before, yields the inverse permutation and prints the wrong
    # model.) int() accepts the number whether the judge emitted it as a JSON
    # string or as a JSON integer.
    competitor = answers[int(rank_result) - 1]
    print(f"Rank {rank_num}: {competitor.get('model_name')}")