In [1]:
import os
import openai
from dotenv import load_dotenv

In [2]:
# Read variables from a local .env file before looking up the key.
load_dotenv()

# Fail fast with a clear message when no API key has been configured.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if openai.api_key is None:
    raise ValueError("OPENAI_API_KEY environment variable not set")

# Shared client used by every API call in the cells below.
client = openai.OpenAI()

# Model Optimization

## Evaluating model performance
Test and improve model outputs through evaluations.

Broadly, there are three steps to build and run evals for your LLM application.

1) Describe the task to be done as an eval
2) Run your eval with test inputs (a prompt and input data)
3) Analyze the results, then iterate and improve on your prompt

Here, we will configure evals programmatically using the Evals API. We can also configure evals in the OpenAI dashboard (easier).

### 1 - Create an eval for a task
Let's say that we would like to use a model to classify the contents of IT support tickets into one of three categories: Hardware, Software, or Other.

In [4]:
# A single worked example of the classification task.

ticket = "My monitor won't turn on - help!"

instructions = """
You are an expert in categorizing IT support tickets. Given the support 
ticket below, categorize the request into one of "Hardware", "Software", 
or "Other". Respond with only one of those words.
"""

# The developer message carries the task instructions; the user message
# carries the ticket text to classify.
response = client.responses.create(
    input=[
        dict(role="developer", content=instructions),
        dict(role="user", content=ticket),
    ],
    model="gpt-4.1-nano",
)

print(response.output_text)

Hardware


Let's set up an eval to test this behavior via the API. An eval needs two key ingredients:

- **data_source_config**: A schema for the test data you will use along with the eval.
- **testing_criteria**: The graders that determine if the model output is correct.

In [5]:
# Shape of one row of test data: the ticket text plus its human-assigned label.
ticket_item_schema = {
    "type": "object",
    "properties": {
        "ticket_text": {"type": "string"},
        "correct_label": {"type": "string"},
    },
    "required": ["ticket_text", "correct_label"],
}

# string_check grader using the "eq" operation: compares the model's output
# text against the row's `correct_label`.
label_match_grader = {
    "type": "string_check",
    "name": "Match output to human label",
    "input": "{{ sample.output_text }}",
    "operation": "eq",
    "reference": "{{ item.correct_label }}",
}

eval_obj = client.evals.create(
    name="IT Ticket Categorization",
    data_source_config={
        "type": "custom",
        "item_schema": ticket_item_schema,
        # The grader above references `sample.output_text`, so the sample
        # schema is requested alongside the item schema.
        "include_sample_schema": True,
    },
    testing_criteria=[label_match_grader],
)

print(eval_obj)

EvalCreateResponse(id='eval_6844873742f08191b4fd05228d59f014', created_at=1749321527, data_source_config=EvalCustomDataSourceConfig(schema_={'type': 'object', 'properties': {'item': {'type': 'object', 'properties': {'ticket_text': {'type': 'string'}, 'correct_label': {'type': 'string'}}, 'required': ['ticket_text', 'correct_label']}, 'sample': {'type': 'object', 'properties': {'model': {'type': 'string'}, 'choices': {'type': 'array', 'items': {'type': 'object', 'properties': {'message': {'type': 'object', 'properties': {'role': {'type': 'string', 'enum': ['assistant']}, 'content': {'type': ['string', 'null']}, 'refusal': {'type': ['boolean', 'null']}, 'tool_calls': {'type': ['array', 'null'], 'items': {'type': 'object', 'properties': {'type': {'type': 'string', 'enum': ['function']}, 'function': {'type': 'object', 'properties': {'name': {'type': 'string'}, 'arguments': {'type': 'string'}}, 'required': ['name', 'arguments']}, 'id': {'type': 'string'}}, 'required': ['type', 'function', '

### 2 - Test a prompt with our eval
Now that we have defined how we want our app to behave in an eval, let's construct a prompt that reliably generates the correct output for a representative sample of test data.

#### Uploading test data
Let's upload our test data file to the OpenAI platform so we can reference it later. We can upload files in the OpenAI dashboard, but it's possible to upload files via API as well.

After uploading the file, it is important to make note of the unique id property in the response payload (also available in the UI, if we had uploaded via the browser) - we will need to reference that value later.

In [6]:
# Uploading test data

# Open the JSONL file in a context manager so the handle is closed as soon as
# the upload call returns (the bare `open(...)` argument was never closed).
with open("12_aula_tickets.jsonl", "rb") as tickets_fh:
    file = client.files.create(
        file=tickets_fh,
        purpose="evals",
    )

# `file.id` is the identifier needed later to point an eval run at this data.
print(file)

FileObject(id='file-2ECunj3WAhd8spBSDPhYfu', bytes=815, created_at=1749322200, filename='12_aula_tickets.jsonl', object='file', purpose='evals', status='processed', expires_at=None, status_details=None)


#### Creating an eval run
With our test data in place, let's evaluate a prompt and see how it performs against our test criteria.

In [7]:
# Start an eval run against the eval and the test-data file created earlier in
# this notebook. The IDs are read from `eval_obj` and `file` instead of being
# pasted in from a previous execution, so the cell still targets the right
# objects after Restart Kernel -> Run All creates a fresh eval and upload.
run = client.evals.runs.create(
    eval_obj.id,
    name="Categorization text run",
    data_source={
        "type": "responses",
        "model": "gpt-4.1-nano",
        "input_messages": {
            "type": "template",
            "template": [
                {"role": "developer", "content": "You are an expert in categorizing IT support tickets. Given the support ticket below, categorize the request into one of 'Hardware', 'Software', or 'Other'. Respond with only one of those words."},
                # `{{ item.ticket_text }}` is filled in from each row of the test data.
                {"role": "user", "content": "{{ item.ticket_text }}"},
            ],
        },
        "source": {"type": "file_id", "id": file.id},
    },
)

print(run)

RunCreateResponse(id='evalrun_68448af3cda88191a5e22e1faccdc275', created_at=1749322483, data_source=DataSourceResponses(source=DataSourceResponsesSourceFileID(id='file-2ECunj3WAhd8spBSDPhYfu', type='file_id'), type='responses', input_messages=DataSourceResponsesInputMessagesTemplate(template=[DataSourceResponsesInputMessagesTemplateTemplateEvalItem(content=ResponseInputText(text="You are an expert in categorizing IT support tickets. Given the support ticket below, categorize the request into one of 'Hardware', 'Software', or 'Other'. Respond with only one of those words.", type='input_text'), role='developer', type='message'), DataSourceResponsesInputMessagesTemplateTemplateEvalItem(content=ResponseInputText(text='{{ item.ticket_text }}', type='input_text'), role='user', type='message')], type='template'), model='gpt-4.1-nano', sampling_params=None), error=None, eval_id='eval_6844873742f08191b4fd05228d59f014', metadata={}, model='gpt-4.1-nano', name='Categorization text run', object='e

Our eval run has now been queued. It executes asynchronously, processing every row in our data set.

### 3 - Analyze the results
Depending on the size of the dataset, the eval run may take some time to complete. We can view the current status in the dashboard, or fetch it via the API.

In [9]:
# Re-fetch the run created above to see its current status. The IDs come from
# the existing `run` object rather than hard-coded strings, so this cell keeps
# working when a fresh kernel has created a new eval and run.
run = client.evals.runs.retrieve(
    eval_id=run.eval_id,
    run_id=run.id,
)
print(run)

RunRetrieveResponse(id='evalrun_68448af3cda88191a5e22e1faccdc275', created_at=1749322483, data_source=DataSourceResponses(source=DataSourceResponsesSourceFileID(id='file-2ECunj3WAhd8spBSDPhYfu', type='file_id'), type='responses', input_messages=DataSourceResponsesInputMessagesTemplate(template=[DataSourceResponsesInputMessagesTemplateTemplateEvalItem(content=ResponseInputText(text="You are an expert in categorizing IT support tickets. Given the support ticket below, categorize the request into one of 'Hardware', 'Software', or 'Other'. Respond with only one of those words.", type='input_text'), role='developer', type='message'), DataSourceResponsesInputMessagesTemplateTemplateEvalItem(content=ResponseInputText(text='{{ item.ticket_text }}', type='input_text'), role='user', type='message')], type='template'), model='gpt-4.1-nano', sampling_params=None), error=None, eval_id='eval_6844873742f08191b4fd05228d59f014', metadata={}, model='gpt-4.1-nano', name='Categorization text run', object=

The API response contains granular information about test criteria results, API usage for generating model responses, and a report_url property that takes us to a page in the dashboard where we can explore the results visually.