In [1]:
import json
import numpy as np

with open('who_calls.jsonl', 'r') as f:
    data = [json.loads(line) for line in f]

print(data[0].keys())
# query is the original user question
print('query:', data[0]['query'])

# prompt is the full prompt sent to the model
print('prompt:', data[0]['prompt'])


sample_size = 20
data = np.random.choice(data, sample_size, replace=False)

dict_keys(['url', 'site', 'name', 'ranking', 'schema_object', 'sent', 'prompt', 'query'])
query: I am interested in making Kyoto style pottery. Where can I find the kind of blue glaze they use?
prompt: Assign a score between 0 and 100 to the following site based 
        the likelihood that the site will contain an answer to the user's question.
        If the user is looking to buy a product, the site should sell the product, not 
        just have useful information. 

The user's question is: I am interested in making Kyoto style pottery. Where can I find the kind of blue glaze they use?

The site's description is: {'url': 'miyakeceramics.com', '@type': 'Shopify', 'name': 'Miyake Ceramics', 'category': 'Japanese Pottery', 'description': 'Handmade Japanese ceramic artworks', 'detailed_description': '## PRODUCTS BY CATEGORY\n• Plates: Sakura Plate, Wave Plate, Kumo Plate, Mizu Plate, Hana Plate\n• Bowls: Donburi Bowl, Ramen Bowl, Matcha Bowl, Rice Bowl, Soup Bowl\n• Cups: Yunomi Cup, S

In [2]:
#!/usr/bin/env python3
"""
Minimal Azure OpenAI Provider Example with clean_response - Copy & Paste Ready
"""
import asyncio
import sys
import os
import json

sys.path.insert(0, os.path.join('.', 'code', 'python'))

from llm_providers.azure_oai import AzureOpenAIProvider

async def test_azure_openai_completion(user_prompt, system_prompt, model="gpt-4.1-mini"):
    # Create provider (loads config automatically)
    provider = AzureOpenAIProvider()
    
    # Get raw response from Azure OpenAI
    client = provider.get_client()
    
    raw_response = await client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=500,
        temperature=0.8,
        top_p=0.1,
        stream=False,
        presence_penalty=0.0,
        frequency_penalty=0.0,
        model=model
    )
    
    # Get the raw content before cleaning
    response = raw_response.choices[0].message.content
    
    return provider.clean_response(response)

# asyncio.run(test_azure_openai_scoring())

async def test_azure_openai_scoring(prompt, system_prompt=None, high_tier=True):
    # Create provider (loads config automatically)
    provider = AzureOpenAIProvider()
    
    schema = {
        'score': 'integer between 0 and 100', 
        'description': 'short description of the item'
    }
    
    # Get raw response from Azure OpenAI    
    response = await provider.get_completion(
        prompt=prompt,
        schema=schema,
        temperature=0.3,
        max_tokens=500,
        high_tier=high_tier,
        system_prompt=system_prompt,
    )
    
    return response

2025-09-18 12:29:49,839 - azure_oai - ERROR - error:151 - Missing required Azure OpenAI configuration


In [3]:
from tqdm import tqdm

async def get_mean_absolute_error(data, memory=None):
    error, error_higher, error_lower = 0, 0, 0
    count_higher, count_lower = 0, 0
    if memory is None:
        memory = []

    for idx, item in enumerate(tqdm(data)):
        try:
            score_with_high = memory[idx]
        except:
            score_with_high = await test_azure_openai_scoring(prompt=item['prompt'], high_tier=True)
            memory.append(score_with_high)
        
        new_score = score_with_high['score']
        print('new_score:', new_score, 'old_score:', item['ranking']['score'])
        error += abs(new_score - item['ranking']['score'])
        if new_score > item['ranking']['score']:
            error_lower += abs(new_score - item['ranking']['score'])
            count_lower += 1
        else:
            error_higher += abs(new_score - item['ranking']['score'])
            count_higher += 1

    print("Total MAE:", error / len(data))
    print("MAE when score is higher:", error_higher / count_higher if count_higher > 0 else 0)
    print("MAE when score is lower:", error_lower / count_lower if count_lower > 0 else 0)
    return {
        "total_mae": error / len(data),
        "mae_higher": error_higher / count_higher if count_higher > 0 else 0,
        "mae_lower": error_lower / count_lower if count_lower > 0 else 0
    }, memory

stats, memory = await get_mean_absolute_error(data)

  0%|          | 0/20 [00:00<?, ?it/s]


ValueError: Missing required Azure OpenAI configuration

In [None]:
async def regenerate_responses_low_tier(data, system_prompt):
    new_data = []
    for idx, item in enumerate(tqdm(data)):
        score_low = await test_azure_openai_scoring(prompt=item['prompt'], system_prompt=system_prompt, high_tier=False)
        # copy all elements from item and add score_low
        new_item = item.copy()
        new_item['ranking'] = score_low
        new_data.append(new_item)
    
    return new_data

In [None]:
async def improve_system_prompt(system_prompt, prompt):
    schema = {
                "analysis": "comma separated list explaining why the small model might have given an overly optimistic score",
                "new_system_prompt": "string with a new system prompt that could help the small model provide a more accurate score"
            }

    new_prompt = f"""Here is a system prompt and query given to a small model to score a website for its relevance to a user question. The small model gave the site a high score, but a more advanced model disagreed and gave it a lower score. Analyze the system prompt, query, and the small model's response, and explain why the small model might have given an overly optimistic score. Only use general statements in your analysis, e.g., the system prompt needs to provoke the model to break down the query. Based on that analysis, suggest a new system prompt that together with the original user query could help the small model provide a more accurate score. 
Make sure the system prompt is general and does not have any details that is tied to the specific prompt. For example if the query is asking about a product, there should not be any mention of product in the system prompt.
The new system prompt should be a superset of the original system prompt, i.e., it should include all the instructions from the original system prompt, but add more instructions to help the model provide a more accurate score.
## System Prompt:
{system_prompt}
## Query:
{prompt}"""
    analysis_response = await test_azure_openai_completion(
        user_prompt=new_prompt,
        system_prompt=f"You are an expert AI assistant that helps improve prompts for other AI models. Respond with the following json format: {json.dumps(schema)}",
        model="gpt-4.1"
    )
    print('Analysis response:', analysis_response['analysis'])
    print('New system prompt:', analysis_response['new_system_prompt'])
    return analysis_response

new_data = data.copy()
new_stats = stats.copy()
n_repeat = 0

score_schema = {
                    'score': 'integer between 0 and 100', 
                    'description': 'short description of the item'
            }
system_prompt = f"""Provide a response that matches this JSON schema: {json.dumps(score_schema)}"""

while new_stats['total_mae'] >= 5 and n_repeat < 3:
    # find everything with score above 70 and print the function name and score
    for idx, item in enumerate(new_data):
        if item['ranking']['score'] > 70:
            score_high = memory[idx]
            if score_high['score'] < item['ranking']['score']:
                print(f"Score decreased from {item['ranking']['score']} to {score_high['score']}")
                print(f"prompt: {item['prompt']}")
                print('gpt-4.1-mini response:', item['ranking'])
                print('gpt-4.1 response:', score_high)
                print('-' * 40)

                
                # improve the system prompt
                analysis_response = await improve_system_prompt(system_prompt, item['prompt'])
                system_prompt = analysis_response['new_system_prompt']
                
                scoring_low = await test_azure_openai_scoring(prompt=item['prompt'], system_prompt=system_prompt, high_tier=False)
                print('new_score:', scoring_low['score'])
                print('=' * 80)
                break
    new_data = await regenerate_responses_low_tier(new_data, system_prompt)
    new_stats, _ = await get_mean_absolute_error(new_data, memory)
    n_repeat += 1

Score decreased from 75 to 30
prompt: Assign a score between 0 and 100 to the following site based 
        the likelihood that the site will contain an answer to the user's question.
        If the user is looking to buy a product, the site should sell the product, not 
        just have useful information. 

The user's question is: I am interested in making gluten free bread. What kind of special equipment do I need?

The site's description is: {'url': 'anson-mills.myshopify.com', '@type': 'Shopify', 'name': 'Anson Mills', 'category': 'Flour & Grains', 'description': 'Heritage grains and heirloom varieties from historic Southern mill', 'extended_description': 'Heritage grains and heirloom varieties from historic Southern mill. Anson Mills typically offers stone‑milled flours; heritage grains; baking mixes; starters & education; and whole grains. Highlights milling dates, protein % and recommended formulas for breads & pastry. Seasonal & limited releases may include limited heritage l

100%|██████████| 20/20 [00:38<00:00,  1.94s/it]
100%|██████████| 20/20 [00:00<00:00, 25474.06it/s]


new_score: 30 old_score: 20
new_score: 40 old_score: 30
new_score: 30 old_score: 20
new_score: 85 old_score: 80
new_score: 10 old_score: 10
new_score: 30 old_score: 30
new_score: 30 old_score: 40
new_score: 10 old_score: 10
new_score: 90 old_score: 85
new_score: 10 old_score: 30
new_score: 85 old_score: 90
new_score: 80 old_score: 40
new_score: 30 old_score: 30
new_score: 85 old_score: 85
new_score: 20 old_score: 30
new_score: 60 old_score: 40
new_score: 90 old_score: 90
new_score: 20 old_score: 10
new_score: 20 old_score: 10
new_score: 90 old_score: 90
Total MAE: 8.25
MAE when score is higher: 4.090909090909091
MAE when score is lower: 13.333333333333334
Score decreased from 90 to 85
prompt: Assign a score between 0 and 100 to the following site based 
        the likelihood that the site will contain an answer to the user's question.
        If the user is looking to buy a product, the site should sell the product, not 
        just have useful information. 

The user's question is: 

100%|██████████| 20/20 [00:41<00:00,  2.07s/it]
100%|██████████| 20/20 [00:00<00:00, 25183.45it/s]


new_score: 30 old_score: 20
new_score: 40 old_score: 40
new_score: 30 old_score: 20
new_score: 85 old_score: 70
new_score: 10 old_score: 10
new_score: 30 old_score: 20
new_score: 30 old_score: 40
new_score: 10 old_score: 10
new_score: 90 old_score: 85
new_score: 10 old_score: 20
new_score: 85 old_score: 90
new_score: 80 old_score: 40
new_score: 30 old_score: 30
new_score: 85 old_score: 75
new_score: 20 old_score: 30
new_score: 60 old_score: 20
new_score: 90 old_score: 90
new_score: 20 old_score: 10
new_score: 20 old_score: 10
new_score: 90 old_score: 90
Total MAE: 9.75
MAE when score is higher: 3.5
MAE when score is lower: 16.0
Score decreased from 90 to 85
prompt: Assign a score between 0 and 100 to the following site based 
        the likelihood that the site will contain an answer to the user's question.
        If the user is looking to buy a product, the site should sell the product, not 
        just have useful information. 

The user's question is: I am interested in chai with

100%|██████████| 20/20 [00:52<00:00,  2.63s/it]
100%|██████████| 20/20 [00:00<00:00, 24549.63it/s]

new_score: 30 old_score: 20
new_score: 40 old_score: 20
new_score: 30 old_score: 10
new_score: 85 old_score: 60
new_score: 10 old_score: 10
new_score: 30 old_score: 20
new_score: 30 old_score: 30
new_score: 10 old_score: 10
new_score: 90 old_score: 75
new_score: 10 old_score: 20
new_score: 85 old_score: 85
new_score: 80 old_score: 30
new_score: 30 old_score: 20
new_score: 85 old_score: 75
new_score: 20 old_score: 20
new_score: 60 old_score: 30
new_score: 90 old_score: 90
new_score: 20 old_score: 10
new_score: 20 old_score: 10
new_score: 90 old_score: 85
Total MAE: 11.75
MAE when score is higher: 1.4285714285714286
MAE when score is lower: 17.307692307692307



