In [None]:
# %% Minimal setup
# If needed (uncomment in a notebook):
# !pip install requests python-dotenv

import os, json, textwrap, re, time
import requests

# Connection settings. Each value is read from the environment and falls back
# to the literal shown when the variable is unset.
# NOTE(review): the fallback API key is hard-coded in the notebook; prefer
# supplying OPENAI_API_KEY through the environment.
API_KEY = os.environ.get("OPENAI_API_KEY", "cse476")
API_BASE = os.environ.get("API_BASE", "http://10.4.58.53:41701/v1")
MODEL = os.environ.get("MODEL_NAME", "bens_model")

def _chat_result(ok, status, headers, text=None, raw=None, error=None, tokens_used=None):
    """Build the uniform result dict returned by call_model_chat_completions."""
    return {
        "ok": ok,
        "text": text,
        "raw": raw,
        "status": status,
        "error": error,
        "headers": headers,
        "tokens_used": tokens_used,
    }


def call_model_chat_completions(prompt: str,
                                system: str = "You are a helpful assistant. Reply with only the final answer—no explanation.",
                                model: str = MODEL,
                                temperature: float = 0.3,
                                timeout: int = 60,
                                max_tokens: int = 128) -> dict:
    """
    Call an OpenAI-style /v1/chat/completions endpoint with one system and one user message.

    Args:
        prompt: content of the user message.
        system: content of the system message.
        model: model identifier placed in the request payload.
        temperature: sampling temperature placed in the request payload.
        timeout: timeout in seconds handed to ``requests.post``.
        max_tokens: completion-token cap placed in the request payload.

    Returns:
        A dict with the same keys on every path:
        { 'ok': bool, 'text': str or None, 'raw': dict or None, 'status': int,
          'error': str or None, 'headers': dict, 'tokens_used': int or None }
        ``status`` is the HTTP status code, or -1 when the request itself failed.
        ``tokens_used`` is ``usage.completion_tokens`` from the response, or None
        when the server did not report it.
    """
    url = f"{API_BASE}/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type":  "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user",   "content": prompt}
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as e:
        # Connection errors, timeouts, etc.: there is no HTTP response to report.
        return _chat_result(False, -1, {}, error=str(e))

    status = resp.status_code
    hdrs = dict(resp.headers)

    if status != 200:
        # Best effort: surface the structured error body, else the raw text.
        try:
            err_text = resp.json()
        except ValueError:
            err_text = resp.text
        return _chat_result(False, status, hdrs, error=str(err_text))

    try:
        data = resp.json()
    except ValueError as e:
        return _chat_result(False, status, hdrs, error=f"Invalid JSON in response body: {e}")

    if not isinstance(data, dict):
        return _chat_result(False, status, hdrs, error=f"Unexpected response payload: {data!r}")

    # Expected shape (abridged from a real response):
    #   {'choices': [{'message': {'content': 'US Highway 281', ...}, 'finish_reason': 'stop'}],
    #    'usage': {'prompt_tokens': 50, 'completion_tokens': 7, 'total_tokens': 57}, ...}
    # `or` fallbacks guard against missing keys, explicit nulls and an empty choices list.
    choices = data.get("choices") or [{}]
    first_choice = choices[0] if isinstance(choices[0], dict) else {}
    message = first_choice.get("message") or {}
    text = message.get("content", "")

    usage = data.get("usage") or {}
    tokens_used = usage.get("completion_tokens") if isinstance(usage, dict) else None

    return _chat_result(True, status, hdrs, text=text, raw=data, tokens_used=tokens_used)


In [107]:
from IPython.display import Markdown, display

In [None]:
# %% Direct call example
def direct_call(prompt="What is 17 + 28? Answer with just the number.", temperature=0.2, max_tokens=128):
    """
    Send a single prompt through call_model_chat_completions and print the outcome.

    Prints the ok flag and HTTP status, the stripped model text (empty when the
    call produced none), and any of a few rate-limit / request-id headers that
    are present in the response.
    """
    result = call_model_chat_completions(prompt, temperature=temperature, max_tokens=max_tokens)
    print("OK:", result["ok"], "HTTP:", result["status"])
    print("MODEL SAYS:", (result["text"] or "").strip())

    # Optional: Inspect rate-limit headers if your provider exposes them.
    # The headers arrive as a plain dict, so match names case-insensitively
    # (HTTP header names are not case-sensitive, e.g. "X-Request-Id").
    headers_by_lower_name = {str(name).lower(): value for name, value in result["headers"].items()}
    for k in ["x-ratelimit-remaining-requests", "x-ratelimit-limit-requests", "x-request-id"]:
        if k in headers_by_lower_name:
            print(f"{k}: {headers_by_lower_name[k]}")


In [90]:
# %% Define three tests: input + expected
# Three hand-written smoke tests. Every entry carries an id, a type tag,
# the prompt sent to the model, and the expected answer string.
my_tests = [
    # 3n + 5 > 26  =>  3n > 21  =>  n > 7, so the smallest integer is 8.
    dict(
        id="math_inequality",
        type="numeric",
        prompt="Solve for the smallest integer n such that 3n + 5 > 26. Answer with just the integer.",
        expected="8",
    ),
    dict(
        id="commonsense_ice",
        type="text",
        prompt=(
            "You place an ice cube in a glass of water and mark the water level. After the ice melts, "
            "does the water level rise, fall, or stay the same? "
            "Answer with exactly one of: 'rise', 'fall', 'stay the same'."
        ),
        expected="stay the same",
    ),
    dict(
        id="logic_race",
        type="text",
        prompt=(
            "In a race, you pass the person in second place. "
            "What position are you now in? "
            "Answer with a single word like 'first', 'second', 'third'."
        ),
        expected="second",
    ),
]


In [142]:
import json
from pprint import pprint
import random
from collections import Counter

# Domain labels used as test "type" values; also the default filter in get_tests.
POSSIBLE_TYPES = ['math', 'common_sense', 'planning', 'coding', 'future_prediction']

# Read the raw dev-set records. The context manager closes the file handle
# instead of leaving it open for the lifetime of the kernel.
with open("parsed_dev_data.json", "r", encoding="utf-8") as dev_file:
    raw_tests = json.load(dev_file)

# How many records each domain contributes.
type_counts = Counter(record['domain'] for record in raw_tests)
print(type_counts)

# Re-key every raw record into the test schema used by the rest of the notebook.
formatted_tests = [
    {
        "id": record['id'], # domain_domainIndex_domainTestIndex_testIndex
        "type": record['domain'],
        "prompt": record['input'],
        "expected": record['output'],
        "char_count": record['input_char_count'],
        "exp_word_count": record['exp_word_count'],
    }
    for record in raw_tests
]

# Downstream cells read the formatted records through `all_tests`.
all_tests = formatted_tests

Counter({'common_sense': 400, 'math': 300, 'coding': 100, 'future_prediction': 100, 'planning': 100})


In [None]:
def print_test(test):
    """Print one test record as 2-space-indented JSON, leaving non-ASCII characters unescaped."""
    rendered = json.dumps(test, ensure_ascii=False, indent=2)
    print(rendered)

# Generalized test selector; pass test_type as a list of type names.
def get_tests(n=0, test_type=POSSIBLE_TYPES, start=0, end=None, lower_char=0, upper_char=float('inf'), lower_exp=0, upper_exp=float('inf')):
    """
    Select tests from the module-level ``all_tests`` list.

    Filtering (always applied): a test is kept when its 'type' is in
    ``test_type``, ``lower_char <= char_count <= upper_char`` and
    ``lower_exp <= exp_word_count <= upper_exp``.

    Args:
        n: selection mode.
            0  -> return the filtered tests sliced as ``[start:end]``.
            -1 -> return one randomly chosen test for each type present
                  after filtering.
            >0 -> return a random sample of ``n`` filtered tests (all of
                  them when fewer than ``n`` remain).
        test_type: iterable of type names; a single string is treated as a
            one-element list.
        start, end: slice bounds, used only when ``n == 0``.
        lower_char, upper_char: inclusive bounds on 'char_count'.
        lower_exp, upper_exp: inclusive bounds on 'exp_word_count'.

    Returns:
        list of test dicts.

    Raises:
        ValueError: if ``n`` is negative and not -1.
    """
    # A bare string would otherwise be matched by substring ('math' in 'math').
    if isinstance(test_type, str):
        test_type = [test_type]

    filtered_tests = [
        t for t in all_tests
        if t['type'] in test_type
        and lower_char <= t['char_count'] <= upper_char
        and lower_exp <= t['exp_word_count'] <= upper_exp
    ]
    print('filtered size:', len(filtered_tests))

    if n == 0:
        return filtered_tests[start:end]

    if n == -1:
        # Group explicitly by type rather than indexing into assumed contiguous
        # runs: the old randint(count, count + val) was inclusive at both ends,
        # so it could return the next type's test or run past the end of the list.
        tests_by_type = {}
        for t in filtered_tests:
            tests_by_type.setdefault(t['type'], []).append(t)
        each_test = [random.choice(group) for group in tests_by_type.values()]

        print("sampled size:", len(each_test))
        return each_test

    if n < 0:
        raise ValueError(f"n must be 0, -1 or a positive integer, got {n}")

    # Cap the sample size so random.sample never asks for more than exists.
    return random.sample(filtered_tests, min(n, len(filtered_tests)))
    
""" def get_test_type(test_type, start=0, end=None, lower=0, upper=float('inf')):
    tests = [t for t in all_tests if t['type'] in test_type and lower <= t['char_count'] <= upper]
    return tests[start:end]

def get_random_tests(n=5, lower=0, upper=float('inf'), test_type=POSSIBLE_TYPES):
    filtered_tests = get_test_type(test_type=test_type, lower=lower, upper=upper) #[t for t in all_tests if lower <= t['char_count'] <= upper]
    sample_size = min(n, len(filtered_tests)) #prevent error
    return random.sample(filtered_tests, sample_size) """

" def get_test_type(test_type, start=0, end=None, lower=0, upper=float('inf')):\n    tests = [t for t in all_tests if t['type'] in test_type and lower <= t['char_count'] <= upper]\n    return tests[start:end]\n\ndef get_random_tests(n=5, lower=0, upper=float('inf'), test_type=POSSIBLE_TYPES):\n    filtered_tests = get_test_type(test_type=test_type, lower=lower, upper=upper) #[t for t in all_tests if lower <= t['char_count'] <= upper]\n    sample_size = min(n, len(filtered_tests)) #prevent error\n    return random.sample(filtered_tests, sample_size) "

In [159]:
# n=-1 asks get_tests for one random test per type; upper_char=300 keeps only
# prompts whose char_count is at most 300.
tests = get_tests(
    n=-1,
    upper_char=300,
)
pprint(tests)

filtered size: 440
3
[{'char_count': 274,
  'exp_word_count': 59,
  'expected': '\n'
              '    exit_codes = []\n'
              '\n'
              '    def execute_file(file):\n'
              '        file_path = file\n'
              '        process = subprocess.Popen(file_path)\n'
              '        time.sleep(1)  # wait for the process to start\n'
              '        exit_codes.append(process.poll())  # store the exit '
              'code\n'
              '\n'
              '    # Start a thread for each file\n'
              '    threads = [threading.Thread(target=execute_file, '
              'args=(file,)) for file in file_list]\n'
              '    for thread in threads:\n'
              '        thread.start()\n'
              '\n'
              '    # Wait for all threads to finish\n'
              '    for thread in threads:\n'
              '        thread.join()\n'
              '\n'
              '    return exit_codes',
  'id': 'coding_0_99_99',
  'pro

In [94]:
#simple hello world call to kick off the commits
#direct_call(prompt="how do I find the derivative of y=x^2 using python?")

In [95]:
def interactive_chat():
    """
    Run a simple console chat loop against call_model_chat_completions.

    The endpoint wrapper takes a single prompt string, so earlier turns are
    carried by embedding the running ``messages`` list in each prompt. Typing
    'exit' or 'quit' (any case, surrounding spaces ignored), or sending
    EOF / an interrupt at the input prompt, ends the loop.

    Only successful turns are added to the history, so a failed request does
    not leave a "None" response in later prompts.
    """
    messages = ["<Start of message history>"]
    count = 0
    while True:
        try:
            user_input = input("You: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted cell: leave the loop cleanly.
            print("Exiting chat.")
            break
        if user_input.strip().lower() in ('exit', 'quit'):
            print("Exiting chat.")
            break

        response = call_model_chat_completions(prompt=f"Old messages{messages}, CURRENT USER INPUT:{user_input} <--- ANSWER THIS QUESTION", temperature=0.7)

        if response["ok"]:
            # The content can be None even on success; normalise before use.
            reply = (response["text"] or "").strip()
            count += 1
            messages.append(f"MESSAGE_{count}_[previous user input: {user_input}, previous system response: {reply}]")
            print("Model:", reply)
        else:
            print("Error:", response["error"])

        # Show the accumulated history after every turn.
        print(messages)
#interactive_chat()

In [None]:
""" def execute_tests():
    rows = []
    for t in tests:
        r = call_model_chat_completions(
            prompt,
            system=system,
            model=model,
            temperature=0.3,
            max_tokens=128
        ) """

' def execute_tests():\n    rows = []\n    for t in tests:\n        r = call_model_chat_completions(\n            prompt,\n            system=system,\n            model=model,\n            temperature=0.3,\n            max_tokens=128\n        ) '

In [198]:
def self_evaluate(question, prediction, expected_answer, model=MODEL):
    """
    Use the model itself as a strict grader.

    Args:
        question: the prompt that was asked.
        prediction: the answer being graded.
        expected_answer: the reference answer (any type; rendered with str()).
        model: model identifier used for the grading call.

    Returns:
        True if the grader's reply starts with "true", False if it starts with
        "false" (case-insensitive, ignoring surrounding quotes/markup).
        If the reply is missing or malformed (including a failed API call),
        falls back to a normalized string comparison of prediction and
        expected_answer, so the result is always a bool.
    """
    import re

    system = "You are a strict grader. Reply with exactly True or False. No punctuation. No explanation."
    prompt = f"""You are grading a question-answer pair.

Return exactly True if the PREDICTION would be accepted as correct for the EXPECTED_ANSWER.
Otherwise, return False.

QUESTION:
{question}

PREDICTION:
{prediction}

EXPECTED_ANSWER:
{expected_answer}

Answer with exactly: True or False
"""

    r = call_model_chat_completions(
        prompt,
        system=system,
        model=model,
        temperature=0.3,
    )

    # Tolerate wrappers such as quotes, backticks or markdown emphasis around the verdict.
    reply = (r.get("text") or "").strip().lower().lstrip("\"'`*( ")
    if reply.startswith("true"):
        return True
    if reply.startswith("false"):
        return False

    # Fallback: compare the two answers after collapsing whitespace, dropping
    # surrounding periods and lower-casing.
    def _normalize(value):
        return re.sub(r"\s+", " ", str(value)).strip().strip(".").strip().lower()

    return _normalize(prediction) == _normalize(expected_answer)


In [None]:
def self_evaluate2(question, model_output, prediction, expected_answer, model=MODEL):
    """
    Ask the model to judge whether an automated grader's verdict was right.

    The prompt presents the original question, the model's output, the
    autograder's verdict (``prediction``) and the reference answer, and asks
    for a bare True/False.

    Returns:
        True when the reply starts with "true", False when it starts with
        "false" (case-insensitive, after stripping whitespace). Any other
        reply yields None; no fallback comparison is implemented.
    """
    grader_system = "You are a strict grader. Reply with exactly True or False. No punctuation. No explanation."
    grader_prompt = f"""You are grading an automated evaluation algorithm that returns true or false depending on the model's output.

Return exactly True if the automated grading algorithm's PREDICTION on the correctness of MODEL_OUTPUT would be accepted as correct for the EXPECTED_ANSWER.
Otherwise, return False.

QUESTION (The user prompt):
{question}

MODEL_OUTPUT (Output returned from model for user prompt)
{model_output}

PREDICTION (autograder correctness output):
{prediction}

EXPECTED_ANSWER (does MODEL_OUTPUT actually contain the correct answer):
{expected_answer}

Answer with exactly: True or False, depending if the autograder returned the correct grade.
"""

    response = call_model_chat_completions(
        grader_prompt,
        system=grader_system,
        model=model,
        temperature=0.3,
    )

    verdict_text = (response.get("text") or "").strip().lower()
    for label, verdict in (("true", True), ("false", False)):
        if verdict_text.startswith(label):
            return verdict

    # Unrecognised reply: nothing to fall back on yet.
    return None


In [191]:
# Canonical spellings for yes/no style answers.
tf_map = {'yes': 'true', 'no': 'false'}


def map_tf(output):
    """
    Map a bare yes/no answer onto 'true'/'false'.

    The comparison ignores case, surrounding whitespace and surrounding
    periods, so "Yes.", " no " and "YES\n" are all recognised. Anything else,
    including longer sentences that merely start with yes/no and non-string
    values, is returned unchanged.
    """
    if not isinstance(output, str):
        return output
    key = output.strip().strip('.').strip().lower()
    return tf_map.get(key, output)

In [249]:


def basic_match_check(test, output):
    """
    Cheap textual check: does the expected answer appear in the model output?

    Both the expected answer and the output are passed through map_tf, so a
    bare yes/no on either side is compared as true/false. The search is
    case-insensitive and treats the expected answer literally (it is
    regex-escaped). When the expected answer starts or ends with a word
    character, that edge must not touch another word character in the output,
    so an expected "8" does not match inside "18".

    Args:
        test: test dict; only test["expected"] is read (any type, rendered with str()).
        output: model output; None is treated as an empty string.

    Returns:
        True if the expected answer is found, else False.
    """
    expected = str(map_tf(str(test["expected"])))
    candidate = str(map_tf("" if output is None else str(output)))

    pattern = re.escape(expected)
    # Add boundary guards only on edges that are word characters, so answers
    # that begin or end with punctuation/whitespace (e.g. code) still match as-is.
    if re.match(r"\w", expected):
        pattern = r"(?<!\w)" + pattern
    if re.search(r"\w\Z", expected):
        pattern = pattern + r"(?!\w)"

    return re.search(pattern, candidate, re.IGNORECASE) is not None

In [260]:
def seperator(text, tokens_used=None, max_tokens=400):
    """
    Print a section heading followed by a 32-character dashed rule.

    Args:
        text: heading text.
        tokens_used: completion tokens consumed; when it can be read as an
            integer the heading is suffixed with "(TOKENS USED: used/max)".
            None, an empty string or any other non-numeric value prints the
            bare heading instead of raising.
        max_tokens: the token cap the usage is compared against.

    A truncation warning is printed when the usage reaches the cap (>=):
    a response cannot use more than max_tokens, so hitting the cap is the
    signal that the output was cut off.
    """
    try:
        used = int(tokens_used)
    except (TypeError, ValueError):
        used = None  # usage unknown

    if used is None:
        print(text)
    else:
        print(f'{text} (TOKENS USED: {tokens_used}/{max_tokens})')
        if used >= max_tokens:
            print('MAXED TOKENS REACHED - OUTPUT TRUNCATED')
    print('-'*32)

In [258]:
def self_evaluate_tests(tests, model=MODEL, grader_model=None, sleep_sec=0.2, verbose=True, max_tokens=400):
    """
    Run the tests by querying the model for each prompt, then grade each answer
    with both a textual match (basic_match_check) and LLM-as-a-judge
    (self_evaluate). A test counts as correct only when both agree it is.

    Args:
        tests: list of dicts with keys: id, prompt, expected (and optionally type)
        model: model used to generate predictions
        grader_model: model used to judge correctness (defaults to `model` if None)
        sleep_sec: small delay between calls to be polite to the API
        verbose: if True, print the test case, model output and evaluation for each test
        max_tokens: completion-token cap for the prediction call

    Returns:
        rows: list of dicts, one per test, with fields:
              id, expected, got, correct, status, error,
              plus match_check, self_eval and agreed for diagnostics.
              An empty `tests` list yields an empty list.
    """
    import time

    judge_model = grader_model or model
    rows = []

    for t in tests:
        if verbose:
            print('='*64)
            seperator('TEST_CASE')
            print_test(t)

        # 1) Get model prediction
        r = call_model_chat_completions(
            f"{t['prompt']}",
            system="Give a short answer to each prompt, don't explain.",
            model=model,
            temperature=0.3,
            max_tokens=max_tokens
        )
        got = map_tf((r.get("text") or "").strip())

        # Pass the usage on only when it is a real integer; None makes
        # seperator print the bare heading instead of choking on a placeholder.
        tokens_used = r.get("tokens_used")
        if isinstance(tokens_used, bool) or not isinstance(tokens_used, int):
            tokens_used = None

        if verbose:
            seperator('\nMODEL_OUTPUT', tokens_used, max_tokens)
            display(Markdown(f"\n{got}"))

        if r.get("ok"):
            # 2) Cheap textual check
            match_check = bool(basic_match_check(t, got))

            # 3) LLM-as-a-judge: strict True/False
            is_correct = bool(self_evaluate(
                question=t["prompt"],
                prediction=got,
                expected_answer=t["expected"],
                model=judge_model,
            ))
        else:
            # The prediction call failed: nothing to grade, skip the judge call.
            match_check = False
            is_correct = False

        correctness = is_correct and match_check
        agreement = is_correct == match_check

        if verbose:
            seperator('\nEVALUATION')
            if not r.get("ok"):
                print('Error:', r.get("error"))
            print('match check:', match_check)
            print('self_eval:', is_correct)
            print('✅ CORRECT' if correctness else '❌ INCORRECT')
            print('🆗 AGREED' if agreement else '🆘 DISAGREED')

        rows.append({
            "id": t.get("id"),
            "expected": t["expected"],
            "got": got,
            "correct": correctness,
            "status": r.get("status"),
            "error": r.get("error"),
            "match_check": match_check,
            "self_eval": is_correct,
            "agreed": agreement,
        })

        if sleep_sec:
            time.sleep(sleep_sec)

    return rows

# Example:


In [120]:
import re

In [262]:
# Draw five random tests from the full pool, then have the same model both
# answer them and act as the judge.
test_prompts = get_tests(n=5)
results_llm_judge = self_evaluate_tests(
    test_prompts,
    model=MODEL,
    grader_model=MODEL,
    verbose=True,
)

filtered size: 1000
TEST_CASE
--------------------------------
{
  "id": "common_sense_1_204_304",
  "type": "common_sense",
  "prompt": "What American stage, film, and television actor  who also appeared in a large number of musicals, played Samson in the 1949 film \"Samson and Delilah\".",
  "expected": "Victor John Mature",
  "char_count": 150,
  "exp_word_count": 3
}

MODEL_OUTPUT (TOKENS USED: 3/400)
--------------------------------



Gary Cooper


EVALUATION
--------------------------------
match check: False
self_eval: False
‚ùå INCORRECT
üÜó AGREED
TEST_CASE
--------------------------------
{
  "id": "common_sense_1_284_384",
  "type": "common_sense",
  "prompt": "Is capturing giant squid in natural habitat impossible with no gear?",
  "expected": true,
  "char_count": 68,
  "exp_word_count": 1
}

MODEL_OUTPUT (TOKENS USED: 3/400)
--------------------------------



true


EVALUATION
--------------------------------
match check: True
self_eval: True
‚úÖ CORRECT
üÜó AGREED
TEST_CASE
--------------------------------
{
  "id": "coding_0_57_57",
  "type": "coding",
  "prompt": "Counts the number of words, characters, and unique characters in a given text.\nNote that: This function considers whitespace-separated substrings as words. When counting characters, this function excludes whitespace and special characters (i.e. string.punctuation).\nThe function should output with:\n    tuple: A tuple containing three integers: the number of words,\n    the number of characters,\n    the number of unique characters.\nYou should write self-contained code starting with:\n```\nimport string\nimport re\ndef task_func(text: str) -> tuple:\n```",
  "expected": "    words = text.split()\n    chars = re.sub(\"\\s\", \"\", re.sub(f\"[{string.punctuation}]\", \"\", text))\n\n    return len(words), len(chars), len(set(chars))",
  "char_count": 551,
  "exp_word_count": 14
}

M


```python
import string
import re

def task_func(text: str) -> tuple:
    words = len(text.split())
    chars = len(re.sub(r'[^a-zA-Z0-9]', '', text))
    unique_chars = len(set(re.sub(r'[^a-zA-Z0-9]', '', text)))
    return (words, chars, unique_chars)
```


EVALUATION
--------------------------------
match check: False
self_eval: False
‚ùå INCORRECT
üÜó AGREED
TEST_CASE
--------------------------------
{
  "id": "planning_4_36_936",
  "type": "planning",
  "prompt": "I have to plan the logistics of transporting crates between a number of depots and distributors via trucks that are loaded by hoists. Depots and distributors are directly connected by roads (trucks can drive between any two depots or distributors).\n\nA depot is a type of place.\nA distributor is a type of place.\nA pallet is a type of surface.\nA crate is a type of surface.\n\nHere are the actions that can be performed:\n\nDrive a truck from one place to another place.\nUse a hoist to lift a crate from a surface at a place.\nUse a hoist to drop a crate to a surface at a place.\nUse a hoist to load a crate into a truck at a place.\nUse a hoist to unload a crate from a truck at a place.\n\nThe following are the restrictions on the actions:\nA truck can be driven from one pla


Use hoist1 to lift crate0 from pallet1 at depot1  
Use hoist1 to load crate0 into truck2 at depot1  
Drive truck2 from depot1 to distributor0  
Use hoist3 to unload crate0 from truck2 at distributor0  
Use hoist3 to drop crate0 to pallet3 at distributor0  
Use hoist3 to lift crate2 from pallet3 at distributor0  
Use hoist3 to load crate2 into truck0 at distributor0  
Drive truck0 from distributor0 to depot0  
Use hoist0 to unload crate2 from truck0 at depot0  
Use hoist0 to drop crate2 to pallet0 at depot0


EVALUATION
--------------------------------
match check: False
self_eval: True
‚ùå INCORRECT
üÜò DISAGREED
TEST_CASE
--------------------------------
{
  "id": "planning_4_81_981",
  "type": "planning",
  "prompt": "I am playing with a set of objects. Here are the actions I can do\n\n   Attack object\n   Feast object from another object\n   Succumb object\n   Overcome object from another object\n\nI have the following restrictions on my actions:\n    To perform Attack action, the following facts need to be true: Province object, Planet object, Harmony.\n    Once Attack action is performed the following facts will be true: Pain object.\n    Once Attack action is performed the following facts will be false: Province object, Planet object, Harmony.\n    To perform Succumb action, the following facts need to be true: Pain object.\n    Once Succumb action is performed the following facts will be true: Province object, Planet object, Harmony.    \n    Once Succumb action is performed the f


feast object a from object d  
overcome object a from object b  
attack object d  
succumb object d  
feast object d from object c  
overcome object d from object c  
attack object c  
succumb object c  
feast object c from object a  
overcome object c from object a


EVALUATION
--------------------------------
match check: False
self_eval: False
‚ùå INCORRECT
üÜó AGREED
