In [None]:
# %% Minimal setup
# If needed (uncomment in a notebook):
# !pip install requests python-dotenv

import os, json, textwrap, re, time
import requests

# Connection settings for the OpenAI-style endpoint. Each value is read from
# the environment first; the literal on the right is used when the variable
# is not set.
API_KEY = os.environ.get("OPENAI_API_KEY", "cse476")
API_BASE = os.environ.get("API_BASE", "http://10.4.58.53:41701/v1")
MODEL = os.environ.get("MODEL_NAME", "bens_model")

def call_model_chat_completions(prompt: str,
                                system: str = "You are a helpful assistant. Reply with only the final answer—no explanation.",
                                model: str = MODEL,
                                temperature: float = 0.3,
                                timeout: int = 60,
                                max_tokens: int = 128) -> dict:
    """
    Call an OpenAI-style /v1/chat/completions endpoint with one system and one user message.

    Args:
        prompt: content of the user message.
        system: content of the system message.
        model: model name placed in the request payload.
        temperature: sampling temperature placed in the request payload.
        timeout: timeout (seconds) passed to requests.post.
        max_tokens: completion token limit placed in the request payload.

    Returns:
        A dict that always has the same keys:
        { 'ok': bool, 'text': str or None, 'raw': dict or None, 'status': int,
          'error': str or None, 'headers': dict }
        - HTTP 200 with a JSON body: ok=True, text is the first choice's message
          content ("" when the response has no choices or no content).
        - HTTP 200 with a body that is not valid JSON: ok=False with the real
          status and headers and a description in 'error'.
        - Any other HTTP status: ok=False, 'error' holds the JSON or text body.
        - The request itself raised (connection error, timeout, ...): ok=False,
          status=-1, headers={}.
    """
    url = f"{API_BASE}/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type":  "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user",   "content": prompt}
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as e:
        return {"ok": False, "text": None, "raw": None, "status": -1, "error": str(e), "headers": {}}

    status = resp.status_code
    hdrs   = dict(resp.headers)

    if status != 200:
        # Best effort: surface a structured error body if there is one, else the raw text.
        try:
            err_text = resp.json()
        except Exception:
            err_text = resp.text
        return {"ok": False, "text": None, "raw": None, "status": status, "error": str(err_text), "headers": hdrs}

    try:
        data = resp.json()
    except ValueError as e:
        # JSON decoding errors are ValueError subclasses; report them against the
        # real HTTP status instead of letting them escape or look like a transport error.
        return {"ok": False, "text": None, "raw": None, "status": status,
                "error": f"invalid JSON in response body: {e}", "headers": hdrs}

    # Walk choices[0].message.content defensively: an empty "choices" list or a
    # null "message"/"content" yields "" instead of an IndexError/AttributeError.
    text = ""
    if isinstance(data, dict):
        choices = data.get("choices")
        if isinstance(choices, list) and choices and isinstance(choices[0], dict):
            message = choices[0].get("message")
            if isinstance(message, dict):
                text = message.get("content") or ""

    return {"ok": True, "text": text, "raw": data, "status": status, "error": None, "headers": hdrs}


In [107]:
from IPython.display import Markdown, display

In [89]:
# %% Direct call example
def direct_call(prompt="What is 17 + 28? Answer with just the number.", temperature=0.2, max_tokens=128):
    """
    Send a single prompt through call_model_chat_completions and print the outcome.

    Prints the ok flag and HTTP status, the stripped model text, the error
    message when the call failed, and any of the rate-limit / request-id
    headers that are present in the response.

    Args:
        prompt: user prompt to send.
        temperature: sampling temperature forwarded to the call.
        max_tokens: completion token limit forwarded to the call.
    """
    result = call_model_chat_completions(prompt, temperature=temperature, max_tokens=max_tokens)
    print("OK:", result["ok"], "HTTP:", result["status"])
    print("MODEL SAYS:", (result["text"] or "").strip())
    if not result["ok"] and result.get("error"):
        print("ERROR:", result["error"])

    # The headers come back as a plain dict, so key lookups are case-sensitive.
    # Normalise the names to lowercase so "X-RateLimit-..." style casing is found too.
    response_headers = {str(name).lower(): value for name, value in (result.get("headers") or {}).items()}

    # Optional: inspect rate-limit headers if the provider exposes them
    for k in ["x-ratelimit-remaining-requests", "x-ratelimit-limit-requests", "x-request-id"]:
        if k in response_headers:
            print(f"{k}: {response_headers[k]}")


In [90]:
# %% Define three tests: input + expected
# Three hand-written smoke tests. Every record has an "id", a "type" tag
# ("numeric" or "text"), the "prompt" to send and the "expected" answer.
my_tests = [
    dict(
        id="math_inequality",
        type="numeric",
        prompt="Solve for the smallest integer n such that 3n + 5 > 26. Answer with just the integer.",
        expected="8",  # 3n > 21 means n > 7, so the smallest integer is 8
    ),
    dict(
        id="commonsense_ice",
        type="text",
        prompt=(
            "You place an ice cube in a glass of water and mark the water level. After the ice melts, "
            "does the water level rise, fall, or stay the same? Answer with exactly one of: "
            "'rise', 'fall', 'stay the same'."
        ),
        expected="stay the same",
    ),
    dict(
        id="logic_race",
        type="text",
        prompt=(
            "In a race, you pass the person in second place. "
            "What position are you now in? Answer with a single word like 'first', 'second', 'third'."
        ),
        expected="second",
    ),
]


In [142]:
import json
from pprint import pprint
import random
from collections import Counter

POSSIBLE_TYPES = ['math', 'common_sense', 'planning', 'coding', 'future_prediction']

# Load the dev set inside a context manager so the file handle is closed as
# soon as parsing finishes (a bare json.load(open(...)) leaves it open).
with open("parsed_dev_data.json", "r", encoding="utf-8") as dev_file:
    raw_tests = json.load(dev_file)

# How many records each domain contributes.
type_counts = Counter(t['domain'] for t in raw_tests)
print(type_counts)

# Re-key every raw record into the shape the rest of the notebook uses
# (id / type / prompt / expected plus the two size fields used for filtering).
formatted_tests = [
    {
        "id": t['id'], # domain_domainIndex_domainTestIndex_testIndex
        "type": t['domain'],
        "prompt": t['input'],
        "expected": t['output'],
        "char_count": t['input_char_count'],
        "exp_word_count": t['exp_word_count']
    }
    for t in raw_tests
]

all_tests = formatted_tests

Counter({'common_sense': 400, 'math': 300, 'coding': 100, 'future_prediction': 100, 'planning': 100})


In [None]:
def print_test(test):
    """Print one test record as 2-space-indented JSON, keeping non-ASCII characters unescaped."""
    rendered = json.dumps(test, ensure_ascii=False, indent=2)
    print(rendered)

def get_tests(n=0, test_type=POSSIBLE_TYPES, start=0, end=None, lower_char=0, upper_char=float('inf'), lower_exp=0, upper_exp=float('inf')):
    """
    Generalized test selector over the module-level ``all_tests`` list.

    Tests are first filtered by type and by inclusive bounds on their
    'char_count' and 'exp_word_count' fields; the size of the filtered list is
    printed. What is returned then depends on ``n``.

    Args:
        n: selection mode.
            0   -> the filtered tests sliced as ``[start:end]`` (no randomness).
            -1  -> one randomly chosen test for every type that survived the
                   filter, in order of first appearance.
            > 0 -> a random sample without replacement of ``min(n, len(filtered))`` tests.
        test_type: list of type names to keep. A single name given as a plain
            string is treated as a one-element list.
        start, end: slice bounds, used only when ``n == 0``.
        lower_char, upper_char: inclusive bounds on ``t['char_count']``.
        lower_exp, upper_exp: inclusive bounds on ``t['exp_word_count']``.

    Returns:
        A list of test dicts.

    Raises:
        ValueError: if ``n`` is negative and not -1.
    """
    if isinstance(test_type, str):
        # Avoid substring matching ('mat' in 'math') when a bare string is passed.
        test_type = [test_type]

    filtered_tests = [
        t for t in all_tests
        if t['type'] in test_type
        and lower_char <= t['char_count'] <= upper_char
        and lower_exp <= t['exp_word_count'] <= upper_exp
    ]
    print('filtered size:', len(filtered_tests))

    if n == 0:
        return filtered_tests[start:end]

    if n == -1:
        # Bucket by type explicitly. Indexing the flat list with
        # randint(count, count + val) is inclusive on both ends, so it could pick
        # a test from the next type or run past the end of the list, and it also
        # relied on the list being contiguous by type.
        tests_by_type = {}
        for t in filtered_tests:
            tests_by_type.setdefault(t['type'], []).append(t)
        each_test = [random.choice(group) for group in tests_by_type.values()]

        print("sampled size:", len(each_test))
        return each_test

    if n < 0:
        raise ValueError(f"n must be 0, -1 or a positive integer, got {n!r}")

    # Cap the sample size so asking for more tests than exist is not an error.
    sample_size = min(n, len(filtered_tests))
    return random.sample(filtered_tests, sample_size)
    
# Disabled code: two earlier helpers (get_test_type / get_random_tests) kept as a
# bare triple-quoted string, so nothing in here is ever defined or executed.
# The comment on get_tests ("generalized get test function") suggests get_tests
# replaced them -- TODO confirm and delete.
# NOTE: because this string literal is the last expression in the cell, the
# notebook echoes its repr as the cell output.
""" def get_test_type(test_type, start=0, end=None, lower=0, upper=float('inf')):
    tests = [t for t in all_tests if t['type'] in test_type and lower <= t['char_count'] <= upper]
    return tests[start:end]

def get_random_tests(n=5, lower=0, upper=float('inf'), test_type=POSSIBLE_TYPES):
    filtered_tests = get_test_type(test_type=test_type, lower=lower, upper=upper) #[t for t in all_tests if lower <= t['char_count'] <= upper]
    sample_size = min(n, len(filtered_tests)) #prevent error
    return random.sample(filtered_tests, sample_size) """

" def get_test_type(test_type, start=0, end=None, lower=0, upper=float('inf')):\n    tests = [t for t in all_tests if t['type'] in test_type and lower <= t['char_count'] <= upper]\n    return tests[start:end]\n\ndef get_random_tests(n=5, lower=0, upper=float('inf'), test_type=POSSIBLE_TYPES):\n    filtered_tests = get_test_type(test_type=test_type, lower=lower, upper=upper) #[t for t in all_tests if lower <= t['char_count'] <= upper]\n    sample_size = min(n, len(filtered_tests)) #prevent error\n    return random.sample(filtered_tests, sample_size) "

In [159]:
# n=-1 selects get_tests' "one test per type" mode; only tests whose
# char_count is at most 300 are considered.
tests = get_tests(
    n=-1,
    upper_char=300,
)
pprint(tests)

filtered size: 440
3
[{'char_count': 274,
  'exp_word_count': 59,
  'expected': '\n'
              '    exit_codes = []\n'
              '\n'
              '    def execute_file(file):\n'
              '        file_path = file\n'
              '        process = subprocess.Popen(file_path)\n'
              '        time.sleep(1)  # wait for the process to start\n'
              '        exit_codes.append(process.poll())  # store the exit '
              'code\n'
              '\n'
              '    # Start a thread for each file\n'
              '    threads = [threading.Thread(target=execute_file, '
              'args=(file,)) for file in file_list]\n'
              '    for thread in threads:\n'
              '        thread.start()\n'
              '\n'
              '    # Wait for all threads to finish\n'
              '    for thread in threads:\n'
              '        thread.join()\n'
              '\n'
              '    return exit_codes',
  'id': 'coding_0_99_99',
  'pro

In [94]:
#simple hello world call to kick off the commits
#direct_call(prompt="how do I find the derivative of y=x^2 using python?")

In [95]:
def interactive_chat():
    """
    Run a simple console chat loop against the model.

    Each turn reads a line from input(), sends it together with a stringified
    history of earlier exchanges, and prints the model's reply. Typing 'exit'
    or 'quit' (any case, surrounding whitespace ignored) ends the loop.

    Only successful exchanges are added to the history: a failed call prints
    its error and is not recorded, so the history never contains
    "previous system response: None" entries.
    """
    messages = ["<Start of message history>"]
    count = 0
    while True:
        user_input = input("You: ")
        if user_input.strip().lower() in ('exit', 'quit'):
            print("Exiting chat.")
            break

        response = call_model_chat_completions(prompt=f"Old messages{messages}, CURRENT USER INPUT:{user_input} <--- ANSWER THIS QUESTION", temperature=0.7)
        if not response["ok"]:
            print("Error:", response["error"])
            continue

        # Guard against a successful response that carries no text.
        reply = response["text"] or ""
        count += 1
        messages.append(f"MESSAGE_{count}_[previous user input: {user_input}, previous system response: {reply}]")
        print("Model:", reply.strip())
#interactive_chat()

In [96]:
# Disabled code: an unfinished draft of a test runner kept as a bare
# triple-quoted string, so execute_tests is never defined or executed.
# The draft refers to prompt, system and model, none of which it defines.
# NOTE: as the only expression in the cell, the string's repr is echoed as the
# cell output.
""" def execute_tests():
    rows = []
    for t in tests:
        r = call_model_chat_completions(
            prompt,
            system=system,
            model=model,
            temperature=0.3,
            max_tokens=128
        ) """

' def execute_tests():\n    rows = []\n    for t in tests:\n        r = call_model_chat_completions(\n            prompt,\n            system=system,\n            model=model,\n            temperature=0.3,\n            max_tokens=128\n        ) '

In [198]:
def self_evaluate(question, prediction, expected_answer, model=MODEL):
    """
    Use the model itself as a strict grader.

    The grader is asked to reply with exactly True or False. The reply is
    stripped and lowercased; a reply starting with "true" gives True and one
    starting with "false" gives False.

    If the reply starts with neither (this includes a failed request, where the
    reply text is empty), the function falls back to comparing `prediction` and
    `expected_answer` as strings after lowercasing and collapsing whitespace.

    Args:
        question: the prompt that was given to the model under test.
        prediction: the answer produced by the model under test.
        expected_answer: the reference answer (converted with str() for the fallback).
        model: model name used for the grading call.

    Returns:
        bool: True if the prediction is judged correct, else False.
    """
    system = "You are a strict grader. Reply with exactly True or False. No punctuation. No explanation."
    prompt = f"""You are grading a question-answer pair.

Return exactly True if the PREDICTION would be accepted as correct for the EXPECTED_ANSWER.
Otherwise, return False.

QUESTION:
{question}

PREDICTION:
{prediction}

EXPECTED_ANSWER:
{expected_answer}

Answer with exactly: True or False
"""

    r = call_model_chat_completions(
        prompt,
        system=system,
        model=model,
        temperature=0.3,
    )

    reply = (r.get("text") or "").strip().lower()
    if reply.startswith("true"):
        return True
    if reply.startswith("false"):
        return False

    # Fallback for a malformed or missing grader reply: normalized string compare.
    def _normalize(value):
        return " ".join(str(value).split()).lower()

    return _normalize(prediction) == _normalize(expected_answer)


In [None]:
def self_evaluate2(question, model_output, prediction, expected_answer, model=MODEL):
    """
    Use the model to judge whether an automated grader's verdict was right.

    The grading prompt shows the original question, the model's output, the
    autograder's verdict (`prediction`) and `expected_answer`, and asks for
    exactly True or False.

    Args:
        question: the user prompt that was given to the model under test.
        model_output: the text the model under test returned.
        prediction: the autograder's correctness verdict for model_output.
        expected_answer: the value the verdict is checked against.
        model: model name used for the grading call.

    Returns:
        bool: True if the (stripped, lowercased) reply starts with "true".
        False if it starts with "false", and also False when the reply is
        malformed or the request failed (there is no fallback comparison).
    """
    system = "You are a strict grader. Reply with exactly True or False. No punctuation. No explanation."
    prompt = f"""You are grading an automated evaluation algorithm that returns true or false depending on the model's output.

Return exactly True if the automated grading algorithm's PREDICTION on the correctness of MODEL_OUTPUT would be accepted as correct for the EXPECTED_ANSWER.
Otherwise, return False.

QUESTION (The user prompt):
{question}

MODEL_OUTPUT (Output returned from model for user prompt)
{model_output}

PREDICTION (autograder correctness output):
{prediction}

EXPECTED_ANSWER (does MODEL_OUTPUT actually contain the correct answer):
{expected_answer}

Answer with exactly: True or False, depending if the autograder returned the correct grade.
"""

    r = call_model_chat_completions(
        prompt,
        system=system,
        model=model,
        temperature=0.3,
    )

    reply = (r.get("text") or "").strip().lower()
    if reply.startswith("true"):
        return True

    # Explicit False for "false" and for any malformed or empty reply, so callers
    # always receive a bool rather than an implicit None.
    return False


In [191]:
# Canonical spellings for yes/no style answers.
tf_map = {'yes':'true', 'no':'false'}
def map_tf(output):
    """
    Map a bare yes/no answer onto 'true'/'false'.

    The answer is lowercased and surrounding whitespace and periods are
    ignored for the lookup, so "Yes.", " no" and "YES.\\n" are all recognised.
    Anything else, including non-string input, is returned unchanged.

    Args:
        output: the model's answer.

    Returns:
        'true' or 'false' when the answer is just yes/no, otherwise `output` as given.
    """
    if not isinstance(output, str):
        return output
    out = output.lower().strip(" \t\r\n.")
    return tf_map.get(out, output)

In [192]:


def basic_match_check(test, output):
    """
    Check whether the expected answer appears literally in the model output.

    The output is first passed through map_tf, then searched case-insensitively
    for str(test["expected"]) as a literal substring. The expected answer is
    escaped before being used as a pattern, so answers containing regex
    metacharacters (e.g. "6+9i", "f(x)") match themselves instead of being
    interpreted as a regular expression or raising re.error.

    Args:
        test: dict with an "expected" entry.
        output: the model's answer text.

    Returns:
        bool: True if at least one occurrence is found (the matches are
        printed), else False.
    """
    exp = test["expected"]

    output = map_tf(output)

    literal_pattern = re.escape(str(exp))
    matches = re.findall(literal_pattern, output, re.IGNORECASE)

    if matches:
        print('MATCH(ES) FOUND:', matches)
        return True

    print('NO MATCH FOUND')
    return False

In [201]:
def self_evaluate_tests(tests, model=MODEL, grader_model=None, sleep_sec=0.2, verbose=True):
    """
    Run the tests by querying the model for each prompt, then use LLM-as-a-judge
    (self_evaluate) to determine correctness.

    If the generation request itself fails (the call result has ok == False),
    the test is recorded as incorrect and the judge is not called.

    Args:
        tests: list of dicts with keys: id, prompt, expected (and optionally type)
        model: model used to generate predictions
        grader_model: model used to judge correctness (defaults to `model` if None)
        sleep_sec: delay in seconds after each test; falsy disables the delay
        verbose: if True, print the test, the model output, the basic substring
                 check, the result row and a summary line per test; if False,
                 this function prints nothing itself

    Returns:
        rows: list of dicts with fields:
              id, expected, got, correct, status, error
    """
    import time

    judge_model = grader_model or model
    rows = []
    for t in tests:
        if verbose:
            print_test(t)

        # 1) Get model prediction
        r = call_model_chat_completions(
            f"{t['prompt']}",
            system="Give a short answer to each prompt, don't explain.",
            model=model,
            temperature=0.3,
            max_tokens=400
        )
        got = (r.get("text") or "").strip()
        got = map_tf(got)
        if verbose:
            display(Markdown(f"OUTPUT: \n{got}"))
            print('raw: ', got)
            # Cheap literal-match check shown next to the judge's verdict for comparison.
            print(basic_match_check(t, got))

        # 2) LLM-as-a-judge: strict True/False.
        # Skip the judge when there is no prediction to grade.
        if r.get("ok"):
            is_correct = self_evaluate(
                question=t["prompt"],
                prediction=got,
                expected_answer=t["expected"],
                model=judge_model,
            )
        else:
            is_correct = False

        row = {
            "id": t.get("id", "<unnamed>"),
            "expected": t["expected"],
            "got": got,
            "correct": bool(is_correct),
            "status": r.get("status"),
            "error": r.get("error"),
        }
        rows.append(row)

        if verbose:
            print(json.dumps(row, indent=2, ensure_ascii=False))
            mark = "✅" if is_correct else "❌"
            print(f"{mark} {row['id']}: expected={row['expected']!r}, got={row['got']!r} (HTTP {row['status']})")
            if row["error"]:
                print("   error:", row["error"])

        if sleep_sec:
            time.sleep(sleep_sec)

    return rows

# Example:


In [120]:
import re

In [202]:
# Sample up to five tests with char_count <= 200 and exp_word_count <= 1, then
# generate and judge the answers with the same model.
test_prompts = get_tests(n=5, upper_char=200, upper_exp=1)
results_llm_judge = self_evaluate_tests(
    test_prompts,
    model=MODEL,
    grader_model=MODEL,
    verbose=True,
)

filtered size: 227
{
  "id": "math_3_272_872",
  "type": "math",
  "prompt": "Find the constant term in the expansion of $$\\left(10x^3-\\frac{1}{2x^2}\\right)^{5}$$",
  "expected": "-125",
  "char_count": 84,
  "exp_word_count": 1
}


OUTPUT: 
$-\frac{5}{32}$

raw:  $-\frac{5}{32}$
NO MATCH FOUND
False
{
  "id": "math_3_272_872",
  "expected": "-125",
  "got": "$-\\frac{5}{32}$",
  "correct": false,
  "status": 200,
  "error": null
}
❌ math_3_272_872: expected='-125', got='$-\\frac{5}{32}$' (HTTP 200)
{
  "id": "math_3_299_899",
  "type": "math",
  "prompt": "Evaluate $(1+2i)6-3i$.",
  "expected": "6+9i",
  "char_count": 22,
  "exp_word_count": 1
}


OUTPUT: 
$36 + 9i$

raw:  $36 + 9i$
NO MATCH FOUND
False
{
  "id": "math_3_299_899",
  "expected": "6+9i",
  "got": "$36 + 9i$",
  "correct": false,
  "status": 200,
  "error": null
}
❌ math_3_299_899: expected='6+9i', got='$36 + 9i$' (HTTP 200)
{
  "id": "math_3_216_816",
  "type": "math",
  "prompt": "Ten points are marked on a circle. How many distinct convex polygons of three or more sides can be drawn using some (or all) of the ten points as vertices? ",
  "expected": "968",
  "char_count": 156,
  "exp_word_count": 1
}


OUTPUT: 
425

raw:  425
NO MATCH FOUND
False
{
  "id": "math_3_216_816",
  "expected": "968",
  "got": "425",
  "correct": false,
  "status": 200,
  "error": null
}
❌ math_3_216_816: expected='968', got='425' (HTTP 200)
{
  "id": "math_3_248_848",
  "type": "math",
  "prompt": "The coordinates of a parallelogram are (5, 3), (6, 8), (7, 4) and $(x, y)$ and $x > 7$. What is the value of $x + y$?",
  "expected": "17",
  "char_count": 117,
  "exp_word_count": 1
}


OUTPUT: 
To find the value of $ x + y $, we use the property that the diagonals of a parallelogram bisect each other. The midpoint of the diagonals must be the same.

Let the points be $ A(5, 3) $, $ B(6, 8) $, $ C(7, 4) $, and $ D(x, y) $.

Assume $ A $ and $ C $ are one diagonal, and $ B $ and $ D $ are the other diagonal.

Midpoint of $ AC $:  
$$
\left( \frac{5 + 7}{2}, \frac{3 + 4}{2} \right) = \left( 6, \frac{7}{2} \right)
$$

Midpoint of $ BD $:  
$$
\left( \frac{6 + x}{2}, \frac{8 + y}{2} \right)
$$

Set the midpoints equal:  
$$
\frac{6 + x}{2} = 6 \quad \text{and} \quad \frac{8 + y}{2} = \frac{7}{2}
$$

Solve for $ x $:  
$$
6 + x = 12 \Rightarrow x = 6
$$

Solve for $ y $:  
$$
8 + y = 7 \Rightarrow y = -1
$$

Now, $ x + y = 6 + (-1) = 5 $

**Answer: 5**

raw:  To find the value of $ x + y $, we use the property that the diagonals of a parallelogram bisect each other. The midpoint of the diagonals must be the same.

Let the points be $ A(5, 3) $, $ B(6, 8) $, $ C(7, 4) $, and $ D(x, y) $.

Assume $ A $ and $ C $ are one diagonal, and $ B $ and $ D $ are the other diagonal.

Midpoint of $ AC $:  
$$
\left( \frac{5 + 7}{2}, \frac{3 + 4}{2} \right) = \left( 6, \frac{7}{2} \right)
$$

Midpoint of $ BD $:  
$$
\left( \frac{6 + x}{2}, \frac{8 + y}{2} \right)
$$

Set the midpoints equal:  
$$
\frac{6 + x}{2} = 6 \quad \text{and} \quad \frac{8 + y}{2} = \frac{7}{2}
$$

Solve for $ x $:  
$$
6 + x = 12 \Rightarrow x = 6
$$

Solve for $ y $:  
$$
8 + y = 7 \Rightarrow y = -1
$$

Now, $ x + y = 6 + (-1) = 5 $

**Answer: 5**
NO MATCH FOUND
False
{
  "id": "math_3_248_848",
  "expected": "17",
  "got": "To find the value of $ x + y $, we use the property that the diagonals of a parallelogram bisect each other. The midpoint of the diagonals must be t

OUTPUT: 
false

raw:  false
MATCH(ES) FOUND: ['false']
True
{
  "id": "common_sense_1_359_459",
  "expected": false,
  "got": "false",
  "correct": true,
  "status": 200,
  "error": null
}
✅ common_sense_1_359_459: expected=False, got='false' (HTTP 200)
