In [None]:
# %% Minimal setup
# If needed (uncomment in a notebook):
# !pip install requests python-dotenv

import os, json, textwrap, re, time
import requests

API_KEY  = os.getenv("OPENAI_API_KEY", "cse476")
API_BASE = os.getenv("API_BASE", "http://10.4.58.53:41701/v1")  
MODEL    = os.getenv("MODEL_NAME", "bens_model")              

def call_model_chat_completions(prompt: str,
                                system: str = "You are a helpful assistant. Reply with only the final answer—no explanation.",
                                model: str = MODEL,
                                temperature: float = 0.3,
                                timeout: int = 60,
                                max_tokens: int = 128) -> dict:
    """
    Calls an OpenAI-style /v1/chat/completions endpoint and returns:
    { 'ok': bool, 'text': str or None, 'raw': dict or None, 'status': int, 'error': str or None, 'headers': dict }
    """
    url = f"{API_BASE}/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type":  "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user",   "content": prompt}
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=timeout)
        status = resp.status_code
        hdrs   = dict(resp.headers)
        if status == 200:
            data = resp.json()
            text = data.get("choices", [{}])[0].get("message", {}).get("content", "")
            return {"ok": True, "text": text, "raw": data, "status": status, "error": None, "headers": hdrs}
        else:
            # try best-effort to surface error text
            err_text = None
            try:
                err_text = resp.json()
            except Exception:
                err_text = resp.text
            return {"ok": False, "text": None, "raw": None, "status": status, "error": str(err_text), "headers": hdrs}
    except requests.RequestException as e:
        return {"ok": False, "text": None, "raw": None, "status": -1, "error": str(e), "headers": {}}


In [107]:
from IPython.display import Markdown, display

In [89]:
# %% Direct call example
def direct_call(prompt="What is 17 + 28? Answer with just the number.", temperature=0.2, max_tokens=128):
    demo_prompt = prompt
    result = call_model_chat_completions(demo_prompt, temperature=temperature, max_tokens=max_tokens)
    print("OK:", result["ok"], "HTTP:", result["status"])
    print("MODEL SAYS:", (result["text"] or "").strip())

    # Optional: Inspect rate-limit headers if your provider exposes them
    for k in ["x-ratelimit-remaining-requests", "x-ratelimit-limit-requests", "x-request-id"]:
        if k in result["headers"]:
            print(f"{k}: {result['headers'][k]}")


In [90]:
# %% Define three tests: input + expected
my_tests = [
    {
        "id": "math_inequality",
        "type": "numeric",  # grader will prefer numeric extraction
        "prompt": "Solve for the smallest integer n such that 3n + 5 > 26. Answer with just the integer.",
        "expected": "8",    # Because 3n > 21 => n > 7, smallest integer is 8
    },
    {
        "id": "commonsense_ice",
        "type": "text",
        "prompt": (
            "You place an ice cube in a glass of water and mark the water level. "
            "After the ice melts, does the water level rise, fall, or stay the same? "
            "Answer with exactly one of: 'rise', 'fall', 'stay the same'."
        ),
        "expected": "stay the same",
    },
    {
        "id": "logic_race",
        "type": "text",
        "prompt": (
            "In a race, you pass the person in second place. What position are you now in? "
            "Answer with a single word like 'first', 'second', 'third'."
        ),
        "expected": "second",
    },
]


In [127]:
import json
from pprint import pprint
import random

POSSIBLE_TYPES = ['math', 'common_sense', 'planning', 'coding', 'future_prediction']

all_tests = json.load(open("parsed_dev_data.json", "r", encoding="utf-8"))

formatted_tests = []
for i, t in enumerate(all_tests, start=1):
    
    formatted_tests.append({
        "id": t['id'], # domain_domainIndex_domainTestIndex_testIndex
        "type": t['domain'],
        "prompt": t['input'],
        "expected": t['output'],
        "char_count": t['input_char_count'],
        "exp_word_count": t['exp_word_count']
    })
    
all_tests = formatted_tests

In [None]:
def print_test(test):
    print(json.dumps(test, indent=2, ensure_ascii=False))

#pass test_type as a list of types
#generalized get test function
def get_tests(n=0, test_type=POSSIBLE_TYPES, start=0, end=None, lower_char=0, upper_char=float('inf'), lower_exp=0, upper_exp=float('inf')):
    filtered_tests = [t for t in all_tests if t['type'] in test_type and lower_char <= t['char_count'] <= upper_char and lower_exp <= t['exp_word_count'] <= upper_exp]

    if n == 0:
        return filtered_tests[start:end]
    else:
        sample_size = min(n, len(filtered_tests))
        return random.sample(filtered_tests, sample_size)
    
""" def get_test_type(test_type, start=0, end=None, lower=0, upper=float('inf')):
    tests = [t for t in all_tests if t['type'] in test_type and lower <= t['char_count'] <= upper]
    return tests[start:end]

def get_random_tests(n=5, lower=0, upper=float('inf'), test_type=POSSIBLE_TYPES):
    filtered_tests = get_test_type(test_type=test_type, lower=lower, upper=upper) #[t for t in all_tests if lower <= t['char_count'] <= upper]
    sample_size = min(n, len(filtered_tests)) #prevent error
    return random.sample(filtered_tests, sample_size) """

In [131]:
tests = get_tests(n=5, upper_char=400) #get_test_type('math', end=10, lower=0, upper=500)
pprint(tests)

[{'char_count': 29,
  'exp_word_count': 1,
  'expected': False,
  'id': 'common_sense_1_399_499',
  'prompt': 'Is a pound sterling valuable?',
  'type': 'common_sense'},
 {'char_count': 187,
  'exp_word_count': 1,
  'expected': '3',
  'id': 'math_3_191_791',
  'prompt': 'Ten treeks weigh as much as three squigs and one goolee. Two '
            'treeks and one goolee are equal in weight to one squig. The '
            'combined weight of how many treeks equals the weight of one '
            'squig?',
  'type': 'math'},
 {'char_count': 58,
  'exp_word_count': 1,
  'expected': True,
  'id': 'common_sense_1_313_413',
  'prompt': 'Is Alan Alda old enough to have fought in the Vietnam War?',
  'type': 'common_sense'},
 {'char_count': 39,
  'exp_word_count': 1,
  'expected': '4',
  'id': 'math_3_291_891',
  'prompt': 'If $2^8=4^x$, what is the value of $x$?',
  'type': 'math'},
 {'char_count': 333,
  'exp_word_count': 9,
  'expected': 'No, humans have never flown close to the sun',
  'id': 

In [94]:
#simple hello world call to kick off the commits
#direct_call(prompt="how do I find the derivative of y=x^2 using python?")

In [95]:
def interactive_chat():
    messages = ["<Start of message history>"]
    count = 0
    while True:
        user_input = input("You: ")
        if user_input.lower() in ['exit', 'quit']:
            print("Exiting chat.")
            break
        response = call_model_chat_completions(prompt=f"Old messages{messages}, CURRENT USER INPUT:{user_input} <--- ANSWER THIS QUESTION", temperature=0.7)
        count += 1
        messages.append(f"MESSAGE_{count}_[previous user input: {user_input}, previous system response: {response['text']}]")
        if response["ok"]:
            print("Model:", response["text"].strip())
        else:
            print("Error:", response["error"])
        print(messages)
#interactive_chat()

In [96]:
""" def execute_tests():
    rows = []
    for t in tests:
        r = call_model_chat_completions(
            prompt,
            system=system,
            model=model,
            temperature=0.3,
            max_tokens=128
        ) """

' def execute_tests():\n    rows = []\n    for t in tests:\n        r = call_model_chat_completions(\n            prompt,\n            system=system,\n            model=model,\n            temperature=0.3,\n            max_tokens=128\n        ) '

In [97]:
def self_evaluate(question, prediction, expected_answer, model=MODEL):
    """
    Use the model itself as a strict grader.
    Returns True if the model says the prediction matches the expected answer; else False.
    Falls back to a simple normalized string compare if the model's reply is malformed.
    """
    import re

    system = "You are a strict grader. Reply with exactly True or False. No punctuation. No explanation."
    prompt = f"""You are grading a question-answer pair.

Return exactly True if the PREDICTION would be accepted as correct for the EXPECTED_ANSWER.
Otherwise, return False.

QUESTION:
{question}

PREDICTION:
{prediction}

EXPECTED_ANSWER:
{expected_answer}

Answer with exactly: True or False
"""

    r = call_model_chat_completions(
        prompt,
        system=system,
        model=model,
        temperature=0.3,
    )

    reply = (r.get("text") or "").strip().lower()
    if reply.startswith("true"):
        return True
    if reply.startswith("false"):
        return False

    # No Fallback yet


In [118]:
def self_evaluate_tests(tests, model=MODEL, grader_model=None, sleep_sec=0.2, verbose=True):
    """
    Run the tests by querying the model for each prompt, then use LLM-as-a-judge
    (self_evaluate) to determine correctness.

    Args:
        tests: list of dicts with keys: id, prompt, expected (and optionally type)
        model: model used to generate predictions
        grader_model: model used to judge correctness (defaults to `model` if None)
        sleep_sec: small delay between calls to be polite to the API
        verbose: if True, print a summary line per test

    Returns:
        rows: list of dicts with fields:
              id, expected, got, correct, status, error
    """
    import time

    judge_model = grader_model or model
    rows = []
    count = 0
    for t in tests:
        count += 1
        # 1) Get model prediction
        #print('prompt:', t['prompt'])
        print_test(t)
        r = call_model_chat_completions(
            f"{t['prompt']}",
            system="Give a short answer to each prompt, don't explain.",
            model=model,
            temperature=0.3,
            max_tokens=400
        )
        got = (r.get("text") or "").strip()
        display(Markdown(f"OUTPUT: \n{got}"))
        print('raw: ', got)
        # 2) LLM-as-a-judge: strict True/False
        """ is_correct = self_evaluate(
            question=t["prompt"],
            prediction=got,
            expected_answer=t["expected"],
            model=judge_model,
        )

        row = {
            "id": t.get("id", "<unnamed>"),
            "expected": t["expected"],
            "got": got,
            "correct": bool(is_correct),
            "status": r.get("status"),
            "error": r.get("error"),
        }
        
        rows.append(row)
        print(json.dumps(row, indent=2, ensure_ascii=False))
        if verbose:
            mark = "✅" if is_correct else "❌"
            print(f"{mark} {row['id']}: expected={row['expected']!r}, got={row['got']!r} (HTTP {row['status']})")
            if row["error"]:
                print("   error:", row["error"]) """

        if sleep_sec:
            time.sleep(sleep_sec)

    return rows

# Example:


In [120]:
import re

In [None]:
def basic_match_check(test, output):
    exp = test["expected"]
    matches = re.findall(exp, output, re.IGNORECASE)
    
    num_matches = len(matches)
    if len(num_matches) > 0:
        print('MATCH(ES) FOUND:', matches)
        return True
    
    print('NO MATCH FOUND')
    return False
    
    
    
    """ if exp.lower() in output.lower():
        print('FOUND MATCH SOMEWHERE')
        if len(exp) < 3:
            
        else:
            #exp is too large for basic match
            print('EXPECTED TOO LARGE')
    
    #if we find zero matches at all just return false
    print('NO MATCH')
    return False """

""" if test['type'] == 'math':
    num_matches = re.findall(r'-?\d+(\.\d+)?', output)
word_matches = re.findall(r'[a-zA-Z]+', output)
if matches:
    return True
return False """

In [132]:
test_prompts = get_tests(n=3) #get_test_type(["math"],end=10, upper=300) get_random_tests(n=3, upper=300)
results_llm_judge = self_evaluate_tests(test_prompts, verbose=True, model=MODEL, grader_model=MODEL)

{
  "id": "planning_4_77_977",
  "type": "planning",
  "prompt": "I am playing with a set of objects. Here are the actions I can do\n\n   Attack object\n   Feast object from another object\n   Succumb object\n   Overcome object from another object\n\nI have the following restrictions on my actions:\n    To perform Attack action, the following facts need to be true: Province object, Planet object, Harmony.\n    Once Attack action is performed the following facts will be true: Pain object.\n    Once Attack action is performed the following facts will be false: Province object, Planet object, Harmony.\n    To perform Succumb action, the following facts need to be true: Pain object.\n    Once Succumb action is performed the following facts will be true: Province object, Planet object, Harmony.    \n    Once Succumb action is performed the following facts will be false: Pain object.\n    To perform Overcome action, the following needs to be true: Province other object, Pain object.\n    Onc

OUTPUT: 
feast object a from object b  
overcome object a from object d  
attack object b  
overcome object b from object a  
attack object d  
overcome object d from object a

raw:  feast object a from object b  
overcome object a from object d  
attack object b  
overcome object b from object a  
attack object d  
overcome object d from object a
{
  "id": "math_3_108_708",
  "type": "math",
  "prompt": "Leo's assignment was divided into three parts. He finished the first part of his assignment in 25 minutes. It took him twice as long to finish the second part. If he was able to finish his assignment in 2 hours, how many minutes did Leo finish the third part of the assignment?",
  "expected": "It took Leo 25 x 2 = <<25*2=50>>50 minutes to finish the second part of the assignment.\nLeo finished the first and second parts of the assignment in 25 + 50 = <<25+50=75>>75 minutes.\nHe finished the entire assignment in 60 x 2 = <<60*2=120>>120 minutes.\nTherefore, it took Leo 120 - 75 = <<120-75=45>>45 minutes to finish the third part of the assignment.\n#### 45",
  "char_count": 277,
  "exp_word_count": 66
}


OUTPUT: 
First part: 25 minutes  
Second part: 2 × 25 = 50 minutes  
Total time: 2 hours = 120 minutes  
Third part: 120 - 25 - 50 = 45 minutes  

45

raw:  First part: 25 minutes  
Second part: 2 × 25 = 50 minutes  
Total time: 2 hours = 120 minutes  
Third part: 120 - 25 - 50 = 45 minutes  

45
{
  "id": "math_3_251_851",
  "type": "math",
  "prompt": "Let $z$ be a complex number such that $z^5 = 1$ and $z \\neq 1.$  Compute\n\\[z + \\frac{1}{z} + z^2 + \\frac{1}{z^2}.\\]",
  "expected": "-1",
  "char_count": 115,
  "exp_word_count": 1
}


OUTPUT: 
$-1$

raw:  $-1$
