---

# Main chat completion function

---

Same function from tutorial, with some additions like extra hyperparams

In [2]:
# %% Minimal setup
# If needed (uncomment in a notebook):
# !pip install requests python-dotenv

import os, json, textwrap, re, time
import requests

# Endpoint configuration. Each value is read from the environment and falls
# back to the literal default on the right when the variable is unset.
# NOTE(review): the fallbacks embed a key and an internal address in the
# notebook itself — confirm these are safe to share before publishing.
API_KEY  = os.getenv("OPENAI_API_KEY", "cse476")
API_BASE = os.getenv("API_BASE", "http://10.4.58.53:41701/v1")  
MODEL    = os.getenv("MODEL_NAME", "bens_model")              

def call_model_chat_completions(prompt: str,
                                system: str = "You are a helpful assistant. Reply with only the final answer‚Äîno explanation.",
                                model: str = MODEL,
                                temperature: float = 0.3,
                                timeout: int = 60,
                                max_tokens: int = 128,
                                top_p: float = None,
                                top_k: int = None,
                                stop: str = None) -> dict:
    """
    Call an OpenAI-style /v1/chat/completions endpoint.

    Args:
        prompt: user message content.
        system: system message content.
        model: model name sent in the request body.
        temperature, max_tokens: always sent in the request body.
        top_p, top_k, stop: optional; only sent when not None.
        timeout: seconds passed to requests.post.

    Returns:
        A dict that always has the same keys, on success and on failure:
        { 'ok': bool, 'text': str or None, 'raw': dict or None, 'status': int,
          'error': str or None, 'headers': dict, 'tokens_used': int or None }
        'status' is -1 when the request never produced an HTTP response.
        'tokens_used' is usage.completion_tokens from the response, or None
        when the server did not report it.
    """
    url = f"{API_BASE}/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type":  "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user",   "content": prompt}
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    # Leave unset sampling params out of the body instead of sending JSON nulls.
    for key, value in (("top_p", top_p), ("top_k", top_k), ("stop", stop)):
        if value is not None:
            payload[key] = value

    def failure(status_code, error, response_headers):
        # Same key set as the success dict so callers can read any key blindly.
        return {"ok": False, "text": None, "raw": None, "status": status_code,
                "error": error, "headers": response_headers, "tokens_used": None}

    status, hdrs = -1, {}
    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=timeout)
        status = resp.status_code
        hdrs   = dict(resp.headers)
        if status != 200:
            # try best-effort to surface error text
            try:
                err_text = resp.json()
            except Exception:
                err_text = resp.text
            return failure(status, str(err_text), hdrs)

        data = resp.json()
        # `or` guards cover both a missing key and an explicit null / empty list.
        choices = data.get("choices") or [{}]
        message = choices[0].get("message") or {}
        text = message.get("content", "")
        usage = data.get("usage") or {}
        tokens_used = usage.get("completion_tokens")

        return {"ok": True, "text": text, "raw": data, "status": status,
                "error": None, "headers": hdrs, "tokens_used": tokens_used}
    except requests.RequestException as e:
        return failure(-1, str(e), {})
    except ValueError as e:
        # A 200 response whose body is not valid JSON.
        return failure(status, f"Invalid JSON in response: {e}", hdrs)


---
# TESTS

---

In [3]:
from IPython.display import Markdown, display

Get all the tests, load/save, define possible types

In [4]:
import json
from pprint import pprint
import random
from collections import Counter

# Domain labels a test's 'type' field is filtered against; get_tests uses this list as its default filter.
POSSIBLE_TYPES = ['math', 'common_sense', 'planning', 'coding', 'future_prediction']

def load_save_json(path_in="parsed_dev_data.json", path_out=None, data_in=None, clear=False):
    """
    Load a JSON list from `path_in` and optionally append one item and save it.

    Args:
        path_in: JSON file to read (ignored when `clear` is True).
        path_out: when not None, `data_in` is appended to the loaded list and
            the whole list is written to this path.
        data_in: the item appended before saving.
        clear: start from an empty list instead of reading `path_in`.

    Returns:
        The loaded (and possibly extended) data.
    """
    if clear:
        data = []
    else:
        # Context manager so the handle is closed even if parsing fails.
        with open(path_in, "r", encoding="utf-8") as f:
            data = json.load(f)

    if path_out is not None:
        data.append(data_in)
        with open(path_out, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4)

    return data
            
# Load the raw dev records from the default path (parsed_dev_data.json).
all_tests = load_save_json()

# Show how many records each domain has.
type_counts = Counter(t['domain'] for t in all_tests)
print(type_counts)

# Re-key every raw record into the field names the rest of the notebook reads
# ('type', 'prompt', 'expected', 'char_count', 'exp_word_count').
formatted_tests = []
for i, t in enumerate(all_tests, start=1):
    
    formatted_tests.append({
        "id": t['id'], # domain_domainIndex_domainTestIndex_testIndex
        "type": t['domain'],
        "prompt": t['input'],
        "expected": t['output'],
        "char_count": t['input_char_count'],
        "exp_word_count": t['exp_word_count']
    })
    
# From here on `all_tests` holds the re-keyed records, not the raw ones.
all_tests = formatted_tests

Counter({'common_sense': 400, 'math': 300, 'coding': 100, 'future_prediction': 100, 'planning': 100})


Generic get test function, mostly used for getting random tests of certain type(s)

In [5]:
def print_json(test):
    """Print `test` as JSON with a 2-space indent, leaving non-ASCII characters unescaped."""
    rendered = json.dumps(test, ensure_ascii=False, indent=2)
    print(rendered)

#pass test_type as a list of types
#generalized get test function
def get_tests(n=0, test_type=POSSIBLE_TYPES, start=0, end=None, lower_char=0, upper_char=float('inf'), lower_exp=0, upper_exp=float('inf'), seed=None, index=None):
    """
    Select tests from the global `all_tests`.

    Args:
        n: selection mode.
            0  -> the slice filtered[start:end], wrapped in a one-element list.
            -1 -> one random test per type present after filtering.
            >0 -> a random sample of up to `n` filtered tests.
        test_type: collection of type labels to keep.
        start, end: slice bounds, used only when n == 0.
        lower_char, upper_char: inclusive bounds on 'char_count'.
        lower_exp, upper_exp: inclusive bounds on 'exp_word_count'.
        seed: when not None, seeds the global `random` module first.
        index: when not None, return all_tests[index] directly (no filtering).

    Returns:
        A single test dict when `index` is given, otherwise a list as described above.
    """
    if index is not None:
        return all_tests[index]

    filtered_tests = [
        t for t in all_tests
        if t['type'] in test_type
        and lower_char <= t['char_count'] <= upper_char
        and lower_exp <= t['exp_word_count'] <= upper_exp
    ]

    if seed is not None:
        random.seed(seed)

    if n == 0:
        # NOTE(review): this returns the slice nested inside another list. Kept
        # unchanged because callers may index [0] — confirm whether a flat list was intended.
        return [filtered_tests[start:end]]

    if n == -1:
        # Group by type explicitly. The previous randint(count, count + val) was
        # inclusive at both ends, so it could pick a test from the next type or
        # run off the end of the list, and it assumed tests were stored
        # contiguously by type.
        tests_by_type = {}
        for t in filtered_tests:
            tests_by_type.setdefault(t['type'], []).append(t)
        return [random.choice(group) for group in tests_by_type.values()]

    sample_size = min(n, len(filtered_tests))
    return random.sample(filtered_tests, sample_size)
    

In [6]:
""" def interactive_chat():
    messages = ["<Start of message history>"]
    count = 0
    while True:
        user_input = input("You: ")
        if user_input.lower() in ['exit', 'quit']:
            print("Exiting chat.")
            break
        response = call_model_chat_completions(prompt=f"Old messages{messages}, CURRENT USER INPUT:{user_input} <--- ANSWER THIS QUESTION", temperature=0.7)
        count += 1
        messages.append(f"MESSAGE_{count}_[previous user input: {user_input}, previous system response: {response['text']}]")
        if response["ok"]:
            print("Model:", response["text"].strip())
        else:
            print("Error:", response["error"])
        print(messages) """
#interactive_chat()

' def interactive_chat():\n    messages = ["<Start of message history>"]\n    count = 0\n    while True:\n        user_input = input("You: ")\n        if user_input.lower() in [\'exit\', \'quit\']:\n            print("Exiting chat.")\n            break\n        response = call_model_chat_completions(prompt=f"Old messages{messages}, CURRENT USER INPUT:{user_input} <--- ANSWER THIS QUESTION", temperature=0.7)\n        count += 1\n        messages.append(f"MESSAGE_{count}_[previous user input: {user_input}, previous system response: {response[\'text\']}]")\n        if response["ok"]:\n            print("Model:", response["text"].strip())\n        else:\n            print("Error:", response["error"])\n        print(messages) '

### Handle guessing the prompt domain/type

In [7]:
import re

Terrible by itself (This qwen model is just not very capable...), might work better with this new implementation to supplement low NB confidence

In [8]:
#performs really really bad
def guess_test_type(input, p1, p2):
    """
    Ask the model which of two candidate question types `input` belongs to.

    Args:
        input: the question text to classify.
        p1, p2: the two candidate type labels offered to the model.

    Returns:
        The model's raw reply text (the 'text' field of the chat-completion
        result), which is None when the call failed.
    """
    system_prompt = """
    You are a basic decider model.
    Your goal is to distinguish between if an input sentence represents a math, common sense, planning, or coding question.
    Respond with one word depending on the input sentence type
    """
    #POSSIBLE_TYPES = ['math', 'common_sense', 'planning', 'coding', 'future_prediction']
    # f-string: without the prefix the literal text "{input}", "{p1}", "{p2}"
    # was sent to the model and the arguments were never used.
    prompt = f"""
    What question type does the following input correspond to?
    
    INPUT: {input}
    ----------------------
    Is it a {p1} question?
    
    OR
    
    Is it a {p2} question?
    
    Respond with one word
    """
    return call_model_chat_completions(system=system_prompt, prompt=prompt, temperature=0.3)["text"]

Simple NB classifier trained in Colab on the dev data. Test accuracy is good, around 95.5%.

In [9]:
import pickle

# Load the pickled classifier and its vectorizer; classify_prompt reads both as globals.
# NOTE(review): pickle.load executes arbitrary code from the file — only load
# .pkl files you produced yourself.
with open('nb_classifier.pkl', 'rb') as f:
    classifier = pickle.load(f)

with open('nb_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)


In [10]:
# Load the final-project test set into the global `data`.
# The [] placeholder is what remains bound if the open/parse below raises.
data = []
with open('cse_476_final_project_test_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [11]:
def classify_prompt(prompt):
    """
    Predict the domain/type of `prompt` with the global `classifier` and `vectorizer`.

    When exactly two classes each reach a probability of at least 0.40, the
    LLM (guess_test_type) is asked to break the tie. Its reply is accepted only
    if it names exactly one of those two classes; otherwise the classifier's
    own prediction is returned.

    Returns:
        One of the classifier's class labels.
    """
    prompt_vec = vectorizer.transform([prompt])

    prediction = classifier.predict(prompt_vec)[0]
    probabilities = classifier.predict_proba(prompt_vec)[0]

    print(f"prediction: {prediction}")
    print(f"Classes order: {classifier.classes_}")
    print(f"probabilities: {probabilities}")

    high_prob_classes = [class_name for class_name, prob in zip(classifier.classes_, probabilities) if prob >= 0.40]

    if len(high_prob_classes) == 2:
        print(f"TWO HIGH PROB: {high_prob_classes[0]} {high_prob_classes[1]}")
        guess = guess_test_type(prompt, high_prob_classes[0], high_prob_classes[1])

        def canon(label):
            # Lower-case and collapse punctuation/spaces so "Common sense." matches "common_sense".
            return re.sub(r"[^a-z0-9]+", "_", str(label).lower()).strip("_")

        guess_canon = canon(guess or "")
        named = [c for c in high_prob_classes if canon(c) and canon(c) in guess_canon]
        if len(named) == 1:
            return named[0]
        # The reply was empty, named neither class, or named both: don't return free text.
        print(f"LLM tie-break unusable ({guess!r}); keeping classifier prediction")

    return prediction

""" for test,i in zip(data,range(0,100)):
    print(test)
    #print('ACTUAL:', test["domain"])
    result = classify_prompt(test["input"])
    print(f"PREDICTED: {result['predicted_type']}")
    print()
    print(f"Probabilities: {result['probabilities']}") """

' for test,i in zip(data,range(0,100)):\n    print(test)\n    #print(\'ACTUAL:\', test["domain"])\n    result = classify_prompt(test["input"])\n    print(f"PREDICTED: {result[\'predicted_type\']}")\n    print()\n    print(f"Probabilities: {result[\'probabilities\']}") '

---

# First Final Project Implementation

---

I didn't realize that we don't get expected answers on the final test data, so my first implementation of the final project was based on the tutorial notebooks and relies on the expected answer.

### Self Evaluate functions

In [12]:
def self_evaluate(question, prediction, expected_answer, model=MODEL):
    """
    Use the model itself as a strict grader.

    Returns True if the model's reply starts with "true" and False if it
    starts with "false" (case-insensitive). If the reply is anything else
    (including a failed call), falls back to a normalized string comparison
    of `prediction` and `expected_answer`, so the result is always a bool.
    """

    system = "You are a strict grader. Reply with exactly True or False. No punctuation. No explanation."
    prompt = f"""You are grading a question-answer pair.

Return exactly True if the PREDICTION would be accepted as correct for the EXPECTED_ANSWER.
Otherwise, return False.

QUESTION:
{question}

PREDICTION:
{prediction}

EXPECTED_ANSWER:
{expected_answer}

Answer with exactly: True or False
"""

    r = call_model_chat_completions(
        prompt,
        system=system,
        model=model,
        temperature=0.3,
    )

    reply = (r.get("text") or "").strip().lower()
    if reply.startswith("true"):
        return True
    if reply.startswith("false"):
        return False

    # Malformed reply: compare the two answers ignoring case and whitespace
    # runs. This was documented but missing, so the function returned None.
    def normalize(value):
        return " ".join(str(value).lower().split())

    return normalize(prediction) == normalize(expected_answer)


In [13]:
def self_evaluate2(question, model_output, prediction, expected_answer, model=MODEL):
    """
    Ask the model whether it agrees with a first grader's verdict.

    Args:
        question: the original question.
        model_output: the answer being graded (shown as ANSWER).
        prediction: the first grader's verdict (shown as MODEL_1 OUTPUT).
        expected_answer: the reference answer.
        model: model used for the second opinion.

    Returns:
        True if the reply starts with "yes" or "true", False if it starts with
        "no" or "false" (case-insensitive), and None when the reply is
        anything else. There is no fallback for malformed replies.
    """
    system = "You are a strict grader. Reply with exactly Yes or No. No punctuation. No explanation."
    prompt = f"""MODEL_1 thinks this ANSWER is {prediction}, do you agree with MODEL_1 decision?

QUESTION:
{question}

ANSWER:
{model_output}

EXPECTED_ANSWER:
{expected_answer}

-----------------------
MODEL_1 OUTPUT:
{prediction}
-----------------------

Answer with exactly: Yes or No. Do you agree with MODEL_1?
"""

    r = call_model_chat_completions(
        prompt,
        system=system,
        model=model,
        temperature=0.3,
    )

    reply = (r.get("text") or "").strip().lower()
    if reply.startswith(("true", "yes")):
        return True
    if reply.startswith(("false", "no")):
        return False

    # NOTE(review): the caller treats this value as a correctness verdict,
    # while the prompt asks about agreement with MODEL_1 — confirm the intent.
    return None


### Eval util functions

In [14]:
tf_map = {'yes':'true', 'no':'false'}
def map_tf(output, exp):
    """
    Translate a yes/no/true/false `output` into the vocabulary used by `exp`.

    - exp is yes/no: a true/false output becomes yes/no; anything else
      (including an output that is already yes/no) is returned unchanged.
    - otherwise: a yes/no output becomes true/false; anything else is
      returned unchanged.

    Comparison ignores case, surrounding whitespace and leading/trailing '.'.
    """
    exp = str(exp).strip().lower().strip('.')
    out = output.strip().lower().strip('.')

    if exp in tf_map:
        # Expected answer is literally yes/no. Previously a matching "yes"/"no"
        # output was rewritten to "true"/"false" here, so it could never match.
        reverse_map = {tf: yn for yn, tf in tf_map.items()}
        return reverse_map.get(out, output)

    return tf_map.get(out, output)

In [15]:

def basic_match_check(test, output):
    """
    Return True when the expected answer occurs anywhere in `output`.

    `output` is first passed through map_tf, then searched case-insensitively
    for the literal text of test["expected"].
    """
    expected = test["expected"]
    candidate = map_tf(output, expected)
    literal = re.escape(str(expected))
    return re.search(literal, candidate, re.IGNORECASE) is not None

In [16]:
def seperator(text, tokens_used=None, max_tokens=400):
    """
    Print a section heading followed by a 32-dash rule.

    When `tokens_used` is given, the heading also shows the token usage. If
    the usage equals `max_tokens`, a truncation warning is printed, the rule
    is skipped, and False is returned. Returns True in every other case.
    """
    if tokens_used is None:
        print(text)
    else:
        print(f'{text} (TOKENS USED: {tokens_used}/{max_tokens})')
        hit_limit = int(tokens_used) == max_tokens
        if hit_limit:
            print('MAXED TOKENS REACHED - OUTPUT TRUNCATED')
            return False

    print('-' * 32)
    return True

In [17]:
def check_correct(bool1, bool2):
    """
    Combine two verdicts and print a status line for each result.

    Returns (correctness, agreement): correctness is `bool1 and bool2`,
    agreement is `bool1 == bool2`.
    """
    agreement = bool1 == bool2
    correctness = bool1 and bool2

    if correctness:
        print('‚úÖ CORRECT')
    else:
        print('‚ùå INCORRECT')

    if agreement:
        print('üÜó AGREED')
    else:
        print('üÜò DISAGREED')

    return correctness, agreement

Bunch of functions used when we have an expected answer

In [18]:
import math

def create_matches(toCount, toMatch):
    """
    Count how often the words of `toMatch` occur in `toCount`.

    Each distinct word of `toMatch` contributes its number of occurrences in
    `toCount`. Returns (total occurrences, len(toCount)).
    """
    occurrences = Counter(toCount)
    per_word = {}
    for word in toMatch:
        # Assigning by key means a repeated word in toMatch is counted once.
        per_word[word] = occurrences.get(word, 0)
    return sum(per_word.values()), len(toCount)

def get_cosine(expected_counter, output_counter):
    """
    Cosine similarity between two word-count mappings.

    Returns 0 when either mapping has zero magnitude; otherwise prints and
    returns the similarity.
    """
    dot_product = 0
    for word in expected_counter:
        dot_product += expected_counter[word] * output_counter.get(word, 0)

    exp_mag = math.sqrt(sum(count ** 2 for count in expected_counter.values()))
    out_mag = math.sqrt(sum(count ** 2 for count in output_counter.values()))

    if not (exp_mag > 0 and out_mag > 0):
        return 0

    cosine_sim = dot_product / (exp_mag * out_mag)
    print(f"\n[Cosine similarity: {cosine_sim}]")
    return cosine_sim

def get_start_end_matches(expected, output, exp_len, out_len):
    """
    Check whether the first and last expected tokens appear in the first and
    last output tokens.

    Args:
        expected, output: token sequences (indexable).
        exp_len, out_len: number of tokens to consider in each sequence.

    Returns:
        (start_matches, end_matches). Each is True when the expected token is
        contained in (`in`) the corresponding output token. Returns
        (False, False) when either sequence is empty instead of raising IndexError.
    """
    if not expected or not output or exp_len < 1 or out_len < 1:
        return False, False

    start_matches = expected[0] in output[0]
    end_matches = expected[exp_len-1] in output[out_len-1]

    return start_matches, end_matches
    
def super_match(test, output):
    """
    Tokenize the expected answer and `output` (drop '$', lower-case, split on
    whitespace), report their cosine similarity via get_cosine, and return
    get_start_end_matches' (start_matches, end_matches) for the two token lists.
    """
    expected_tokens = str(test["expected"]).replace('$', '').lower().split()
    output_tokens = output.replace('$', '').lower().split()

    # The similarity is only printed by get_cosine; its value is not used here.
    get_cosine(Counter(expected_tokens), Counter(output_tokens))

    return get_start_end_matches(
        expected_tokens,
        output_tokens,
        len(expected_tokens),
        len(output_tokens),
    )
    #return match_counts

### Main first implementation agent loop

In [19]:
def self_evaluate_tests(tests, model=MODEL, grader_model=None, sleep_sec=0.2, verbose=True):
    """
    Run the tests by querying the model for each prompt, then decide
    correctness through up to three rounds of checks:
      1) string match (basic_match_check) vs. LLM judge (self_evaluate);
      2) on disagreement, self_evaluate vs. a second opinion (self_evaluate2);
      3) on further disagreement, start/end token match (super_match) vs. self_evaluate2.

    Args:
        tests: list of dicts with keys: id, prompt, expected (and optionally type)
        model: model used to generate predictions
        grader_model: model used to judge correctness (defaults to `model` if None)
        sleep_sec: small delay between calls to be polite to the API
        verbose: accepted but not read anywhere in this function

    Returns:
        dict with keys:
            "count":   len(tests)
            "seed":    always None here
            "samples": list of per-test dicts (test_count, id, input, expected,
                       got, history). Tests skipped for truncation or empty
                       output are NOT included.
    """
    import time

    judge_model = grader_model or model
    MAX_TOKENS = 400
    final_answers = []
    count = 0
    test_samples = {
        "count": len(tests),
        "seed": None,
        "samples": None
    }
    
    for t in tests:
        # Per-test record; the "history" entries are filled in as each check runs.
        sample = {
            "test_count": count,
            "id": t["id"],
            "input": t['prompt'],
            "expected": t["expected"],
            "got": None,
            "history": {
                "check_correct1": {
                    "match_check": None,
                    "self_eval": None,
                    "correctness": None,
                    "agreement": None
                },
                "no_output": False,
                "truncated": False,
                "check_correct2": {
                    "self_eval": None,
                    "self_eval2": None,
                    "correctness": None,
                    "agreement": None
                },
                "check_correct3": {
                    "self_eval2": None,
                    "sides_matching": None,
                    "correctness": None,
                    "agreement": None
                },
                "final_correctness": None
            }
        }
        count += 1
        # 1) Get model prediction
        #print('prompt:', t['prompt'])
        print('\n','='*64)
        seperator('TEST_CASE')
        print_json(t)
        #handle_test(t)
        
        r = call_model_chat_completions(
            f"{t['prompt']}",
            system="Give a short answer to each prompt, don't explain.",
            model=model,
            temperature=0.3,
            max_tokens=MAX_TOKENS
        )
        got = (r.get("text") or "").strip()
        sample["got"]=got
        tokens_used = r.get("tokens_used")
        

        got = map_tf(got, t["expected"])
        
        has_output  = True if got != "" else False
        
        #If output is truncated and both evals return true, return false
        # seperator returns False when tokens_used equals MAX_TOKENS.
        not_truncated = seperator('\nMODEL_OUTPUT', tokens_used, MAX_TOKENS)
        
        
        # NOTE(review): handle_test is not defined in the visible part of this
        # notebook — presumably defined in a later cell; confirm it exists before this runs.
        got = handle_test(t, got)
        display(Markdown(f"\n{got}"))
        
        print_json(got)
        #print(got)
        #print('raw: ', got)
        
        # NOTE(review): both `continue` branches below skip the append at the
        # end of the loop, so the flags set on `sample` here are never returned.
        if not not_truncated:
            #final_answers.append(False)
            sample["history"]['truncated'] = True
            print("‚ùå MAX TOKENS REACHED, OUTPUT TRUNCATED, SKIPPING TESTCASE ‚ùå")
            continue
        elif has_output == False:
            sample["history"]['no_output'] = True
            print("‚ùå NO OUTPUT, PROMPT IS PROBABLY TOO LARGE, SKIPPING TESTCASE ‚ùå")
            continue
        
        match_check = basic_match_check(t, got)
        match_check = bool(match_check)
        sample["history"]["check_correct1"]["match_check"] = match_check
        
        # 2) LLM-as-a-judge: strict True/False
        is_correct = self_evaluate(
            question=t["prompt"],
            prediction=got,
            expected_answer=t["expected"],
            model=judge_model,
        )
        # bool() turns a None verdict (unparseable judge reply) into False.
        is_correct = bool(is_correct)
        
        sample["history"]["check_correct1"]["self_eval"] = is_correct
        
        seperator('\nMODEL OUTPUT --> FIRST EVAL')
        print('match check:', match_check)
        print('self_eval:', is_correct)
        correctness, agreement = check_correct(match_check, is_correct)
        sample["history"]["check_correct1"]['correctness'] = correctness
        sample["history"]["check_correct1"]['agreement'] = agreement
        
        #starting and ending matches
        #CAN BE USED TO VALIDATE SECOND MODEL, OR AS LAST RESORT
        start_matches, end_matches = super_match(t, got)
        sides_matching = start_matches or end_matches
        
        if not agreement:
            #second model eval
            seperator('\nDISAGREEMENT --> SECOND EVAL')
            is_correct2 = self_evaluate2(
                question=t["prompt"],
                model_output=got,
                expected_answer=t["expected"],
                prediction=is_correct,
                model=judge_model
            )
            is_correct2 = bool(is_correct2)
            
            sample["history"]["check_correct2"]["self_eval"] = is_correct
            sample["history"]["check_correct2"]["self_eval2"] = is_correct2
            
            print('self_eval2:', is_correct2)
            correctness, agreement = check_correct(is_correct, is_correct2)
            sample["history"]["check_correct2"]["correctness"] = correctness
            sample["history"]["check_correct2"]["agreement"] = agreement
            
            
            if not agreement:
                # third eval: start/end token match against the second opinion
                seperator('\nDISAGREEMENT --> THIRD EVAL')
                print('\nside matching:', sides_matching)
                
                sample["history"]["check_correct3"]["self_eval2"] = is_correct2
                sample["history"]["check_correct3"]["sides_matching"] = sides_matching
                correctness, agreement = check_correct(sides_matching, is_correct2)
                sample["history"]["check_correct3"]["correctness"] = correctness
                sample["history"]["check_correct3"]["agreement"] = agreement    


        # `correctness` holds the verdict of the last check round that ran.
        sample["history"]["final_correctness"] = f"‚úÖ {correctness}" if correctness else f"‚ùå {correctness}"
        final_answers.append(sample)
        
        if sleep_sec:
            time.sleep(sleep_sec)

    test_samples["samples"] = final_answers
    return test_samples

# Example:


---

# Second Final Project Implementation

---

## Math section

Importing the arithmetic solver inference-time algorithm from mini lab 5. This inference-time agent is based on that one, but adds a lot on top: more math functions, extra error handling, and a custom inference loop.

In [94]:
import os, json, textwrap, re, time, ast, operator as op
import requests
import math
# --- PROVIDED: prompts ---
# Earlier, shorter variant of the agent system prompt.
# NOTE(review): not referenced anywhere in the visible code — confirm whether it is still needed.
SYSTEM_AGENT2 = """You are a math tool-using agent.
You may do exactly ONE of the following in your reply:
1) CALCULATE: <arithmetic expression>
   - use only numbers, + - * / **, parentheses, and round(x, ndigits)
   - example: CALCULATE: round((3*2.49)*1.07, 2)
2) FINAL: <answer>
Return ONE line with the directive and value. No other text.
"""
# System prompt passed to call_model_chat_completions by run_agent and handle_vars.
# It asks for a single "CALCULATE: ..." or "FINAL: ..." line, the format parse_action parses.
SYSTEM_AGENT = """You are a math tool-using agent.

IMPORTANT: You must respond with EXACTLY ONE LINE in one of these formats:
1) CALCULATE: <arithmetic expression>
2) FINAL: <numeric answer>

Rules:
- NO explanations or reasoning
- NO LaTeX or markdown
- Arithmetic expressions can only use: numbers, +, -, *, /, **, (), round()
- Example valid responses:
  CALCULATE: (37 + 58) * 2
  CALCULATE: round(23.80 * 1.15, 2)
  FINAL: 190
  FINAL: 27.37

Respond with ONLY the directive line, nothing else.
"""

def make_first_prompt(question: str) -> str:
    """Build the opening user prompt: the question plus the CALCULATE/FINAL reply instructions."""
    prompt_lines = [
        f"Question: {question}",
        "If you need arithmetic to get the answer, reply as:",
        "CALCULATE: <expression>",
        "Otherwise reply:",
        "FINAL: <answer>",
    ]
    return "\n".join(prompt_lines)

def make_second_prompt(result: str) -> str:
    """Build the follow-up prompt that reports a calculation result and asks for the FINAL line."""
    prompt_lines = [
        f"The calculation result is: {result}",
        "Now provide the final answer.",
        "Reply exactly as: FINAL: <answer>",
    ]
    return "\n".join(prompt_lines)


# One directive line: "CALCULATE: <payload>" or "FINAL: <payload>", case-insensitive.
ACTION_RE = re.compile(r"^\s*(CALCULATE|FINAL)\s*:\s*(.+?)\s*$", re.IGNORECASE | re.DOTALL)


def parse_action(text: str):
    """
    Extract the first CALCULATE/FINAL directive from a model reply.

    Lines are scanned in order and the first one that is a directive wins,
    so a reply with a preamble line before the directive is still accepted.

    Returns:
        ("CALCULATE", expr) or ("FINAL", answer); the action is upper-cased.

    Raises:
        ValueError: `text` is None, empty, or contains no directive line.
    """
    # A failed or empty completion yields None/"" — report it as a bad
    # format rather than letting .strip() raise AttributeError.
    if not isinstance(text, str) or not text.strip():
        raise ValueError(f"Unrecognized action format: {text!r}")

    for line in text.strip().split('\n'):
        m = ACTION_RE.match(line)
        if m:
            return m.group(1).upper(), m.group(2).strip()

    raise ValueError(f"Unrecognized action format: {text!r}")

# Operator whitelists used by safe_eval: AST operator node type -> the function that implements it.
# Binary: + - * / ** %
ALLOWED_BINOPS = {ast.Add: op.add, ast.Sub: op.sub, ast.Mult: op.mul, ast.Div: op.truediv, ast.Pow: op.pow, ast.Mod: op.mod}
# Unary: +x, -x
ALLOWED_UNOPS  = {ast.UAdd: op.pos, ast.USub: op.neg}

def handle_vars(expr: str, question: str, count):
    """
    Ask the model to repair an expression that safe_eval could not evaluate.

    Args:
        expr: the offending (sub)expression, as source text or as an AST node
            (a node is converted back to source text first).
        question: the original question, given to the model as context.
        count: repair attempt number; attempts after the first use the
            "fix the parsing error" prompt instead of the "replace variables" prompt.

    Returns:
        The repaired expression parsed with ast.parse(..., mode="eval").

    Raises:
        ValueError: the model returned no usable text.
        SyntaxError: the second repair attempt still does not parse.
    """
    # safe_eval hands over the AST node it rejected; embedding that directly
    # would put an object repr in the prompt instead of the expression.
    if isinstance(expr, ast.AST):
        expr = ast.unparse(expr)

    def payload_from(reply_text):
        # The prompts ask for a bare expression while SYSTEM_AGENT asks for a
        # "CALCULATE:/FINAL:" line, so accept either form.
        try:
            _, payload = parse_action(reply_text)
            return payload
        except ValueError:
            if not isinstance(reply_text, str) or not reply_text.strip():
                raise ValueError(f"Model returned no expression: {reply_text!r}")
            return reply_text.strip()

    new_prompt = f"""
                YOUR GOAL IS TO REPLACE VARIABLES WITH NUMBERS
                LETTERS ARE NOT ALLOWED IN THE EXPRESSION.
                
                A variable(s) was detected in this expression: {expr}
                
                Here is the original question: {question}
                
                Replace each variable (letter) with a number (to the best of your ability) and return ONLY the new expression. No explanation. No letters.
              """
    if count > 1:
        new_prompt = f"""
                    Something in the expression caused an AST parsing error.
                    
                    EXPRESSION {expr}
                    
                    Try to fix the ireggularities so data can be AST parsed. Return only the final expression. No explanation.
                """
    res = call_model_chat_completions(system=SYSTEM_AGENT, prompt=new_prompt)
    print("handle vars res:", res["text"])
    payload = payload_from(res["text"])
    try:
        return ast.parse(payload, mode="eval")
    except (SyntaxError, ValueError):
        # Second chance: ask the model to simplify sub-expressions to plain values.
        new_prompt2 = f"""
                    replace expressions with their final value.
                    
                    for example if the expression is round(1+2+3, 0)
                    
                    you should return round(6, 0).
                    
                    EXPRESSION TO FIX: {payload}
                    
                    Return only the final expression. No explanation.
                """
        res2 = call_model_chat_completions(system=SYSTEM_AGENT, prompt=new_prompt2)
        payload = payload_from(res2["text"])
        print("handle vars res inner:", payload)
        return ast.parse(payload, mode="eval")
    
    
def safe_eval(expr: str, question: str):
    """
    Evaluate a tiny arithmetic language: numbers, + - * / ** % parentheses,
    and the calls round(), abs(), sqrt(), ceil().

    Converts '^' to '**' and an imaginary suffix on a numeric literal
    ('2i' -> '2j'). Any other construct is sent to handle_vars for repair
    (at most twice per call) before being rejected.

    Args:
        expr: the expression text.
        question: original question, forwarded to handle_vars as context.

    Raises:
        ValueError: expression longer than 300 characters, or a disallowed
            construct remains after the repair attempts.
        SyntaxError: `expr` is not parseable as a Python expression.
    """
    expr = expr.replace("^", "**")
    # Only rewrite 'i' directly after a digit or '.'; a blanket replace turned
    # "ceil" into "cejl", so ceil() was never recognised.
    expr = re.sub(r"(?<=[0-9.])i\b", "j", expr)
    if len(expr) > 300: #changed from 200 to 300.
        raise ValueError("Expression too long.")
    node = ast.parse(expr, mode="eval")

    allowed_calls = {"round": round, "abs": abs, "sqrt": math.sqrt, "ceil": math.ceil}
    count = 0  # number of repair attempts made so far, shared across the recursion

    def ev(n):
        nonlocal count

        if isinstance(n, ast.Expression):
            return ev(n.body)
        if isinstance(n, ast.Constant) and isinstance(n.value, (int, float, complex)):
            return n.value
        if isinstance(n, ast.UnaryOp) and type(n.op) in ALLOWED_UNOPS:
            return ALLOWED_UNOPS[type(n.op)](ev(n.operand))
        if isinstance(n, ast.BinOp) and type(n.op) in ALLOWED_BINOPS:
            return ALLOWED_BINOPS[type(n.op)](ev(n.left), ev(n.right))
        if isinstance(n, ast.Call) and isinstance(n.func, ast.Name) and n.func.id in allowed_calls:
            args = [ev(a) for a in n.args]
            return allowed_calls[n.func.id](*args)
        if isinstance(n, ast.Tuple):  # allow round(x,2) with comma
            return tuple(ev(elt) for elt in n.elts)

        if count < 2:
            count += 1
            print(f"Disallowed expression: {ast.dump(n, include_attributes=False)}")
            # Hand over source text, not the node, so the repair prompt shows the expression.
            return ev(handle_vars(ast.unparse(n), question, count))

        raise ValueError(f"Disallowed expression: {ast.dump(n, include_attributes=False)}")

    return ev(node)



def run_agent(question: str, max_tool_uses: int = 2, verbose: bool = True):
    """
    Run the CALCULATE/FINAL tool loop for one question and return the FINAL payload.

    The model is asked for a directive; each CALCULATE expression is evaluated
    with safe_eval and the result is sent back until the model answers FINAL.

    Raises:
        RuntimeError: an API call failed, or the model asked for more than
            `max_tool_uses` calculations.
    """
    def ask(user_prompt):
        # One model turn: call, check for failure, optionally echo, parse the directive.
        reply = call_model_chat_completions(system=SYSTEM_AGENT, prompt=user_prompt)
        if not reply["ok"]:
            raise RuntimeError(f"API error: {reply['error']}")
        if verbose:
            print("LLM ‚Üí", reply["text"])
        return parse_action(reply["text"])

    # Turn 1
    action, payload = ask(make_first_prompt(question))

    calculations_done = 0
    while action == "CALCULATE":
        if calculations_done >= max_tool_uses:
            raise RuntimeError("Exceeded tool-use limit.")
        calculations_done += 1

        # The question is passed along so variables can be resolved if needed.
        calc_value = safe_eval(payload, question)
        if verbose:
            print("CALC =", calc_value)

        # Turn 2 (+)
        action, payload = ask(make_second_prompt(str(calc_value)))

    # action must be FINAL here
    return payload

Implementing the AST parser and math handling

In [None]:
import re, math

# ---------- Baseline: no-tool runner ----------
# System prompt for the no-tool baseline; passed to call_model_chat_completions by run_direct.
SYSTEM_DIRECT = "You are a careful math assistant. Reply with only the final numeric answer‚Äîno explanation."

# Signed integer or decimal, with an optional exponent part.
NUM_RE = re.compile(r"[-+]?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?")

def extract_number(text: str) -> str:
    """
    Return the first number found in `text` as a string.

    When `text` contains no number, the stripped text itself is returned.
    """
    found = NUM_RE.search(text)
    if found is None:
        return text.strip()
    return found.group(0)

def make_direct_final_prompt(result: str) -> str:
    """Build the follow-up prompt asking the model to restate `result` as a FINAL line."""
    # The second and third lines of the prompt carry a 12-space indent.
    pad = " " * 12
    return (
        f"The calculation result is: {result}\n"
        f"{pad}Now provide the final answer, no explanation.\n"
        f"{pad}Reply exactly as: FINAL: <answer>"
    )

def run_direct(question: str, verbose: bool = True) -> str:
    """
    No-tool baseline: ask the model directly, then ask it to restate its own
    answer as "FINAL: <answer>" and extract the number from that reply.

    Returns:
        The extracted numeric string (or the stripped reply text when it
        contains no number).

    Raises:
        RuntimeError: either API call failed.
    """
    r = call_model_chat_completions(
        system=SYSTEM_DIRECT,
        prompt=question,
        temperature=0.0,
        max_tokens=500
    )
    if not r["ok"]:
        raise RuntimeError(f"API error: {r['error']}")
    if verbose:
        print("LLM(no-tool) ‚Üí", r["text"])

    final = call_model_chat_completions(
        system=SYSTEM_DIRECT,
        prompt=make_direct_final_prompt(r["text"]),
        temperature=0.0,
    )
    # The second call can fail as well; without this check a None text
    # reached extract_number and raised an unrelated TypeError.
    if not final["ok"]:
        raise RuntimeError(f"API error: {final['error']}")
    # Single quotes inside the f-string: reusing double quotes is a
    # SyntaxError on Python versions before 3.12.
    print(f"final direct: {final['text']}")
    return extract_number(final["text"] or "")


# ---------- Evaluation harness ----------
def evaluate_side_by_side(input, verbose=False):
    """
    Answer a math question with the no-tool and tool runners and return one number.

    Flow:
      1. `run_direct` produces the no-tool prediction.
      2. `run_agent` produces the tool prediction; if it raises, the no-tool
         prediction is returned.
      3. If both agree, that value is returned.
      4. Otherwise the model is asked which answer looks more correct.

    Args:
        input: The question text (name kept for existing callers even though
            it shadows the builtin).
        verbose: Forwarded to both runners.

    Returns:
        float: The selected answer.

    Raises:
        ValueError: If the no-tool prediction is not parseable as a float.
        RuntimeError: Propagated from `run_direct` on API errors.
    """
    # No-tool
    pred_direct = float(run_direct(input, verbose=verbose))
    print(f"\nTotal (No-Tool): {pred_direct}")

    # With tool
    try:
        pred_tool = float(run_agent(input, verbose=verbose))  # uses your agent loop
        print(f"Total (Tool)   : {pred_tool}")
    except Exception as e:
        print(f"ERROR_TYPE: {type(e).__name__} ERROR: {e}")
        return pred_direct

    if pred_tool == pred_direct:
        return pred_tool

    comp_prompt = f""" 
        Given this math question: {input}
        
        Which of these answers seems more correct?
        
        ANSWER 1 {pred_direct}
        
        ANSWER 2 {pred_tool}
        
        Respond with only the final answer, no explanation
    """
    r = call_model_chat_completions(
        system=SYSTEM_DIRECT,
        prompt=comp_prompt,
        temperature=0.0,
    )

    got = (r.get("text") or "").strip()
    try:
        return float(got)
    except ValueError:
        pass

    # The reply was not a bare number (e.g. "ANSWER 2 7.0" or a sentence).
    # Scan its numbers from the end and accept the first one that matches
    # either candidate, so the "1"/"2" labels are not mistaken for the answer.
    for token in reversed(NUM_RE.findall(got)):
        value = float(token)
        if value in (pred_direct, pred_tool):
            return value

    # Nothing usable came back (including a failed API call): keep the
    # tool-backed prediction rather than crashing.
    print(f"Could not parse tie-break reply {got!r}; keeping tool answer.")
    return pred_tool

    
    

#evaluate_side_by_side(QUESTIONS, verbose=False)

  """ gold = re.findall(r"\d+", gold)[-1] if len(gold) > 5 else gold


## Search section (common_sense)

Various search utility methods (imported from `search_utils`) that turned out not to be very useful.

In [22]:
from search_utils import search_bing, search_brave, search_duckduck

Wikipedia API

In [None]:
#%pip install wikipedia
import wikipedia

wikipedia.set_lang("en")
def search_wiki(query):
    """
    Return a three-sentence Wikipedia summary for `query`.

    The query itself is tried first as an exact page title; if that page is
    missing or ambiguous, each title returned by `wikipedia.search` is tried
    in order.

    Args:
        query: Article title or search phrase. ``None`` is treated as empty.

    Returns:
        str: The summary text, or an empty string when the query is blank or
        no candidate title resolves to a page.
    """
    query = (query or "").strip()
    if not query:
        return ""

    search_results = wikipedia.search(query, results=5)

    # Exact query first, then the search hits (skipping a duplicate of the query).
    candidates = [query] + [title for title in search_results if title != query]

    for title in candidates:
        try:
            # auto_suggest=False: look up the title as given instead of letting
            # the library rewrite it into a different (sometimes mangled) title.
            return wikipedia.summary(title, sentences=3, auto_suggest=False)
        except wikipedia.exceptions.DisambiguationError as e:
            print(f"Disambiguation needed: {e.options}")
        except wikipedia.exceptions.PageError:
            print(f"Page not found for: {title}")

    return ""

Main search agent/inference loop

In [None]:
def search_bot(question):
    """
    Answer a question, optionally grounding the answer in a Wikipedia summary.

    Steps:
      1. Ask the model whether it wants to research the question (YES/NO).
      2. On YES: ask which article to read, fetch it with `search_wiki`, and
         ask the model to answer using that summary.
      3. Otherwise, or when the lookup fails or returns nothing, ask the model
         to answer directly.

    Args:
        question: The question text.

    Returns:
        str: The model's final answer (empty string if the model returned no text).
    """
    SYSTEM_AGENT="""
    You are a helpful assistant that handles searching the web for more information.
    You respond with short and concise phrases.
    """

    prompt = f"""
    Do you want to research this question?
    
    QUESTION: {question}
    
    (Answer YES or NO)
    """

    prompt2 = f"""
    What wikipedia article should we research?
    
    QUESTION: {question}
    
    Only respond with a single article, don't explain.
    """

    prompt3 = f"""
    Given the wikipedia response given in the last chat. How would you answer this question:
    
    QUESTION: {question}
    
    Give a short concise response.
    """

    prompt4 = f"""
    If you don't want to search, then answer the question directly.
    
    QUESTION: {question}
    
    Give a short concise response.
    """

    def ask(prompt_text):
        # One model call with the shared system prompt; returns stripped text ("" if none).
        r = call_model_chat_completions(
            system=SYSTEM_AGENT,
            prompt=prompt_text,
            temperature=0.2,
        )
        return (r.get("text") or "").strip()

    decision = ask(prompt)

    # Accept "YES", "Yes.", "yes, ..." etc. Anything else is treated as NO so the
    # yes/no reply itself is never returned as the final answer.
    if decision.upper().startswith("YES"):
        chat = {
            "previous_chat": {
                "prompt": prompt,
                "your answer": decision
            },
            "current_chat": {
                "prompt": prompt2,
                "your answer": "..."
            }
        }

        article = ask(f"{chat}")
        chat["current_chat"]["your answer"] = article

        try:
            res = search_wiki(article)
        except Exception as e:
            print(f"ERROR_TYPE: {type(e).__name__} ERROR: {e}")
            res = None

        if res:
            chat["current_chat"]["wikipedia response"] = res

            # Archive the article turn, then open a new turn asking for the answer.
            chat["previous_chat2"] = chat.pop("current_chat")
            chat["current_chat"] = {
                "prompt": prompt3,
                "your answer": "..."
            }
            return ask(f"{chat}")
        # No usable summary: fall through to the direct answer below.

    return ask(prompt4)
        

## Coding question section

In [25]:
def handle_test(test, output=None):
    """
    Show the reference solution (when present) and fence the model output.

    Args:
        test: Test record (dict). Its optional "expected" entry is rendered as
            a Markdown code block in the notebook.
        output: Model output text. ``None`` is treated as an empty string.

    Returns:
        str: `output` unchanged if it already contains a ```python fence,
        otherwise `output` wrapped in one.
    """
    expected = test.get("expected")
    if expected is not None:
        display(Markdown(f"Expected Code:\n\n```python\n{expected}\n```"))

    output = output or ""
    if "```python" not in output:
        output = f"```python\n{output}\n```"
    return output


In [26]:
import re
import textwrap

def clean_code(code_str):
    """
    Strip a surrounding Markdown code fence and decode backslash escapes.

    Args:
        code_str: Model output, possibly wrapped in ```lang ... ``` fences and
            possibly containing literal escape sequences such as ``\\n``.
            ``None``/empty input yields an empty string.

    Returns:
        str: The dedented, stripped code text.
    """
    if not code_str:
        return ""

    # Only a fence at the very start / very end of the text is removed.
    code_str = re.sub(r"^```[a-zA-Z]*\n?", "", code_str)
    code_str = re.sub(r"\n?```$", "", code_str)

    try:
        # Encode as latin-1 with backslashreplace so non-ASCII characters
        # survive the unicode_escape round trip. A plain .encode() emits UTF-8
        # bytes, which unicode_escape decodes as latin-1 and garbles.
        code_str = code_str.encode("latin-1", "backslashreplace").decode("unicode_escape")
    except UnicodeDecodeError:
        # Malformed escape (e.g. a trailing backslash): keep the text as-is.
        pass

    return textwrap.dedent(code_str).strip()


In [84]:
import subprocess
import textwrap
import sys

def handle_code(code):
    """
    Execute `code` with the current interpreter in a child process.

    The source is passed via ``python -c`` with ``timeout=50`` given to
    `subprocess.run`; captured stdout and stderr are printed.

    Args:
        code: Python source text to run.

    Returns:
        tuple[str, str]: ``(stdout, stderr)`` of the child process.

    Raises:
        RuntimeError: If the child exits with a non-zero return code.
    """
    command = [sys.executable, "-c", code]
    proc = subprocess.run(command, capture_output=True, text=True, timeout=50)

    # Echo both captured streams, stdout first.
    for captured in (proc.stdout, proc.stderr):
        print(captured)

    if proc.returncode == 0:
        return proc.stdout, proc.stderr

    raise RuntimeError(f"Code execution failed: {proc.stderr}")

def get_code(test):
    """
    Ask the model for code solving `test["prompt"]`, run it, and retry once on failure.

    The generated code is cleaned with `clean_code` and executed with
    `handle_code`. If execution fails, the code and the error are sent back to
    the model once for a fix, and the fixed code is executed as well. A failure
    of the retry is logged, not raised.

    Args:
        test: Test record with a "prompt" entry; it is also passed to
            `handle_test` for display.

    Returns:
        str: The cleaned code from the last model reply (the retry's code if a
        retry happened, whether or not it ran successfully).

    NOTE(review): this executes model-generated code via `handle_code`; run it
    only in an environment where that is acceptable.
    """
    question = test["prompt"]
    SYSTEM_AGENT = """
    You are a python developer who writes short and quick code.
    
    Don't forget to call your task func with real data at the end of the code. Print the result.
    """

    prompt1 = f"""
    Write the code to complete this question. Keep it short and sweet
    
    QUESTION: {question}
    ---------
    Call your task func with data at the end of the code to test it and print the result.
    """

    def handle_completion(prompt_text):
        # One model call; the reply is fenced/displayed via handle_test.
        r = call_model_chat_completions(
            system=SYSTEM_AGENT,
            prompt=prompt_text,
            temperature=0.2,
            max_tokens=400
        )
        got = (r.get("text") or "").strip()
        got = handle_test(test, got)
        display(Markdown(f"MODEL OUTPUT:\n{got}"))

        return got

    cleaned = clean_code(handle_completion(prompt1))

    try:
        handle_code(cleaned)
    except Exception as e:
        print(f"ERROR_TYPE: {type(e).__name__} ERROR: {e}")
        errPrompt = f"""
            There was an error in your code!!
            
            Please fix and return the proper code.
            
            ONLY RETURN CODE NO EXPLANATIONS
            
            YOUR CODE: {cleaned}
            ---------
            ERROR_TYPE: {type(e).__name__}
            ERROR: {e}
        """

        cleaned = clean_code(handle_completion(errPrompt))

        try:
            handle_code(cleaned)
        except Exception as retry_error:
            # Best effort: the retried code is still returned, but the failure
            # is reported instead of being silently dropped.
            print(f"ERROR_TYPE: {type(retry_error).__name__} ERROR: {retry_error}")

    # Plain return (not `finally: return`) so exceptions that are not handled
    # above, e.g. KeyboardInterrupt, are no longer swallowed.
    return cleaned

#handle_code(code)


---

# Planning

---

In [None]:
def handle_planning(input):
    """
    Produce a step-by-step plan for a planning prompt using three model calls.

    1. Draft: the model reasons and lists steps after a "PLANNING:" marker.
    2. Review: the model analyzes whether the drafted steps match the prompt.
    3. Finalize: the model is given the drafted steps plus the analysis and
       returns the final ordered steps.

    Args:
        input: The planning task text (name kept for existing callers even
            though it shadows the builtin).

    Returns:
        str: The final list of steps; falls back to the drafted steps when the
        finalize call returns no text.
    """
    SYSTEM_AGENT = f"""
        You are a planning agent. Plan a series of instructions from the given input
        
        First generate your chain of thought.
        
        Then print "PLANNING:"
        
        Followed by each step
    """

    def ask(system_text, prompt_text):
        # One model call; returns stripped reply text ("" if none).
        r = call_model_chat_completions(
            system=system_text,
            prompt=prompt_text,
            temperature=0.2,
            max_tokens=400
        )
        return (r.get("text") or "").strip()

    prompt = f"""
        PLANNING INSTRUCTIONS: {input}
        
        Generate a chain of thought on how to plan each step
        
        Then print "PLANNING:"
        
        And add each step in order. EXAMPLE:
        (action actuator1 object1 actuator2 object2)
    """

    got = ask(SYSTEM_AGENT, prompt)
    # Keep only what follows the marker; if the model omitted it, use the whole reply.
    if "PLANNING:" in got:
        result = got.split("PLANNING:", 1)[1].strip()
    else:
        result = got

    prompt2 = f"""
        Analyze these planning steps closely. Do they allign with the input prompt?
        
        PLANNING STEPS: {result}
        -----------------------------------------------
        INPUT PROMPT: {input}
        
        Generate a chain of thought, then print "FINAL OUTPUT"
        
        Give a concise answer on whether the planning steps allign with the input question
    """

    analysis = ask(SYSTEM_AGENT, prompt2)

    # The drafted steps are included here: the analysis alone does not
    # necessarily restate them, and the model cannot update steps it cannot see.
    prompt3 = f"""
        Given the analysis of our planning steps:
        
        PLANNING STEPS: {result}
        --------------------------------------
        ANALYSIS: {analysis}
        --------------------------------------
        Update steps if need
        
        return the final order of steps in this format, no explanation:
        (step 1)
        (step 2)
        (step n)
        
    """

    final = ask("You are a helpful assistant who fixes plans", prompt3)

    # An empty final reply would discard a usable draft; return the draft instead.
    return final or result

---

# Start loop

---

In [125]:
# Driver cell: sample test prompts, classify each one, and route it to the
# handler for its predicted category.
# NOTE(review): `random`, `get_tests` and `classify_prompt` are not defined or
# imported in this cell; they must come from earlier cells.

# Random seed for sampling; printed below, presumably so a run can be replayed.
rng = random.randint(0,20000) #1741
#seed=11789, n=30 for diverse samples
test_prompts = get_tests(n=1, seed=rng, test_type=["planning"]) #get_test_type(["math"],end=10, upper=300) get_random_tests(n=3, upper=300)
print('seed:', rng)


# Collects one (prompt, expected, predicted type) tuple per sampled test.
QUESTIONS = []
for t in test_prompts:
    ttype = classify_prompt(t["prompt"])#guess_test_type(t["prompt"])
    print('test type:', ttype)
    print('question: ', t["prompt"])
    print('expected:', t["expected"])
    QUESTIONS.append((t["prompt"], t["expected"], ttype))

    try:
        # Dispatch on the predicted category; each handler's return value is printed.
        match ttype:
            case 'math':
                print('FINAL MATH OUTPUT',evaluate_side_by_side(t["prompt"], verbose=True))
            case 'common_sense':
                print('FINAL SEARCH OUTPUT',search_bot(t["prompt"]))
            case 'coding':
                # get_code receives the whole test record, not just the prompt.
                print('FINAL CODE OUTPUT',get_code(t))
            case 'planning':
                print('FINAL PLANNING OUTPUT', handle_planning(t["prompt"]))
            case _:
                # Any other label (e.g. an unhandled category) is only reported.
                print('NOT VALID TEST')
                
    except Exception as e:
        # A handler failed: report it, then send the raw prompt to the model
        # once as a generic fallback.
        print(f"ERROR_TYPE: {type(e).__name__} ERROR: {e}")
        r = call_model_chat_completions(
            system="You are a helpful assistant. Respond with short, concise answers",
            prompt=t["prompt"],
            temperature=0.2,
            max_tokens=400
        )
        print(f"FALLBACK: {r['text']}")
        
        
    """ parsed = handle_test(t["prompt"])
    display(Markdown(f"OUTPUT:\n{parsed}"))
    
    print_json(parsed) """

#load_save_json(path_in="test_history.json", path_out="test_history.json", data_in=f"seed: {rng}, questions: {test_prompts}", clear=False)
""" results_llm_judge = self_evaluate_tests(test_prompts, verbose=True, model=MODEL, grader_model=MODEL)
results_llm_judge["seed"] = rng
print_json(results_llm_judge)
print("\n","="*64) 2861
load_save_json(path_in="test_history.json", path_out="test_history.json", data_in=results_llm_judge, clear=True) """


seed: 9840
prediction: planning
Classes order: ['coding' 'common_sense' 'future_prediction' 'math' 'planning']
probabilities: [1.18496349e-06 8.37065617e-08 3.10857857e-07 1.67177039e-06
 9.99996749e-01]
test type: planning
question:  I am playing with a set of objects. Here are the actions I can do

   Attack object
   Feast object from another object
   Succumb object
   Overcome object from another object

I have the following restrictions on my actions:
    To perform Attack action, the following facts need to be true: Province object, Planet object, Harmony.
    Once Attack action is performed the following facts will be true: Pain object.
    Once Attack action is performed the following facts will be false: Province object, Planet object, Harmony.
    To perform Succumb action, the following facts need to be true: Pain object.
    Once Succumb action is performed the following facts will be true: Province object, Planet object, Harmony.    
    Once Succumb action is performed t

' results_llm_judge = self_evaluate_tests(test_prompts, verbose=True, model=MODEL, grader_model=MODEL)\nresults_llm_judge["seed"] = rng\nprint_json(results_llm_judge)\nprint("\n","="*64) 2861\nload_save_json(path_in="test_history.json", path_out="test_history.json", data_in=results_llm_judge, clear=True) '

Problem seeds: 1410, 12542 (NONE), 4260 (x), 8772 (choose), 1679 (factorial)

More problem seeds: 15206 (wrong test), 18722 (wiki cutting off n), 2506 (math)

cool working example

```
seed: 4044
prediction: common_sense
Classes order: ['coding' 'common_sense' 'future_prediction' 'math' 'planning']
probabilities: [0.0938728  0.59071166 0.04256591 0.21505035 0.05779928]
test type: common_sense
expected: the Sumerians
Who invented the type of script used in autographs?
YES
Cursive script
Search results: ['Semi-cursive script', 'Cursive', 'Cursive script (East Asia)', 'Cursive Hebrew', 'Cursive script']
Summary: Semi-cursive script, also known as running script, is a style of Chinese calligraphy that emerged during the Han dynasty (202 BC – 220 AD). The style is used to write Chinese characters and is abbreviated slightly where a character's strokes are permitted to be visibly connected as the writer writes, but not to the extent of the cursive style. This makes the style easily readable by readers who can read regular script and quickly writable by calligraphers who require ideas to be written down quickly.
('Semi-cursive script, also known as running script, is a style of Chinese '
 'calligraphy that emerged during the Han dynasty (202 BC – 220 AD). The style '
 'is used to write Chinese characters and is abbreviated slightly where a '
 "character's strokes are permitted to be visibly connected as the writer "
 'writes, but not to the extent of the cursive style. This makes the style '
 'easily readable by readers who can read regular script and quickly writable '
 'by calligraphers who require ideas to be written down quickly.')
chat {'previous_chat': {'prompt': '\n    Do you want to research this question?\n\n    QUESTION: Who invented the type of script used in autographs?\n\n    (Answer YES or NO)\n    ', 'your answer': 'YES'}, 'previous_chat2': {'prompt': "\n    What wikipedia article should we research?\n\n    QUESTION: Who invented the type of script used in autographs?\n\n    Only respond with a single article, don't explain.\n    ", 'your answer': 'Cursive script', 'wikipedia response': "Semi-cursive script, also known as running script, is a style of Chinese calligraphy that emerged during the Han dynasty (202 BC – 220 AD). The style is used to write Chinese characters and is abbreviated slightly where a character's strokes are permitted to be visibly connected as the writer writes, but not to the extent of the cursive style. This makes the style easily readable by readers who can read regular script and quickly writable by calligraphers who require ideas to be written down quickly."}, 'current_chat': {'prompt': '\n    Given the wikipedia response given in the last chat. How would you answer this question:\n\n    QUESTION: Who invented the type of script used in autographs?\n\n    Give a short concise response.\n    ', 'your answer': '...'}}
The semi-cursive script, used in autographs, was developed during the Han dynasty.```