In [1]:
# %% Minimal setup
# If needed (uncomment in a notebook):
# !pip install requests python-dotenv

import os, json, textwrap, re, time
import requests

# Connection settings. Each value can be overridden through the environment;
# the literals are the fallbacks used when the variable is unset.
API_KEY = os.environ.get("OPENAI_API_KEY", "cse476")                 # bearer token sent in the Authorization header
API_BASE = os.environ.get("API_BASE", "http://10.4.58.53:41701/v1")  # root URL of the OpenAI-style API
MODEL = os.environ.get("MODEL_NAME", "bens_model")                   # default model name placed in each request

def call_model_chat_completions(prompt: str,
                                system: str = "You are a helpful assistant. Reply with only the final answer\u2014no explanation.",
                                model: str = MODEL,
                                temperature: float = 0.0,
                                timeout: int = 60,
                                max_tokens: int = 128) -> dict:
    """
    Call an OpenAI-style /v1/chat/completions endpoint with one system and one user message.

    Args:
        prompt: Content of the user message.
        system: Content of the system message. The em dash is written as an
            escape so the literal cannot be mangled by a wrong file encoding.
        model: Model name placed in the request payload (default is the
            module-level MODEL as it was when this function was defined).
        temperature: Sampling temperature placed in the request payload.
        timeout: Timeout in seconds handed to requests.post.
        max_tokens: Completion token limit placed in the request payload.

    Returns:
        A dict with the keys
        { 'ok': bool, 'text': str or None, 'raw': dict or None, 'status': int, 'error': str or None, 'headers': dict }.
        - HTTP 200 with a JSON object body: ok=True, text is the first choice's
          message content ("" when the choice list or the content is missing or empty).
        - HTTP 200 with an unparsable or non-object body: ok=False, status=200.
        - Any other HTTP status: ok=False, error holds the decoded JSON body or the raw text.
        - Transport failure (requests.RequestException): ok=False, status=-1, headers={}.
    """
    # Tolerate a trailing slash in API_BASE so the path never contains "//".
    url = f"{API_BASE.rstrip('/')}/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type":  "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user",   "content": prompt}
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as e:
        return {"ok": False, "text": None, "raw": None, "status": -1, "error": str(e), "headers": {}}

    status = resp.status_code
    hdrs   = dict(resp.headers)

    if status != 200:
        # Best effort: prefer the structured error body, fall back to raw text.
        try:
            err_text = resp.json()
        except Exception:
            err_text = resp.text
        return {"ok": False, "text": None, "raw": None, "status": status, "error": str(err_text), "headers": hdrs}

    # A 200 with a non-JSON body must not escape as an exception, and should
    # keep its real HTTP status rather than be reported as a transport error.
    try:
        data = resp.json()
    except ValueError as e:
        return {"ok": False, "text": None, "raw": None, "status": status,
                "error": f"Invalid JSON in response body: {e}", "headers": hdrs}

    if not isinstance(data, dict):
        return {"ok": False, "text": None, "raw": None, "status": status,
                "error": f"Unexpected response body type: {type(data).__name__}", "headers": hdrs}

    # `or` fallbacks cover both a missing key and an explicit empty/null value,
    # so an empty "choices" list no longer raises IndexError.
    choices = data.get("choices") or [{}]
    message = choices[0].get("message") or {}
    text = message.get("content") or ""
    return {"ok": True, "text": text, "raw": data, "status": status, "error": None, "headers": hdrs}


In [2]:
# %% Direct call example
def direct_call(prompt="What is 17 + 28? Answer with just the number."):
    """
    Send a single prompt through call_model_chat_completions and print the outcome.

    Prints the ok flag and HTTP status, the stripped model text (empty when
    there is none), the error message when the call failed, and any of the
    listed rate-limit / request-id headers that the response carried.

    Args:
        prompt: Text sent as the user message.
    """
    result = call_model_chat_completions(prompt)
    print("OK:", result["ok"], "HTTP:", result["status"])
    print("MODEL SAYS:", (result["text"] or "").strip())
    if not result["ok"]:
        # Without this a failed call shows only an empty answer and no reason.
        print("ERROR:", result["error"])

    # Optional: Inspect rate-limit headers if your provider exposes them.
    # HTTP header names are case-insensitive, but result["headers"] is a plain
    # dict carrying whatever casing the server sent, so compare in lower case.
    headers_by_lower = {str(name).lower(): value for name, value in result["headers"].items()}
    for k in ["x-ratelimit-remaining-requests", "x-ratelimit-limit-requests", "x-request-id"]:
        if k in headers_by_lower:
            print(f"{k}: {headers_by_lower[k]}")


In [3]:
# %% Define three tests: input + expected
# Hand-written sanity checks. Every entry carries an id, a type tag, the prompt
# sent to the model, and the expected answer as a string.
my_tests = [
    # 3n > 21  =>  n > 7, so the smallest integer solution is 8.
    dict(
        id="math_inequality",
        type="numeric",  # grader will prefer numeric extraction
        prompt="Solve for the smallest integer n such that 3n + 5 > 26. Answer with just the integer.",
        expected="8",
    ),
    dict(
        id="commonsense_ice",
        type="text",
        prompt=(
            "You place an ice cube in a glass of water and mark the water level. "
            "After the ice melts, does the water level rise, fall, or stay the same? "
            "Answer with exactly one of: 'rise', 'fall', 'stay the same'."
        ),
        expected="stay the same",
    ),
    dict(
        id="logic_race",
        type="text",
        prompt=(
            "In a race, you pass the person in second place. What position are you now in? "
            "Answer with a single word like 'first', 'second', 'third'."
        ),
        expected="second",
    ),
]


In [4]:
import json
from pprint import pprint
# Read the dev set inside a context manager so the file handle is closed as
# soon as parsing finishes instead of being left open.
with open("cse476_final_project_dev_data.json", "r", encoding="utf-8") as dev_file:
    raw_tests = json.load(dev_file)

# Convert every record to the same keys used by `my_tests`. The id is the
# record's domain plus its 1-based position in the whole file, and the type
# field holds the domain.
formatted_tests = [
    {
        "id": f"{record['domain']}_{position}",
        "type": record["domain"],
        "prompt": record["input"],
        "expected": record["output"],
    }
    for position, record in enumerate(raw_tests, start=1)
]

# Later cells read the formatted records through the name `all_tests`.
all_tests = formatted_tests

In [5]:
# Keep only the records whose type is 'math', report how many there are,
# then preview the first three.
math_tests = list(filter(lambda case: case["type"] == "math", all_tests))
print(f"{len(math_tests)} Math tests loaded out of {len(all_tests)} tests")

tests = math_tests[0:3]
pprint(tests)

300 Math tests loaded out of 1000 tests
[{'expected': '112',
  'id': 'math_1',
  'prompt': 'Let $ABCD$ be a convex quadrilateral with $AB = CD = 10$ , $BC = '
            '14$ , and $AD = 2\\sqrt{65}$ . Assume that the diagonals of '
            '$ABCD$ intersect at point $P$ , and that the sum of the areas of '
            'triangles $APB$ and $CPD$ equals the sum of the areas of '
            'triangles $BPC$ and $APD$ . Find the area of quadrilateral $ABCD$ '
            '.',
  'type': 'math'},
 {'expected': '164',
  'id': 'math_2',
  'prompt': 'A tennis player computes her win ratio by dividing the number of '
            'matches she has won by the total number of matches she has '
            'played. At the start of a weekend, her win ratio is exactly '
            '$0.500$ . During the weekend, she plays four matches, winning '
            'three and losing one. At the end of the weekend, her win ratio is '
            "greater than $.503$ . What's the largest number of matches

In [None]:
# Smoke test: a simple hello-world call to kick off the commits.
direct_call(prompt="What is the most widely spoken language?")

OK: True HTTP: 200
MODEL SAYS: Mandarin Chinese
